You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
558 lines
14 KiB
Plaintext
558 lines
14 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 48,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import nltk\n",
|
|
"import random\n",
|
|
"s = ' '"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 49,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
"# Load the corpus, strip basic punctuation and POS-tag every distinct word.\n",
"PUNCTUATION = '.,():;'\n",
"\n",
"with open('language.txt') as corpus:  # the context manager closes the file handle\n",
"    text = corpus.read().translate(str.maketrans('', '', PUNCTUATION))\n",
"\n",
"text = text.split()\n",
"textSet = set(text)\n",
"# A set has no guaranteed iteration order, so the tokens are sorted to hand the\n",
"# tagger the same sequence on every run.\n",
"tagged = nltk.pos_tag(sorted(textSet))"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 50,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
"# Build the list of Penn Treebank POS tags, used as keys for the per-tag word lists.\n",
"\n",
"sigle = '''\n",
"1. \tCC \tCoordinating conjunction\n",
"2. \tCD \tCardinal number\n",
"3. \tDT \tDeterminer\n",
"4. \tEX \tExistential there\n",
"5. \tFW \tForeign word\n",
"6. \tIN \tPreposition or subordinating conjunction\n",
"7. \tJJ \tAdjective\n",
"8. \tJJR \tAdjective, comparative\n",
"9. \tJJS \tAdjective, superlative\n",
"10. \tLS \tList item marker\n",
"11. \tMD \tModal\n",
"12. \tNN \tNoun, singular or mass\n",
"13. \tNNS \tNoun, plural\n",
"14. \tNNP \tProper noun, singular\n",
"15. \tNNPS \tProper noun, plural\n",
"16. \tPDT \tPredeterminer\n",
"17. \tPOS \tPossessive ending\n",
"18. \tPRP \tPersonal pronoun\n",
"19. \tPRP$ \tPossessive pronoun\n",
"20. \tRB \tAdverb\n",
"21. \tRBR \tAdverb, comparative\n",
"22. \tRBS \tAdverb, superlative\n",
"23. \tRP \tParticle\n",
"24. \tSYM \tSymbol\n",
"25. \tTO \tto\n",
"26. \tUH \tInterjection\n",
"27. \tVB \tVerb, base form\n",
"28. \tVBD \tVerb, past tense\n",
"29. \tVBG \tVerb, gerund or present participle\n",
"30. \tVBN \tVerb, past participle\n",
"31. \tVBP \tVerb, non-3 person singular present\n",
"32. \tVBZ \tVerb, 3 person singular present\n",
"33. \tWDT \tWh-determiner\n",
"34. \tWP \tWh-pronoun\n",
"35. \tWP$ \tPossessive wh-pronoun\n",
"36. \tWRB \tWh-adverb \n",
"'''\n",
"\n",
"# Every row reads '<number>. <tag> <description>', so the tag is always the second\n",
"# whitespace-separated token. Picking it by position keeps the four-character tags\n",
"# (NNPS, PRP$, WP$) that a length-based filter would discard.\n",
"sigle = [row.split()[1] for row in sigle.splitlines() if row.strip()]"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 97,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"RB + s + NN + s + VBZ + s + JJ + s + NNS + s + IN + s + DT + s + WDT + s + EX + s + VBP + s + VBN + s + CC + s + NNP\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
"# Prepare a grammar template (a sequence of POS tags) from one randomly picked line of the text.\n",
"\n",
"s = ' '\n",
"with open('language.txt') as corpus:  # the context manager closes the file handle\n",
"    textlines = corpus.readlines()\n",
"\n",
"# A blank line has no words and would give an empty template, so only lines\n",
"# containing at least one token are candidates.\n",
"candidates = [row for row in textlines if row.strip()]\n",
"line = random.choice(candidates)\n",
"sentence = nltk.pos_tag(line.split())\n",
"\n",
"# One entry per distinct tag, in order of first appearance; a repeated tag keeps its last word.\n",
"dat = {tag: word for word, tag in sentence}\n",
"keys = dat.keys()\n",
"\n",
"# '$' cannot appear in a Python identifier, so a tag such as PRP$ is written with 'ç',\n",
"# the same substitution the code-generating cells of this notebook apply.\n",
"print(' + s + '.join(keys).replace('$', 'ç'))  # copy-paste the result to generate sentences"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 51,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"CClista = []\n",
|
|
"my_dict('CC','dataCC',CClista)\n",
|
|
"\n",
|
|
"\n",
|
|
"CDlista = []\n",
|
|
"my_dict('CD','dataCD',CDlista)\n",
|
|
"\n",
|
|
"\n",
|
|
"DTlista = []\n",
|
|
"my_dict('DT','dataDT',DTlista)\n",
|
|
"\n",
|
|
"\n",
|
|
"EXlista = []\n",
|
|
"my_dict('EX','dataEX',EXlista)\n",
|
|
"\n",
|
|
"\n",
|
|
"FWlista = []\n",
|
|
"my_dict('FW','dataFW',FWlista)\n",
|
|
"\n",
|
|
"\n",
|
|
"INlista = []\n",
|
|
"my_dict('IN','dataIN',INlista)\n",
|
|
"\n",
|
|
"\n",
|
|
"JJlista = []\n",
|
|
"my_dict('JJ','dataJJ',JJlista)\n",
|
|
"\n",
|
|
"\n",
|
|
"JJRlista = []\n",
|
|
"my_dict('JJR','dataJJR',JJRlista)\n",
|
|
"\n",
|
|
"\n",
|
|
"JJSlista = []\n",
|
|
"my_dict('JJS','dataJJS',JJSlista)\n",
|
|
"\n",
|
|
"\n",
|
|
"LSlista = []\n",
|
|
"my_dict('LS','dataLS',LSlista)\n",
|
|
"\n",
|
|
"\n",
|
|
"MDlista = []\n",
|
|
"my_dict('MD','dataMD',MDlista)\n",
|
|
"\n",
|
|
"\n",
|
|
"NNlista = []\n",
|
|
"my_dict('NN','dataNN',NNlista)\n",
|
|
"\n",
|
|
"\n",
|
|
"NNSlista = []\n",
|
|
"my_dict('NNS','dataNNS',NNSlista)\n",
|
|
"\n",
|
|
"\n",
|
|
"NNPlista = []\n",
|
|
"my_dict('NNP','dataNNP',NNPlista)\n",
|
|
"\n",
|
|
"\n",
|
|
"PDTlista = []\n",
|
|
"my_dict('PDT','dataPDT',PDTlista)\n",
|
|
"\n",
|
|
"\n",
|
|
"POSlista = []\n",
|
|
"my_dict('POS','dataPOS',POSlista)\n",
|
|
"\n",
|
|
"\n",
|
|
"PRPlista = []\n",
|
|
"my_dict('PRP','dataPRP',PRPlista)\n",
|
|
"\n",
|
|
"\n",
|
|
"RBlista = []\n",
|
|
"my_dict('RB','dataRB',RBlista)\n",
|
|
"\n",
|
|
"\n",
|
|
"RBRlista = []\n",
|
|
"my_dict('RBR','dataRBR',RBRlista)\n",
|
|
"\n",
|
|
"\n",
|
|
"RBSlista = []\n",
|
|
"my_dict('RBS','dataRBS',RBSlista)\n",
|
|
"\n",
|
|
"\n",
|
|
"RPlista = []\n",
|
|
"my_dict('RP','dataRP',RPlista)\n",
|
|
"\n",
|
|
"\n",
|
|
"SYMlista = []\n",
|
|
"my_dict('SYM','dataSYM',SYMlista)\n",
|
|
"\n",
|
|
"\n",
|
|
"TOlista = []\n",
|
|
"my_dict('TO','dataTO',TOlista)\n",
|
|
"\n",
|
|
"\n",
|
|
"UHlista = []\n",
|
|
"my_dict('UH','dataUH',UHlista)\n",
|
|
"\n",
|
|
"\n",
|
|
"VBlista = []\n",
|
|
"my_dict('VB','dataVB',VBlista)\n",
|
|
"\n",
|
|
"\n",
|
|
"VBDlista = []\n",
|
|
"my_dict('VBD','dataVBD',VBDlista)\n",
|
|
"\n",
|
|
"\n",
|
|
"VBGlista = []\n",
|
|
"my_dict('VBG','dataVBG',VBGlista)\n",
|
|
"\n",
|
|
"\n",
|
|
"VBNlista = []\n",
|
|
"my_dict('VBN','dataVBN',VBNlista)\n",
|
|
"\n",
|
|
"\n",
|
|
"VBPlista = []\n",
|
|
"my_dict('VBP','dataVBP',VBPlista)\n",
|
|
"\n",
|
|
"\n",
|
|
"VBZlista = []\n",
|
|
"my_dict('VBZ','dataVBZ',VBZlista)\n",
|
|
"\n",
|
|
"\n",
|
|
"WDTlista = []\n",
|
|
"my_dict('WDT','dataWDT',WDTlista)\n",
|
|
"\n",
|
|
"\n",
|
|
"WPlista = []\n",
|
|
"my_dict('WP','dataWP',WPlista)\n",
|
|
"\n",
|
|
"\n",
|
|
"WRBlista = []\n",
|
|
"my_dict('WRB','dataWRB',WRBlista)\n",
|
|
"\n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
"# Print, for every POS tag, the two statements that create its word list and fill it.\n",
"for gr in sigle:\n",
"    # Only identifiers get the '$' -> 'ç' substitution; the tag literal handed to\n",
"    # my_dict stays untouched, otherwise it could never equal a tag containing '$'.\n",
"    name = gr.replace('$', 'ç')\n",
"    print(f'{name}lista = []')\n",
"    print(f\"my_dict('{gr}','data{name}',{name}lista)\")\n",
"    print('\\n')"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 52,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"CC = random.choice(CClista)\n",
|
|
"CD = random.choice(CDlista)\n",
|
|
"DT = random.choice(DTlista)\n",
|
|
"EX = random.choice(EXlista)\n",
|
|
"IN = random.choice(INlista)\n",
|
|
"JJ = random.choice(JJlista)\n",
|
|
"JJS = random.choice(JJSlista)\n",
|
|
"MD = random.choice(MDlista)\n",
|
|
"NN = random.choice(NNlista)\n",
|
|
"NNS = random.choice(NNSlista)\n",
|
|
"NNP = random.choice(NNPlista)\n",
|
|
"PRP = random.choice(PRPlista)\n",
|
|
"RB = random.choice(RBlista)\n",
|
|
"TO = random.choice(TOlista)\n",
|
|
"VB = random.choice(VBlista)\n",
|
|
"VBD = random.choice(VBDlista)\n",
|
|
"VBG = random.choice(VBGlista)\n",
|
|
"VBN = random.choice(VBNlista)\n",
|
|
"VBP = random.choice(VBPlista)\n",
|
|
"VBZ = random.choice(VBZlista)\n",
|
|
"WDT = random.choice(WDTlista)\n",
|
|
"WP = random.choice(WPlista)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
"dataset = {}  # tag -> last word seen with that tag; refreshed by every my_dict call\n",
"\n",
"\n",
"def my_dict(gr, data, grlista):\n",
"    \"\"\"Collect the words tagged `gr` into `grlista` and print the matching random picker.\n",
"\n",
"    Args:\n",
"        gr: POS tag to collect, e.g. 'NN'.\n",
"        data: ignored; kept so existing three-argument calls keep working.\n",
"        grlista: list that receives every word of the global `tagged` whose tag equals `gr`.\n",
"\n",
"    Side effects:\n",
"        Updates the module-level `dataset` for every (word, tag) pair in `tagged`.\n",
"        Prints '<gr> = random.choice(<gr>lista)', with '$' written as 'ç', when at\n",
"        least one word matched; prints nothing when the tag does not occur.\n",
"    \"\"\"\n",
"    found = False\n",
"    for word, tag in tagged:\n",
"        dataset[tag] = word\n",
"        if tag == gr:\n",
"            grlista.append(word)\n",
"            found = True\n",
"\n",
"    if found:\n",
"        print(f'{gr} = random.choice({gr}lista)'.replace('$', 'ç'))\n",
" \n",
"# Create one `<TAG>lista` list per POS tag and fill it with my_dict; this does the same\n",
"# work as pasting the generator cell's output here, without going stale.\n",
"\n",
"for gr in sigle:\n",
"    name = gr.replace('$', 'ç')  # '$' cannot appear in an identifier\n",
"    # Cells execute in the notebook's global namespace, so writing to globals()\n",
"    # defines CClista, CDlista, ... exactly where later cells look them up.\n",
"    globals()[f'{name}lista'] = []\n",
"    my_dict(gr, f'data{name}', globals()[f'{name}lista'])"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 113,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
"# Draw a random word for every POS tag that has at least one word; re-run to re-roll.\n",
"\n",
"for gr in sigle:\n",
"    name = gr.replace('$', 'ç')  # identifier-safe spelling of the tag\n",
"    words = globals().get(f'{name}lista')\n",
"    if words:  # random.choice raises IndexError on an empty list, so those tags are skipped\n",
"        globals()[name] = random.choice(words)"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 114,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"'however process is so-called layers since these which There are implemented and Yet'"
|
|
]
|
|
},
|
|
"execution_count": 114,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
"# Fill the grammar template with the randomly drawn words, separated by `s`.\n",
"s.join([RB, NN, VBZ, JJ, NNS, IN, DT, WDT, EX, VBP, VBN, CC, NNP])"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.8.6"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 4
|
|
}
|