You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

558 lines
14 KiB
Plaintext

4 years ago
{
"cells": [
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"import nltk\n",
"import random\n",
"s = ' '"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"# Load the corpus, strip basic punctuation and POS-tag every distinct word.\n",
"with open('language.txt') as corpus_file:  # context manager closes the handle\n",
"    text = corpus_file.read().translate(str.maketrans('', '', '.,():;'))\n",
"text = text.split()\n",
"textSet = set(text)\n",
"# Tag a sorted list instead of the raw set: set iteration order changes between kernel\n",
"# sessions (string hash randomisation), so passing the set directly made the order of\n",
"# `tagged` (and possibly the tags themselves) differ from run to run.\n",
"tagged = nltk.pos_tag(sorted(textSet))"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"# Build the list of Penn Treebank POS tags; they are used as tags and name prefixes for the word lists.\n",
"\n",
"sigle_table = '''\n",
"1. \tCC \tCoordinating conjunction\n",
"2. \tCD \tCardinal number\n",
"3. \tDT \tDeterminer\n",
"4. \tEX \tExistential there\n",
"5. \tFW \tForeign word\n",
"6. \tIN \tPreposition or subordinating conjunction\n",
"7. \tJJ \tAdjective\n",
"8. \tJJR \tAdjective, comparative\n",
"9. \tJJS \tAdjective, superlative\n",
"10. \tLS \tList item marker\n",
"11. \tMD \tModal\n",
"12. \tNN \tNoun, singular or mass\n",
"13. \tNNS \tNoun, plural\n",
"14. \tNNP \tProper noun, singular\n",
"15. \tNNPS \tProper noun, plural\n",
"16. \tPDT \tPredeterminer\n",
"17. \tPOS \tPossessive ending\n",
"18. \tPRP \tPersonal pronoun\n",
"19. \tPRP$ \tPossessive pronoun\n",
"20. \tRB \tAdverb\n",
"21. \tRBR \tAdverb, comparative\n",
"22. \tRBS \tAdverb, superlative\n",
"23. \tRP \tParticle\n",
"24. \tSYM \tSymbol\n",
"25. \tTO \tto\n",
"26. \tUH \tInterjection\n",
"27. \tVB \tVerb, base form\n",
"28. \tVBD \tVerb, past tense\n",
"29. \tVBG \tVerb, gerund or present participle\n",
"30. \tVBN \tVerb, past participle\n",
"31. \tVBP \tVerb, non-3 person singular present\n",
"32. \tVBZ \tVerb, 3 person singular present\n",
"33. \tWDT \tWh-determiner\n",
"34. \tWP \tWh-pronoun\n",
"35. \tWP$ \tPossessive wh-pronoun\n",
"36. \tWRB \tWh-adverb \n",
"'''\n",
"\n",
"# Take the second column of every row. The previous token filter (length below 4 plus a\n",
"# stop-word list) silently dropped the four-character tags NNPS and PRP$.\n",
"sigle = [row.split()[1] for row in sigle_table.splitlines() if row.strip()]"
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"RB + s + NN + s + VBZ + s + JJ + s + NNS + s + IN + s + DT + s + WDT + s + EX + s + VBP + s + VBN + s + CC + s + NNP\n"
]
}
],
"source": [
"# Derive a sentence template (a sequence of POS tags) from one randomly chosen corpus line.\n",
"\n",
"s = ' '\n",
"with open('language.txt') as corpus_file:  # close the file once the lines are read\n",
"    # Blank lines carry no tags and would produce an empty template, so leave them out.\n",
"    textlines = [row for row in corpus_file if row.strip()]\n",
"line = random.choice(textlines)\n",
"sentence = nltk.pos_tag(line.split())\n",
"\n",
"# One entry per distinct tag, in order of first appearance (the value is the last word seen).\n",
"dat = {tag: word for word, tag in sentence}\n",
"keys = dat.keys()\n",
"\n",
"print(\" + s + \".join(keys))  # paste the printed expression into a cell to generate sentences"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CClista = []\n",
"my_dict('CC','dataCC',CClista)\n",
"\n",
"\n",
"CDlista = []\n",
"my_dict('CD','dataCD',CDlista)\n",
"\n",
"\n",
"DTlista = []\n",
"my_dict('DT','dataDT',DTlista)\n",
"\n",
"\n",
"EXlista = []\n",
"my_dict('EX','dataEX',EXlista)\n",
"\n",
"\n",
"FWlista = []\n",
"my_dict('FW','dataFW',FWlista)\n",
"\n",
"\n",
"INlista = []\n",
"my_dict('IN','dataIN',INlista)\n",
"\n",
"\n",
"JJlista = []\n",
"my_dict('JJ','dataJJ',JJlista)\n",
"\n",
"\n",
"JJRlista = []\n",
"my_dict('JJR','dataJJR',JJRlista)\n",
"\n",
"\n",
"JJSlista = []\n",
"my_dict('JJS','dataJJS',JJSlista)\n",
"\n",
"\n",
"LSlista = []\n",
"my_dict('LS','dataLS',LSlista)\n",
"\n",
"\n",
"MDlista = []\n",
"my_dict('MD','dataMD',MDlista)\n",
"\n",
"\n",
"NNlista = []\n",
"my_dict('NN','dataNN',NNlista)\n",
"\n",
"\n",
"NNSlista = []\n",
"my_dict('NNS','dataNNS',NNSlista)\n",
"\n",
"\n",
"NNPlista = []\n",
"my_dict('NNP','dataNNP',NNPlista)\n",
"\n",
"\n",
"PDTlista = []\n",
"my_dict('PDT','dataPDT',PDTlista)\n",
"\n",
"\n",
"POSlista = []\n",
"my_dict('POS','dataPOS',POSlista)\n",
"\n",
"\n",
"PRPlista = []\n",
"my_dict('PRP','dataPRP',PRPlista)\n",
"\n",
"\n",
"RBlista = []\n",
"my_dict('RB','dataRB',RBlista)\n",
"\n",
"\n",
"RBRlista = []\n",
"my_dict('RBR','dataRBR',RBRlista)\n",
"\n",
"\n",
"RBSlista = []\n",
"my_dict('RBS','dataRBS',RBSlista)\n",
"\n",
"\n",
"RPlista = []\n",
"my_dict('RP','dataRP',RPlista)\n",
"\n",
"\n",
"SYMlista = []\n",
"my_dict('SYM','dataSYM',SYMlista)\n",
"\n",
"\n",
"TOlista = []\n",
"my_dict('TO','dataTO',TOlista)\n",
"\n",
"\n",
"UHlista = []\n",
"my_dict('UH','dataUH',UHlista)\n",
"\n",
"\n",
"VBlista = []\n",
"my_dict('VB','dataVB',VBlista)\n",
"\n",
"\n",
"VBDlista = []\n",
"my_dict('VBD','dataVBD',VBDlista)\n",
"\n",
"\n",
"VBGlista = []\n",
"my_dict('VBG','dataVBG',VBGlista)\n",
"\n",
"\n",
"VBNlista = []\n",
"my_dict('VBN','dataVBN',VBNlista)\n",
"\n",
"\n",
"VBPlista = []\n",
"my_dict('VBP','dataVBP',VBPlista)\n",
"\n",
"\n",
"VBZlista = []\n",
"my_dict('VBZ','dataVBZ',VBZlista)\n",
"\n",
"\n",
"WDTlista = []\n",
"my_dict('WDT','dataWDT',WDTlista)\n",
"\n",
"\n",
"WPlista = []\n",
"my_dict('WP','dataWP',WPlista)\n",
"\n",
"\n",
"WRBlista = []\n",
"my_dict('WRB','dataWRB',WRBlista)\n",
"\n",
"\n"
]
}
],
"source": [
"# Print, for every POS tag, the two lines of code that create and fill its word list.\n",
"# '$' is not valid in a Python identifier, so it becomes 'ç' in the generated variable\n",
"# names only; the tag literal passed to my_dict must stay untouched or it never matches.\n",
"for gr in sigle:\n",
"    ident = gr.replace('$', 'ç')\n",
"    print(f'{ident}lista = []')\n",
"    print(f'''my_dict('{gr}','data{ident}',{ident}lista)''')\n",
"    print('\\n')"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CC = random.choice(CClista)\n",
"CD = random.choice(CDlista)\n",
"DT = random.choice(DTlista)\n",
"EX = random.choice(EXlista)\n",
"IN = random.choice(INlista)\n",
"JJ = random.choice(JJlista)\n",
"JJS = random.choice(JJSlista)\n",
"MD = random.choice(MDlista)\n",
"NN = random.choice(NNlista)\n",
"NNS = random.choice(NNSlista)\n",
"NNP = random.choice(NNPlista)\n",
"PRP = random.choice(PRPlista)\n",
"RB = random.choice(RBlista)\n",
"TO = random.choice(TOlista)\n",
"VB = random.choice(VBlista)\n",
"VBD = random.choice(VBDlista)\n",
"VBG = random.choice(VBGlista)\n",
"VBN = random.choice(VBNlista)\n",
"VBP = random.choice(VBPlista)\n",
"VBZ = random.choice(VBZlista)\n",
"WDT = random.choice(WDTlista)\n",
"WP = random.choice(WPlista)\n"
]
}
],
"source": [
"dataset = {}  # tag -> most recent word seen with that tag; updated by every my_dict call\n",
"\n",
"def my_dict(gr, data, grlista):\n",
"    '''Collect the words tagged `gr` into `grlista`.\n",
"\n",
"    Reads the module-level `tagged` list of (word, tag) pairs, appends each word whose\n",
"    tag equals `gr` to `grlista` and records tag -> word in the module-level `dataset`.\n",
"    When at least one word matched, prints the `random.choice` assignment for this tag\n",
"    so it can be pasted into a later cell; tags without words print nothing.\n",
"\n",
"    gr      -- POS tag to collect, e.g. 'NN'\n",
"    data    -- unused; kept so existing calls keep working\n",
"    grlista -- list that receives the matching words (modified in place)\n",
"    '''\n",
"    found = False\n",
"    for word, tag in tagged:\n",
"        dataset[tag] = word\n",
"        if tag == gr:\n",
"            grlista.append(word)\n",
"            found = True\n",
"    if found:\n",
"        # '$' cannot appear in an identifier, hence the 'ç' substitution.\n",
"        print(f'{gr} = random.choice({gr}lista)'.replace('$','ç'))\n",
" \n",
"# Same lists and calls as the pasted generator output, written as one loop:\n",
"# every POS tag gets its own empty word list, which my_dict then fills.\n",
"pos_tags = ('CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD',\n",
"            'NN', 'NNS', 'NNP', 'PDT', 'POS', 'PRP', 'RB', 'RBR', 'RBS', 'RP', 'SYM',\n",
"            'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WRB')\n",
"pos_lists = [[] for _ in pos_tags]\n",
"\n",
"# Bind each list to its explicit name so later cells can keep using CClista, NNlista, ...\n",
"(CClista, CDlista, DTlista, EXlista, FWlista, INlista, JJlista, JJRlista, JJSlista, LSlista, MDlista,\n",
" NNlista, NNSlista, NNPlista, PDTlista, POSlista, PRPlista, RBlista, RBRlista, RBSlista, RPlista, SYMlista,\n",
" TOlista, UHlista, VBlista, VBDlista, VBGlista, VBNlista, VBPlista, VBZlista, WDTlista, WPlista, WRBlista) = pos_lists\n",
"\n",
"for pos_tag, lista in zip(pos_tags, pos_lists):\n",
"    my_dict(pos_tag, f'data{pos_tag}', lista)\n"
]
},
{
"cell_type": "code",
"execution_count": 113,
"metadata": {},
"outputs": [],
"source": [
"# Draw one random word for every POS tag that has words; re-run this cell for a new draw.\n",
"\n",
"(CC, CD, DT, EX, IN, JJ, JJS, MD, NN, NNS, NNP,\n",
" PRP, RB, TO, VB, VBD, VBG, VBN, VBP, VBZ, WDT, WP) = map(random.choice, (\n",
"    CClista, CDlista, DTlista, EXlista, INlista, JJlista, JJSlista, MDlista, NNlista, NNSlista, NNPlista,\n",
"    PRPlista, RBlista, TOlista, VBlista, VBDlista, VBGlista, VBNlista, VBPlista, VBZlista, WDTlista, WPlista,\n",
"))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 114,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'however process is so-called layers since these which There are implemented and Yet'"
]
},
"execution_count": 114,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"s.join([RB, NN, VBZ, JJ, NNS, IN, DT, WDT, EX, VBP, VBN, CC, NNP])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}