You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
14 KiB
14 KiB
In [48]:
import nltk import random s = ' '
In [49]:
text = open('language.txt').read().replace('.','').replace(',','').replace('(','').replace(')','').replace(':','').replace(';','') text = text.split() textSet = set(text) tagged = nltk.pos_tag(textSet)
In [50]:
#create array of single sigles, in order to use them as tags for dictionaries sigle = ''' 1. CC Coordinating conjunction 2. CD Cardinal number 3. DT Determiner 4. EX Existential there 5. FW Foreign word 6. IN Preposition or subordinating conjunction 7. JJ Adjective 8. JJR Adjective, comparative 9. JJS Adjective, superlative 10. LS List item marker 11. MD Modal 12. NN Noun, singular or mass 13. NNS Noun, plural 14. NNP Proper noun, singular 15. NNPS Proper noun, plural 16. PDT Predeterminer 17. POS Possessive ending 18. PRP Personal pronoun 19. PRP$ Possessive pronoun 20. RB Adverb 21. RBR Adverb, comparative 22. RBS Adverb, superlative 23. RP Particle 24. SYM Symbol 25. TO to 26. UH Interjection 27. VB Verb, base form 28. VBD Verb, past tense 29. VBG Verb, gerund or present participle 30. VBN Verb, past participle 31. VBP Verb, non-3 person singular present 32. VBZ Verb, 3 person singular present 33. WDT Wh-determiner 34. WP Wh-pronoun 36. WRB Wh-adverb ''' sigle = sigle.replace('.','').split() sigle = [sigle for sigle in sigle if len(sigle) < 4 and not sigle.isdigit() and not sigle == 'or' and not sigle == 'to' and not sigle == '3rd']
In [97]:
#This is for prepare a grammar constructio based on a picked random part of text s = ' ' textlines = open('language.txt').readlines() nl = len(textlines) r = random.randrange(0,nl) line = textlines[r] sentence = line.split() sentence = nltk.pos_tag(sentence) dat = {} for word, tag in sentence: dat[tag] = word keys = dat.keys() print(" + s + ".join([pos for pos in keys])) #copypaste the result to generate sentences
RB + s + NN + s + VBZ + s + JJ + s + NNS + s + IN + s + DT + s + WDT + s + EX + s + VBP + s + VBN + s + CC + s + NNP
In [51]:
for gr in sigle: #to create storing list for various pos print(f'{gr}lista = []'.replace('$','ç')) print(f'''my_dict('{gr}','data{gr}',{gr}lista)'''.replace('$','ç')) print('\n')
CClista = [] my_dict('CC','dataCC',CClista) CDlista = [] my_dict('CD','dataCD',CDlista) DTlista = [] my_dict('DT','dataDT',DTlista) EXlista = [] my_dict('EX','dataEX',EXlista) FWlista = [] my_dict('FW','dataFW',FWlista) INlista = [] my_dict('IN','dataIN',INlista) JJlista = [] my_dict('JJ','dataJJ',JJlista) JJRlista = [] my_dict('JJR','dataJJR',JJRlista) JJSlista = [] my_dict('JJS','dataJJS',JJSlista) LSlista = [] my_dict('LS','dataLS',LSlista) MDlista = [] my_dict('MD','dataMD',MDlista) NNlista = [] my_dict('NN','dataNN',NNlista) NNSlista = [] my_dict('NNS','dataNNS',NNSlista) NNPlista = [] my_dict('NNP','dataNNP',NNPlista) PDTlista = [] my_dict('PDT','dataPDT',PDTlista) POSlista = [] my_dict('POS','dataPOS',POSlista) PRPlista = [] my_dict('PRP','dataPRP',PRPlista) RBlista = [] my_dict('RB','dataRB',RBlista) RBRlista = [] my_dict('RBR','dataRBR',RBRlista) RBSlista = [] my_dict('RBS','dataRBS',RBSlista) RPlista = [] my_dict('RP','dataRP',RPlista) SYMlista = [] my_dict('SYM','dataSYM',SYMlista) TOlista = [] my_dict('TO','dataTO',TOlista) UHlista = [] my_dict('UH','dataUH',UHlista) VBlista = [] my_dict('VB','dataVB',VBlista) VBDlista = [] my_dict('VBD','dataVBD',VBDlista) VBGlista = [] my_dict('VBG','dataVBG',VBGlista) VBNlista = [] my_dict('VBN','dataVBN',VBNlista) VBPlista = [] my_dict('VBP','dataVBP',VBPlista) VBZlista = [] my_dict('VBZ','dataVBZ',VBZlista) WDTlista = [] my_dict('WDT','dataWDT',WDTlista) WPlista = [] my_dict('WP','dataWP',WPlista) WRBlista = [] my_dict('WRB','dataWRB',WRBlista)
In [52]:
dataset = {} def my_dict(gr,data,grlista): #to store words in pos lists data = {} for word, tag in tagged: dataset[tag] = word if tag == gr: data[tag] = word grlista.append(word) for x in data: if len(grlista) == 0: None else: print(f'{gr} = random.choice({gr}lista)'.replace('$','ç')) #to print the random picker, if the list is empty, doesn't print the randomic variable for that pos #copy paste from up: CClista = [] my_dict('CC','dataCC',CClista) CDlista = [] my_dict('CD','dataCD',CDlista) DTlista = [] my_dict('DT','dataDT',DTlista) EXlista = [] my_dict('EX','dataEX',EXlista) FWlista = [] my_dict('FW','dataFW',FWlista) INlista = [] my_dict('IN','dataIN',INlista) JJlista = [] my_dict('JJ','dataJJ',JJlista) JJRlista = [] my_dict('JJR','dataJJR',JJRlista) JJSlista = [] my_dict('JJS','dataJJS',JJSlista) LSlista = [] my_dict('LS','dataLS',LSlista) MDlista = [] my_dict('MD','dataMD',MDlista) NNlista = [] my_dict('NN','dataNN',NNlista) NNSlista = [] my_dict('NNS','dataNNS',NNSlista) NNPlista = [] my_dict('NNP','dataNNP',NNPlista) PDTlista = [] my_dict('PDT','dataPDT',PDTlista) POSlista = [] my_dict('POS','dataPOS',POSlista) PRPlista = [] my_dict('PRP','dataPRP',PRPlista) RBlista = [] my_dict('RB','dataRB',RBlista) RBRlista = [] my_dict('RBR','dataRBR',RBRlista) RBSlista = [] my_dict('RBS','dataRBS',RBSlista) RPlista = [] my_dict('RP','dataRP',RPlista) SYMlista = [] my_dict('SYM','dataSYM',SYMlista) TOlista = [] my_dict('TO','dataTO',TOlista) UHlista = [] my_dict('UH','dataUH',UHlista) VBlista = [] my_dict('VB','dataVB',VBlista) VBDlista = [] my_dict('VBD','dataVBD',VBDlista) VBGlista = [] my_dict('VBG','dataVBG',VBGlista) VBNlista = [] my_dict('VBN','dataVBN',VBNlista) VBPlista = [] my_dict('VBP','dataVBP',VBPlista) VBZlista = [] my_dict('VBZ','dataVBZ',VBZlista) WDTlista = [] my_dict('WDT','dataWDT',WDTlista) WPlista = [] my_dict('WP','dataWP',WPlista) WRBlista = [] my_dict('WRB','dataWRB',WRBlista)
CC = random.choice(CClista) CD = random.choice(CDlista) DT = random.choice(DTlista) EX = random.choice(EXlista) IN = random.choice(INlista) JJ = random.choice(JJlista) JJS = random.choice(JJSlista) MD = random.choice(MDlista) NN = random.choice(NNlista) NNS = random.choice(NNSlista) NNP = random.choice(NNPlista) PRP = random.choice(PRPlista) RB = random.choice(RBlista) TO = random.choice(TOlista) VB = random.choice(VBlista) VBD = random.choice(VBDlista) VBG = random.choice(VBGlista) VBN = random.choice(VBNlista) VBP = random.choice(VBPlista) VBZ = random.choice(VBZlista) WDT = random.choice(WDTlista) WP = random.choice(WPlista)
In [113]:
#copy paste result from up in order to randomize at every refresh a new word CC = random.choice(CClista) CD = random.choice(CDlista) DT = random.choice(DTlista) EX = random.choice(EXlista) IN = random.choice(INlista) JJ = random.choice(JJlista) JJS = random.choice(JJSlista) MD = random.choice(MDlista) NN = random.choice(NNlista) NNS = random.choice(NNSlista) NNP = random.choice(NNPlista) PRP = random.choice(PRPlista) RB = random.choice(RBlista) TO = random.choice(TOlista) VB = random.choice(VBlista) VBD = random.choice(VBDlista) VBG = random.choice(VBGlista) VBN = random.choice(VBNlista) VBP = random.choice(VBPlista) VBZ = random.choice(VBZlista) WDT = random.choice(WDTlista) WP = random.choice(WPlista)
In [ ]:
In [ ]:
In [114]:
RB + s + NN + s + VBZ + s + JJ + s + NNS + s + IN + s + DT + s + WDT + s + EX + s + VBP + s + VBN + s + CC + s + NNP
Out[114]:
'however process is so-called layers since these which There are implemented and Yet'
In [ ]:
In [ ]:
In [ ]: