You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

14 KiB

In [48]:
import nltk
import random
s = ' '
In [49]:
# Load the corpus, strip basic punctuation and POS-tag its distinct tokens.
# The context manager closes the file handle as soon as the text is read.
with open('language.txt') as corpus:
    # One translate() pass deletes the same six characters the chained
    # replace() calls removed: . , ( ) : ;
    text = corpus.read().translate(str.maketrans('', '', '.,():;'))
text = text.split()
# A set keeps one copy of every token; set iteration order is arbitrary, so
# the tagger does not receive the words in their original sentence order.
textSet = set(text)
tagged = nltk.pos_tag(textSet)
In [50]:
# Build the list of part-of-speech tag abbreviations; they are used as the
# labels for the per-tag word lists.

sigle = '''
1. 	CC 	Coordinating conjunction
2. 	CD 	Cardinal number
3. 	DT 	Determiner
4. 	EX 	Existential there
5. 	FW 	Foreign word
6. 	IN 	Preposition or subordinating conjunction
7. 	JJ 	Adjective
8. 	JJR 	Adjective, comparative
9. 	JJS 	Adjective, superlative
10. 	LS 	List item marker
11. 	MD 	Modal
12. 	NN 	Noun, singular or mass
13. 	NNS 	Noun, plural
14. 	NNP 	Proper noun, singular
15. 	NNPS 	Proper noun, plural
16. 	PDT 	Predeterminer
17. 	POS 	Possessive ending
18. 	PRP 	Personal pronoun
19. 	PRP$ 	Possessive pronoun
20. 	RB 	Adverb
21. 	RBR 	Adverb, comparative
22. 	RBS 	Adverb, superlative
23. 	RP 	Particle
24. 	SYM 	Symbol
25. 	TO 	to
26. 	UH 	Interjection
27. 	VB 	Verb, base form
28. 	VBD 	Verb, past tense
29. 	VBG 	Verb, gerund or present participle
30. 	VBN 	Verb, past participle
31. 	VBP 	Verb, non-3  person singular present
32. 	VBZ 	Verb, 3  person singular present
33. 	WDT 	Wh-determiner
34. 	WP 	Wh-pronoun
36. 	WRB 	Wh-adverb 
'''

# Short description words that would otherwise pass the length/digit filter.
_non_tags = ('or', 'to', '3rd')
# Dropping the dots turns the row numbers ("1.") into plain digits.
_tokens = sigle.replace('.', '').split()
# NOTE(review): the three-character cap also drops the four-character tags
# NNPS and PRP$ -- confirm that this is intended.
sigle = [
    token
    for token in _tokens
    if len(token) <= 3 and not token.isdigit() and token not in _non_tags
]
In [97]:
# Prepare a grammar construction based on one randomly picked line of the text.

s = ' '
# Read the lines inside a context manager so the file handle is closed.
with open('language.txt') as corpus:
    textlines = corpus.readlines()
nl = len(textlines)
r = random.randrange(0, nl)
line = textlines[r]
sentence = nltk.pos_tag(line.split())

# Map each tag to the last word that carries it. Only the keys are used
# below; they keep the order in which each tag first appears in the line.
dat = {tag: word for word, tag in sentence}

keys = dat.keys()

# Print the tags joined by " + s + "; copy-paste the result to generate sentences.
print(" + s + ".join(keys))
RB + s + NN + s + VBZ + s + JJ + s + NNS + s + IN + s + DT + s + WDT + s + EX + s + VBP + s + VBN + s + CC + s + NNP
In [51]:
# For every tag, print the source lines that create its word list and fill it
# through my_dict(); the printed text is meant to be pasted into a later cell.
for gr in sigle:
    # Substituting once on the tag gives the same text as substituting on each
    # formatted line, because the templates themselves contain no '$'.
    # Presumably 'ç' stands in for '$' because '$' cannot appear in a Python name.
    ident = gr.replace('$', 'ç')
    print(f'{ident}lista = []')
    print(f"my_dict('{ident}','data{ident}',{ident}lista)")
    print('\n')
CClista = []
my_dict('CC','dataCC',CClista)


CDlista = []
my_dict('CD','dataCD',CDlista)


DTlista = []
my_dict('DT','dataDT',DTlista)


EXlista = []
my_dict('EX','dataEX',EXlista)


FWlista = []
my_dict('FW','dataFW',FWlista)


INlista = []
my_dict('IN','dataIN',INlista)


JJlista = []
my_dict('JJ','dataJJ',JJlista)


JJRlista = []
my_dict('JJR','dataJJR',JJRlista)


JJSlista = []
my_dict('JJS','dataJJS',JJSlista)


LSlista = []
my_dict('LS','dataLS',LSlista)


MDlista = []
my_dict('MD','dataMD',MDlista)


NNlista = []
my_dict('NN','dataNN',NNlista)


NNSlista = []
my_dict('NNS','dataNNS',NNSlista)


NNPlista = []
my_dict('NNP','dataNNP',NNPlista)


PDTlista = []
my_dict('PDT','dataPDT',PDTlista)


POSlista = []
my_dict('POS','dataPOS',POSlista)


PRPlista = []
my_dict('PRP','dataPRP',PRPlista)


RBlista = []
my_dict('RB','dataRB',RBlista)


RBRlista = []
my_dict('RBR','dataRBR',RBRlista)


RBSlista = []
my_dict('RBS','dataRBS',RBSlista)


RPlista = []
my_dict('RP','dataRP',RPlista)


SYMlista = []
my_dict('SYM','dataSYM',SYMlista)


TOlista = []
my_dict('TO','dataTO',TOlista)


UHlista = []
my_dict('UH','dataUH',UHlista)


VBlista = []
my_dict('VB','dataVB',VBlista)


VBDlista = []
my_dict('VBD','dataVBD',VBDlista)


VBGlista = []
my_dict('VBG','dataVBG',VBGlista)


VBNlista = []
my_dict('VBN','dataVBN',VBNlista)


VBPlista = []
my_dict('VBP','dataVBP',VBPlista)


VBZlista = []
my_dict('VBZ','dataVBZ',VBZlista)


WDTlista = []
my_dict('WDT','dataWDT',WDTlista)


WPlista = []
my_dict('WP','dataWP',WPlista)


WRBlista = []
my_dict('WRB','dataWRB',WRBlista)


In [52]:
dataset = {} 

def my_dict(gr,data,grlista):
    """Store in ``grlista`` every tagged word whose POS tag equals ``gr``.

    Args:
        gr: POS tag to collect, e.g. ``'NN'``.
        data: Unused; kept so existing three-argument calls keep working.
        grlista: List that the matching words are appended to, in place.

    Side effects:
        Writes ``dataset[tag] = word`` for every pair in the module-level
        ``tagged`` (a later word overwrites an earlier one with the same tag).
        If at least one word matched during this call, prints one
        ``<gr> = random.choice(<gr>lista)`` line with '$' replaced by 'ç';
        prints nothing when no word matched.
    """
    matched = False
    for word, tag in tagged:
        dataset[tag] = word
        if tag == gr:
            grlista.append(word)
            matched = True
    # Print the random-picker line only for tags that actually occur, so the
    # pasted result never calls random.choice() on an empty list.
    if matched:
        print(f'{gr} = random.choice({gr}lista)'.replace('$','ç'))
            
# Same effect as the generated lines pasted from the cell above: for each tag,
# create the module-level list <TAG>lista and let my_dict() fill it.
for _tag in (
    'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD',
    'NN', 'NNS', 'NNP', 'PDT', 'POS', 'PRP', 'RB', 'RBR', 'RBS', 'RP',
    'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT',
    'WP', 'WRB',
):
    # The lists are bound under their original global names because other
    # cells refer to them directly (for example random.choice(CClista)).
    globals()[f'{_tag}lista'] = []
    my_dict(_tag, f'data{_tag}', globals()[f'{_tag}lista'])
CC = random.choice(CClista)
CD = random.choice(CDlista)
DT = random.choice(DTlista)
EX = random.choice(EXlista)
IN = random.choice(INlista)
JJ = random.choice(JJlista)
JJS = random.choice(JJSlista)
MD = random.choice(MDlista)
NN = random.choice(NNlista)
NNS = random.choice(NNSlista)
NNP = random.choice(NNPlista)
PRP = random.choice(PRPlista)
RB = random.choice(RBlista)
TO = random.choice(TOlista)
VB = random.choice(VBlista)
VBD = random.choice(VBDlista)
VBG = random.choice(VBGlista)
VBN = random.choice(VBNlista)
VBP = random.choice(VBPlista)
VBZ = random.choice(VBZlista)
WDT = random.choice(WDTlista)
WP = random.choice(WPlista)
In [113]:
# Pasted from the lines my_dict() printed above. Re-running this cell draws a
# fresh random word for every part-of-speech tag that had at least one word.

CC = random.choice(CClista)
CD = random.choice(CDlista)
DT = random.choice(DTlista)
EX = random.choice(EXlista)
IN = random.choice(INlista)
JJ = random.choice(JJlista)
JJS = random.choice(JJSlista)
MD = random.choice(MDlista)
NN = random.choice(NNlista)
NNS = random.choice(NNSlista)
NNP = random.choice(NNPlista)
PRP = random.choice(PRPlista)
RB = random.choice(RBlista)
TO = random.choice(TOlista)
VB = random.choice(VBlista)
VBD = random.choice(VBDlista)
VBG = random.choice(VBGlista)
VBN = random.choice(VBNlista)
VBP = random.choice(VBPlista)
VBZ = random.choice(VBZlista)
WDT = random.choice(WDTlista)
WP = random.choice(WPlista)
In [ ]:
 
In [ ]:
 
In [114]:
# Sentence template pasted from the tag skeleton printed by the grammar cell:
# each name holds the word drawn for that tag, and s is the one-space separator.
RB + s + NN + s + VBZ + s + JJ + s + NNS + s + IN + s + DT + s + WDT + s + EX + s + VBP + s + VBN + s + CC + s + NNP
Out[114]:
'however process is so-called layers since these which There are implemented and Yet'
In [ ]:
 
In [ ]:
 
In [ ]: