{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Random sentences from part-of-speech templates\n",
    "\n",
    "Tags every word of `language.txt` with NLTK, picks a random line of the same file as a grammatical template, and fills each part-of-speech slot of that template with a random corpus word carrying the same tag."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "from pathlib import Path\n",
    "\n",
    "import nltk"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "CORPUS_PATH = Path('language.txt')\n",
    "PUNCTUATION = '.,():;'  # characters stripped from the corpus before tagging\n",
    "SEED = None             # set to an int to get the same sentences on every run\n",
    "s = ' '                 # separator placed between the words of a sentence\n",
    "\n",
    "random.seed(SEED)\n",
    "\n",
    "# nltk.pos_tag needs its tagger model installed; if it raises LookupError,\n",
    "# follow the nltk.download(...) hint given in the error message."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load and tag the corpus"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# read_text() opens, reads and closes the file in one call.\n",
    "corpus_raw = CORPUS_PATH.read_text()\n",
    "\n",
    "text = corpus_raw.translate(str.maketrans('', '', PUNCTUATION)).split()\n",
    "textSet = set(text)\n",
    "\n",
    "# Blank lines cannot serve as grammatical templates, so leave them out.\n",
    "textlines = [line for line in corpus_raw.split('\\n') if line.split()]\n",
    "if not textlines:\n",
    "    raise ValueError(f'{CORPUS_PATH} contains no text')\n",
    "\n",
    "# Tag the vocabulary in sorted order: the iteration order of a set of strings\n",
    "# changes from one kernel session to the next, and pos_tag takes neighbouring\n",
    "# tokens into account, so tagging the bare set would not be reproducible.\n",
    "tagged = nltk.pos_tag(sorted(textSet))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Penn Treebank tag list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Penn Treebank part-of-speech tags; every row is: number, TAB, tag, TAB, description.\n",
    "SIGLE_TABLE = '''\n",
    "1. \tCC \tCoordinating conjunction\n",
    "2. \tCD \tCardinal number\n",
    "3. \tDT \tDeterminer\n",
    "4. \tEX \tExistential there\n",
    "5. \tFW \tForeign word\n",
    "6. \tIN \tPreposition or subordinating conjunction\n",
    "7. \tJJ \tAdjective\n",
    "8. \tJJR \tAdjective, comparative\n",
    "9. \tJJS \tAdjective, superlative\n",
    "10. \tLS \tList item marker\n",
    "11. \tMD \tModal\n",
    "12. \tNN \tNoun, singular or mass\n",
    "13. \tNNS \tNoun, plural\n",
    "14. \tNNP \tProper noun, singular\n",
    "15. \tNNPS \tProper noun, plural\n",
    "16. \tPDT \tPredeterminer\n",
    "17. \tPOS \tPossessive ending\n",
    "18. \tPRP \tPersonal pronoun\n",
    "19. \tPRP$ \tPossessive pronoun\n",
    "20. \tRB \tAdverb\n",
    "21. \tRBR \tAdverb, comparative\n",
    "22. \tRBS \tAdverb, superlative\n",
    "23. \tRP \tParticle\n",
    "24. \tSYM \tSymbol\n",
    "25. \tTO \tto\n",
    "26. \tUH \tInterjection\n",
    "27. \tVB \tVerb, base form\n",
    "28. \tVBD \tVerb, past tense\n",
    "29. \tVBG \tVerb, gerund or present participle\n",
    "30. \tVBN \tVerb, past participle\n",
    "31. \tVBP \tVerb, non-3 person singular present\n",
    "32. \tVBZ \tVerb, 3 person singular present\n",
    "33. \tWDT \tWh-determiner\n",
    "34. \tWP \tWh-pronoun\n",
    "35. \tWP$ \tPossessive wh-pronoun\n",
    "36. \tWRB \tWh-adverb \n",
    "'''\n",
    "\n",
    "\n",
    "def parse_sigle(table):\n",
    "    \"\"\"Return the tag column (second tab-separated field) of every table row.\n",
    "\n",
    "    Reading the column directly keeps four-character tags such as NNPS and\n",
    "    PRP$, which a filter on token length would drop. Rows without three\n",
    "    tab-separated fields (the blank first line) are ignored.\n",
    "    \"\"\"\n",
    "    rows = (row.split('\\t') for row in table.splitlines())\n",
    "    return [fields[1].strip() for fields in rows if len(fields) >= 3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sigle = parse_sigle(SIGLE_TABLE)\n",
    "\n",
    "assert 'NNPS' in sigle and 'PRP$' in sigle, 'four-character tags must be kept'\n",
    "len(sigle)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Group the corpus words by tag"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def my_dict(gr, data=None, grlista=None):\n",
    "    \"\"\"Append every corpus word tagged `gr` to `grlista` and return the list.\n",
    "\n",
    "    gr      -- part-of-speech tag to collect, e.g. 'NN'\n",
    "    data    -- unused; kept so existing three-argument calls still work\n",
    "    grlista -- list to extend; a new list is created when omitted\n",
    "\n",
    "    Reads the module-level `tagged` list of (word, tag) pairs.\n",
    "    \"\"\"\n",
    "    if grlista is None:\n",
    "        grlista = []\n",
    "    grlista.extend(word for word, tag in tagged if tag == gr)\n",
    "    return grlista"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# For every tag seen in the corpus, the last word that received it.\n",
    "dataset = {tag: word for word, tag in tagged}\n",
    "\n",
    "# One list of candidate words per known tag; the list is empty when the\n",
    "# corpus has no word with that tag.\n",
    "words_by_tag = {gr: my_dict(gr) for gr in sigle}\n",
    "\n",
    "# Number of candidate words for each tag that actually occurs.\n",
    "{gr: len(words) for gr, words in words_by_tag.items() if words}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Pick a grammatical template\n",
    "\n",
    "Re-run the second cell below to draw another line of the corpus."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def grammar_template(line):\n",
    "    \"\"\"Tag one line of text and return `(keys, dat)`.\n",
    "\n",
    "    keys -- the distinct tags of the line, in order of first appearance\n",
    "    dat  -- for each of those tags, the last word of the line that carries it\n",
    "    \"\"\"\n",
    "    dat = {}\n",
    "    for word, tag in nltk.pos_tag(line.split()):\n",
    "        dat[tag] = word\n",
    "    return list(dat), dat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "line = random.choice(textlines)\n",
    "keys, dat = grammar_template(line)\n",
    "\n",
    "s.join(keys)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Generate a sentence\n",
    "\n",
    "Re-run the last cell to draw new words for the same template."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_sentence(keys, words_by_tag, dat, sep=' '):\n",
    "    \"\"\"Fill every tag of `keys` with a random corpus word carrying that tag.\n",
    "\n",
    "    keys         -- sequence of tags forming the template\n",
    "    words_by_tag -- dict mapping a tag to its list of candidate words\n",
    "    dat          -- dict mapping each tag of `keys` to a fallback word\n",
    "    sep          -- string placed between the chosen words\n",
    "\n",
    "    A tag with no candidate in `words_by_tag` keeps its fallback word from\n",
    "    `dat`, so a sentence can always be built.\n",
    "    \"\"\"\n",
    "    chosen = []\n",
    "    for tag in keys:\n",
    "        candidates = words_by_tag.get(tag)\n",
    "        chosen.append(random.choice(candidates) if candidates else dat[tag])\n",
    "    return sep.join(chosen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "generate_sentence(keys, words_by_tag, dat, s)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}