SI13_federico_patches/NaturalSentences.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "import nltk\n",
    "import random\n",
    "# Single space used as the separator when word variables are concatenated into a sentence.\n",
    "s = ' '"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the corpus inside a context manager so the file handle is closed right after use.\n",
    "with open('language.txt') as corpus_file:\n",
    "    text = corpus_file.read()\n",
    "\n",
    "# Drop the punctuation marks . , ( ) : ; in one pass so they do not stay glued to the words.\n",
    "text = text.translate(str.maketrans('', '', '.,():;'))\n",
    "text = text.split()\n",
    "textSet = set(text)\n",
    "\n",
    "# Tag the unique words in sorted order: a set of strings has no stable iteration order\n",
    "# across kernel restarts, so passing it directly makes the tagger input irreproducible.\n",
    "tagged = nltk.pos_tag(sorted(textSet))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the list of Penn Treebank POS tag abbreviations; later cells use them to name the per-tag word lists.\n",
    "\n",
    "# One row per tag: number, TAB, tag, TAB, description.\n",
    "tagset_table = '''\n",
    "1. \tCC \tCoordinating conjunction\n",
    "2. \tCD \tCardinal number\n",
    "3. \tDT \tDeterminer\n",
    "4. \tEX \tExistential there\n",
    "5. \tFW \tForeign word\n",
    "6. \tIN \tPreposition or subordinating conjunction\n",
    "7. \tJJ \tAdjective\n",
    "8. \tJJR \tAdjective, comparative\n",
    "9. \tJJS \tAdjective, superlative\n",
    "10. \tLS \tList item marker\n",
    "11. \tMD \tModal\n",
    "12. \tNN \tNoun, singular or mass\n",
    "13. \tNNS \tNoun, plural\n",
    "14. \tNNP \tProper noun, singular\n",
    "15. \tNNPS \tProper noun, plural\n",
    "16. \tPDT \tPredeterminer\n",
    "17. \tPOS \tPossessive ending\n",
    "18. \tPRP \tPersonal pronoun\n",
    "19. \tPRP$ \tPossessive pronoun\n",
    "20. \tRB \tAdverb\n",
    "21. \tRBR \tAdverb, comparative\n",
    "22. \tRBS \tAdverb, superlative\n",
    "23. \tRP \tParticle\n",
    "24. \tSYM \tSymbol\n",
    "25. \tTO \tto\n",
    "26. \tUH \tInterjection\n",
    "27. \tVB \tVerb, base form\n",
    "28. \tVBD \tVerb, past tense\n",
    "29. \tVBG \tVerb, gerund or present participle\n",
    "30. \tVBN \tVerb, past participle\n",
    "31. \tVBP \tVerb, non-3  person singular present\n",
    "32. \tVBZ \tVerb, 3  person singular present\n",
    "33. \tWDT \tWh-determiner\n",
    "34. \tWP \tWh-pronoun\n",
    "35. \tWP$ \tPossessive wh-pronoun\n",
    "36. \tWRB \tWh-adverb \n",
    "'''\n",
    "\n",
    "# Take the tag column of every row directly. Filtering whitespace-split tokens by length\n",
    "# (the previous approach) dropped the four-character tags such as NNPS and PRP$.\n",
    "# Rows without two tabs (the blank first and last lines of the table) are skipped.\n",
    "sigle = [row.split('\\t')[1].strip() for row in tagset_table.splitlines() if row.count('\\t') >= 2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "RB + s + NN + s + VBZ + s + JJ + s + NNS + s + IN + s + DT + s + WDT + s + EX + s + VBP + s + VBN + s + CC + s + NNP\n"
     ]
    }
   ],
   "source": [
    "# Prepare a grammar construction (a sequence of POS tags) from a randomly picked line of the text\n",
    "\n",
    "s = ' '\n",
    "with open('language.txt') as corpus_file:  # closed automatically when the block ends\n",
    "    # keep only lines that contain words, so a blank line never yields an empty template\n",
    "    textlines = [row for row in corpus_file if row.strip()]\n",
    "line = random.choice(textlines)\n",
    "sentence = nltk.pos_tag(line.split())\n",
    "\n",
    "# dict.fromkeys keeps each tag once, in order of first appearance\n",
    "pattern = list(dict.fromkeys(tag for word, tag in sentence))\n",
    "\n",
    "# '$' cannot be part of a Python name; show it as 'ç' like the generated <TAG>lista names do\n",
    "print(' + s + '.join(pattern).replace('$','ç')) #copy-paste the result to generate sentences"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CClista = []\n",
      "my_dict('CC','dataCC',CClista)\n",
      "\n",
      "\n",
      "CDlista = []\n",
      "my_dict('CD','dataCD',CDlista)\n",
      "\n",
      "\n",
      "DTlista = []\n",
      "my_dict('DT','dataDT',DTlista)\n",
      "\n",
      "\n",
      "EXlista = []\n",
      "my_dict('EX','dataEX',EXlista)\n",
      "\n",
      "\n",
      "FWlista = []\n",
      "my_dict('FW','dataFW',FWlista)\n",
      "\n",
      "\n",
      "INlista = []\n",
      "my_dict('IN','dataIN',INlista)\n",
      "\n",
      "\n",
      "JJlista = []\n",
      "my_dict('JJ','dataJJ',JJlista)\n",
      "\n",
      "\n",
      "JJRlista = []\n",
      "my_dict('JJR','dataJJR',JJRlista)\n",
      "\n",
      "\n",
      "JJSlista = []\n",
      "my_dict('JJS','dataJJS',JJSlista)\n",
      "\n",
      "\n",
      "LSlista = []\n",
      "my_dict('LS','dataLS',LSlista)\n",
      "\n",
      "\n",
      "MDlista = []\n",
      "my_dict('MD','dataMD',MDlista)\n",
      "\n",
      "\n",
      "NNlista = []\n",
      "my_dict('NN','dataNN',NNlista)\n",
      "\n",
      "\n",
      "NNSlista = []\n",
      "my_dict('NNS','dataNNS',NNSlista)\n",
      "\n",
      "\n",
      "NNPlista = []\n",
      "my_dict('NNP','dataNNP',NNPlista)\n",
      "\n",
      "\n",
      "PDTlista = []\n",
      "my_dict('PDT','dataPDT',PDTlista)\n",
      "\n",
      "\n",
      "POSlista = []\n",
      "my_dict('POS','dataPOS',POSlista)\n",
      "\n",
      "\n",
      "PRPlista = []\n",
      "my_dict('PRP','dataPRP',PRPlista)\n",
      "\n",
      "\n",
      "RBlista = []\n",
      "my_dict('RB','dataRB',RBlista)\n",
      "\n",
      "\n",
      "RBRlista = []\n",
      "my_dict('RBR','dataRBR',RBRlista)\n",
      "\n",
      "\n",
      "RBSlista = []\n",
      "my_dict('RBS','dataRBS',RBSlista)\n",
      "\n",
      "\n",
      "RPlista = []\n",
      "my_dict('RP','dataRP',RPlista)\n",
      "\n",
      "\n",
      "SYMlista = []\n",
      "my_dict('SYM','dataSYM',SYMlista)\n",
      "\n",
      "\n",
      "TOlista = []\n",
      "my_dict('TO','dataTO',TOlista)\n",
      "\n",
      "\n",
      "UHlista = []\n",
      "my_dict('UH','dataUH',UHlista)\n",
      "\n",
      "\n",
      "VBlista = []\n",
      "my_dict('VB','dataVB',VBlista)\n",
      "\n",
      "\n",
      "VBDlista = []\n",
      "my_dict('VBD','dataVBD',VBDlista)\n",
      "\n",
      "\n",
      "VBGlista = []\n",
      "my_dict('VBG','dataVBG',VBGlista)\n",
      "\n",
      "\n",
      "VBNlista = []\n",
      "my_dict('VBN','dataVBN',VBNlista)\n",
      "\n",
      "\n",
      "VBPlista = []\n",
      "my_dict('VBP','dataVBP',VBPlista)\n",
      "\n",
      "\n",
      "VBZlista = []\n",
      "my_dict('VBZ','dataVBZ',VBZlista)\n",
      "\n",
      "\n",
      "WDTlista = []\n",
      "my_dict('WDT','dataWDT',WDTlista)\n",
      "\n",
      "\n",
      "WPlista = []\n",
      "my_dict('WP','dataWP',WPlista)\n",
      "\n",
      "\n",
      "WRBlista = []\n",
      "my_dict('WRB','dataWRB',WRBlista)\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Print the boilerplate (one empty list plus one my_dict call per tag) that fills the per-tag word lists.\n",
    "for gr in sigle:\n",
    "    # '$' is not allowed in a Python name, so only the identifiers get 'ç';\n",
    "    # the tag literal handed to my_dict keeps '$' because my_dict compares it with the tagger's tag.\n",
    "    safe = gr.replace('$','ç')\n",
    "    print(f'{safe}lista = []')\n",
    "    print(f'''my_dict('{gr}','data{safe}',{safe}lista)''')\n",
    "    print('\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CC = random.choice(CClista)\n",
      "CD = random.choice(CDlista)\n",
      "DT = random.choice(DTlista)\n",
      "EX = random.choice(EXlista)\n",
      "IN = random.choice(INlista)\n",
      "JJ = random.choice(JJlista)\n",
      "JJS = random.choice(JJSlista)\n",
      "MD = random.choice(MDlista)\n",
      "NN = random.choice(NNlista)\n",
      "NNS = random.choice(NNSlista)\n",
      "NNP = random.choice(NNPlista)\n",
      "PRP = random.choice(PRPlista)\n",
      "RB = random.choice(RBlista)\n",
      "TO = random.choice(TOlista)\n",
      "VB = random.choice(VBlista)\n",
      "VBD = random.choice(VBDlista)\n",
      "VBG = random.choice(VBGlista)\n",
      "VBN = random.choice(VBNlista)\n",
      "VBP = random.choice(VBPlista)\n",
      "VBZ = random.choice(VBZlista)\n",
      "WDT = random.choice(WDTlista)\n",
      "WP = random.choice(WPlista)\n"
     ]
    }
   ],
   "source": [
    "# Last word seen for each POS tag; rewritten as a side effect of every my_dict call.\n",
    "dataset = {}\n",
    "\n",
    "def my_dict(gr,data,grlista):\n",
    "    '''Collect the words tagged `gr` into `grlista` and print a picker line for them.\n",
    "\n",
    "    gr      -- POS tag to look for in the module-level `tagged` pairs.\n",
    "    data    -- accepted for the existing call sites but never read.\n",
    "    grlista -- list that receives, in place, every word whose tag equals `gr`.\n",
    "\n",
    "    Prints '<gr> = random.choice(<gr>lista)' (with '$' shown as 'ç') once when at\n",
    "    least one word matched, and nothing otherwise. Returns None.\n",
    "    '''\n",
    "    found = False\n",
    "    for token, pos in tagged:\n",
    "        dataset[pos] = token  # remember the latest word of every tag, matched or not\n",
    "        if pos != gr:\n",
    "            continue\n",
    "        grlista.append(token)\n",
    "        found = True\n",
    "\n",
    "    # only tags that actually occur get a picker line, so random.choice is never suggested for an empty list\n",
    "    if found:\n",
    "        picker = f'{gr} = random.choice({gr}lista)'.replace('$','ç')\n",
    "        print(picker)\n",
    "            \n",
    "# Build one <TAG>lista word list per tag straight from `sigle`, instead of pasting generated code\n",
    "# that goes stale whenever the tag list changes.\n",
    "# The lists are stored as notebook globals because later cells refer to them by name (e.g. CClista).\n",
    "for gr in sigle:\n",
    "    safe = gr.replace('$','ç')  # '$' cannot appear in a Python name\n",
    "    globals()[f'{safe}lista'] = []\n",
    "    my_dict(gr, f'data{safe}', globals()[f'{safe}lista'])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pick a fresh random word for every tag that has at least one word; re-run this cell for a new draw.\n",
    "# Driven by `sigle` rather than a pasted list, so a tag whose list is empty (or was never built)\n",
    "# is skipped instead of making random.choice raise.\n",
    "\n",
    "for gr in sigle:\n",
    "    safe = gr.replace('$','ç')  # same identifier spelling as the <TAG>lista names\n",
    "    words = globals().get(f'{safe}lista')\n",
    "    if words:\n",
    "        globals()[safe] = random.choice(words)  # e.g. NN = random.choice(NNlista)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'however process is so-called layers since these which There are implemented and Yet'"
      ]
     },
     "execution_count": 114,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Join the randomly chosen words, in the order of the sampled tag pattern, with the separator s\n",
    "s.join([RB, NN, VBZ, JJ, NNS, IN, DT, WDT, EX, VBP, VBN, CC, NNP])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}