You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

746 lines
37 KiB
Plaintext

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from pattern.search import STRICT, search\n",
"from pattern.en import parsetree"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Documentation for `pattern.search`: https://github.com/clips/pattern/wiki/pattern-search\n",
"(inspired by the search in [videogrep](https://github.com/antiboredom/videogrep/blob/master/videogrep/searcher.py))"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Read the whole essay into memory. The `with` block closes the file handle\n",
"# as soon as the text is read instead of leaving it to the garbage collector.\n",
"# The text contains non-ASCII characters, so the encoding is spelled out\n",
"# rather than taken from the locale (assumes the file is UTF-8 - TODO confirm).\n",
"with open(\"../txt/words-for-the-future/OTHERNESS.txt\", encoding=\"utf-8\") as source_file:\n",
"    text = source_file.read()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Otherness | Daniel L. Everett\\n\\nWhen I was 26, I moved to the Amazon, from California, in order to st'"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"text[:100]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"outputs": [
{
"ename": "RuntimeError",
"evalue": "generator raised StopIteration",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mStopIteration\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m~/.local/lib/python3.7/site-packages/pattern/text/__init__.py\u001b[0m in \u001b[0;36m_read\u001b[0;34m(path, encoding, comment)\u001b[0m\n\u001b[1;32m 608\u001b[0m \u001b[0;32myield\u001b[0m \u001b[0mline\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 609\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mStopIteration\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 610\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mStopIteration\u001b[0m: ",
"\nThe above exception was the direct cause of the following exception:\n",
"\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-3-ac287bca8f52>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtree\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mparsetree\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m~/.local/lib/python3.7/site-packages/pattern/text/en/__init__.py\u001b[0m in \u001b[0;36mparsetree\u001b[0;34m(s, *args, **kwargs)\u001b[0m\n\u001b[1;32m 173\u001b[0m \"\"\" Returns a parsed Text from the given string.\n\u001b[1;32m 174\u001b[0m \"\"\"\n\u001b[0;32m--> 175\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mText\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mparse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 176\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 177\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.7/site-packages/pattern/text/en/__init__.py\u001b[0m in \u001b[0;36mparse\u001b[0;34m(s, *args, **kwargs)\u001b[0m\n\u001b[1;32m 167\u001b[0m \"\"\" Returns a tagged Unicode string.\n\u001b[1;32m 168\u001b[0m \"\"\"\n\u001b[0;32m--> 169\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mparser\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mparse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 170\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 171\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.7/site-packages/pattern/text/__init__.py\u001b[0m in \u001b[0;36mparse\u001b[0;34m(self, s, tokenize, tags, chunks, relations, lemmata, encoding, **kwargs)\u001b[0m\n\u001b[1;32m 1170\u001b[0m \u001b[0;31m# Tagger (required by chunker, labeler & lemmatizer).\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1171\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mtags\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mchunks\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mrelations\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mlemmata\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1172\u001b[0;31m \u001b[0ms\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfind_tags\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1173\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1174\u001b[0m \u001b[0ms\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mw\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mw\u001b[0m \u001b[0;32min\u001b[0m \u001b[0ms\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.7/site-packages/pattern/text/en/__init__.py\u001b[0m in \u001b[0;36mfind_tags\u001b[0;34m(self, tokens, **kwargs)\u001b[0m\n\u001b[1;32m 112\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"tagset\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mUNIVERSAL\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 113\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msetdefault\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"map\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mlambda\u001b[0m \u001b[0mtoken\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtag\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mpenntreebank2universal\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtoken\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtag\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 114\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_Parser\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfind_tags\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtokens\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 115\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 116\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.7/site-packages/pattern/text/__init__.py\u001b[0m in \u001b[0;36mfind_tags\u001b[0;34m(self, tokens, **kwargs)\u001b[0m\n\u001b[1;32m 1111\u001b[0m \u001b[0;31m# [\"The\", \"cat\", \"purs\"] => [[\"The\", \"DT\"], [\"cat\", \"NN\"], [\"purs\", \"VB\"]]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1112\u001b[0m return find_tags(tokens,\n\u001b[0;32m-> 1113\u001b[0;31m \u001b[0mlexicon\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"lexicon\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlexicon\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1114\u001b[0m \u001b[0mmodel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"model\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1115\u001b[0m \u001b[0mmorphology\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"morphology\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmorphology\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.7/site-packages/pattern/text/__init__.py\u001b[0m in \u001b[0;36m__len__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 374\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 375\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__len__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 376\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_lazy\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"__len__\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 377\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 378\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__iter__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.7/site-packages/pattern/text/__init__.py\u001b[0m in \u001b[0;36m_lazy\u001b[0;34m(self, method, *args)\u001b[0m\n\u001b[1;32m 366\u001b[0m \"\"\"\n\u001b[1;32m 367\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mdict\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__len__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 368\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 369\u001b[0m \u001b[0msetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtypes\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mMethodType\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdict\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 370\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdict\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.7/site-packages/pattern/text/__init__.py\u001b[0m in \u001b[0;36mload\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 623\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 624\u001b[0m \u001b[0;31m# Arnold NNP x\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 625\u001b[0;31m \u001b[0mdict\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\" \"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0m_read\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_path\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\" \"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 626\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 627\u001b[0m \u001b[0;31m#--- FREQUENCY -------------------------------------------------------------------------------------\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.local/lib/python3.7/site-packages/pattern/text/__init__.py\u001b[0m in \u001b[0;36m<genexpr>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 623\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 624\u001b[0m \u001b[0;31m# Arnold NNP x\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 625\u001b[0;31m \u001b[0mdict\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\" \"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0m_read\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_path\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\" \"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 626\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 627\u001b[0m \u001b[0;31m#--- FREQUENCY -------------------------------------------------------------------------------------\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mRuntimeError\u001b[0m: generator raised StopIteration"
]
}
],
"source": [
"# pattern's `_read` generator finishes with `raise StopIteration`; on Python 3.7+\n",
"# (PEP 479) that surfaces as `RuntimeError: generator raised StopIteration` the\n",
"# first time a lazily loaded resource (e.g. the lexicon) is read.\n",
"# `_lazy` only calls `load()` while the resource is still empty, so presumably a\n",
"# retry finds it already populated and gets further - one retry per resource.\n",
"# The cap of 10 attempts is a generous guess at the number of lazy resources.\n",
"for attempts_left in reversed(range(10)):\n",
"    try:\n",
"        tree = parsetree(text)\n",
"        break\n",
"    except RuntimeError as error:\n",
"        # Re-raise unrelated errors, and the known one once retries run out,\n",
"        # so `tree` is never silently left undefined.\n",
"        if attempts_left == 0 or \"StopIteration\" not in str(error):\n",
"            raise"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'tree' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-5-7a1e081e78d4>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtree\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mNameError\u001b[0m: name 'tree' is not defined"
]
}
],
"source": [
"tree"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Sentence(\"This/DT/O/O encounter/RB/B-ADVP/O with/IN/B-PP/O these/DT/O/O /''/O/O others/NNS/B-NP/O ,/,/O/O /''/O/O so/RB/B-ADVP/O unlike/IN/B-PP/B-PNP myself/PRP/B-NP/I-PNP ,/,/O/O was/VBD/B-VP/O to/TO/I-VP/O be/VB/I-VP/O the/DT/O/O defining/VBG/B-VP/O experience/NN/B-NP/O for/IN/B-PP/B-PNP the/DT/B-NP/I-PNP rest/NN/I-NP/I-PNP of/IN/B-PP/B-PNP my/PRP$/B-NP/I-PNP life/NN/I-NP/I-PNP ././O/O\")"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tree[7]"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"outputs": [
{
"data": {
"text/plain": [
"[Match(words=[Word('unrelated/JJ')]),\n",
" Match(words=[Word('other/JJ')]),\n",
" Match(words=[Word('small/JJ')]),\n",
" Match(words=[Word('missionary/JJ')]),\n",
" Match(words=[Word('bumpy/JJ')]),\n",
" Match(words=[Word('first/JJ')]),\n",
" Match(words=[Word('weak/JJ')]),\n",
" Match(words=[Word('taut/JJ')]),\n",
" Match(words=[Word('unrelated/JJ')]),\n",
" Match(words=[Word('other/JJ')]),\n",
" Match(words=[Word('many/JJ')]),\n",
" Match(words=[Word('little/JJ')]),\n",
" Match(words=[Word('easy/JJ')]),\n",
" Match(words=[Word('enough/JJ')]),\n",
" Match(words=[Word('other/JJ')]),\n",
" Match(words=[Word('uncomfortable/JJ')]),\n",
" Match(words=[Word('suspicious/JJ')]),\n",
" Match(words=[Word('new/JJ')]),\n",
" Match(words=[Word('different/JJ')]),\n",
" Match(words=[Word('simple/JJ')]),\n",
" Match(words=[Word('binary/JJ')]),\n",
" Match(words=[Word('old/JJ')]),\n",
" Match(words=[Word('religious/JJ')]),\n",
" Match(words=[Word('other/JJ')]),\n",
" Match(words=[Word('different/JJ')]),\n",
" Match(words=[Word('unintelligible/JJ')]),\n",
" Match(words=[Word('different-looking/JJ')]),\n",
" Match(words=[Word('otherness/JJ')]),\n",
" Match(words=[Word('live/JJ')]),\n",
" Match(words=[Word('unacceptable/JJ')]),\n",
" Match(words=[Word('missionary/JJ')]),\n",
" Match(words=[Word('eternal/JJ')]),\n",
" Match(words=[Word('encounter/JJ')]),\n",
" Match(words=[Word('uneasy/JJ')]),\n",
" Match(words=[Word('dangerous/JJ')]),\n",
" Match(words=[Word('insufficient/JJ')]),\n",
" Match(words=[Word('ethno-centric/JJ')]),\n",
" Match(words=[Word('own/JJ')]),\n",
" Match(words=[Word('fortunate/JJ')]),\n",
" Match(words=[Word('gentle/JJ')]),\n",
" Match(words=[Word('many/JJ')]),\n",
" Match(words=[Word('silly/JJ')]),\n",
" Match(words=[Word('years-long/JJ')]),\n",
" Match(words=[Word('first/JJ')]),\n",
" Match(words=[Word('first/JJ')]),\n",
" Match(words=[Word('young/JJ')]),\n",
" Match(words=[Word('large/JJ')]),\n",
" Match(words=[Word('small/JJ')]),\n",
" Match(words=[Word('fresh/JJ')]),\n",
" Match(words=[Word('young/JJ')]),\n",
" Match(words=[Word('then-unintelligible/JJ')]),\n",
" Match(words=[Word('easy/JJ')]),\n",
" Match(words=[Word('polite/JJ')]),\n",
" Match(words=[Word('Many/JJ')]),\n",
" Match(words=[Word('other/JJ')]),\n",
" Match(words=[Word('Western/JJ')]),\n",
" Match(words=[Word('polite/JJ')]),\n",
" Match(words=[Word('Western/JJ')]),\n",
" Match(words=[Word('first/JJ')]),\n",
" Match(words=[Word('close/JJ')]),\n",
" Match(words=[Word('first/JJ')]),\n",
" Match(words=[Word('small/JJ')]),\n",
" Match(words=[Word('early/JJ')]),\n",
" Match(words=[Word('subsequent/JJ')]),\n",
" Match(words=[Word('individual/JJ')]),\n",
" Match(words=[Word('normal/JJ')]),\n",
" Match(words=[Word('comfortable/JJ')]),\n",
" Match(words=[Word('new/JJ')]),\n",
" Match(words=[Word('new/JJ')]),\n",
" Match(words=[Word('dissonant/JJ')]),\n",
" Match(words=[Word('steady/JJ')]),\n",
" Match(words=[Word('familiar/JJ')]),\n",
" Match(words=[Word('Comfort/JJ')]),\n",
" Match(words=[Word('acquired/JJ')]),\n",
" Match(words=[Word('different/JJ')]),\n",
" Match(words=[Word('different/JJ')]),\n",
" Match(words=[Word('sexual/JJ')]),\n",
" Match(words=[Word('different/JJ')]),\n",
" Match(words=[Word('biological/JJ')]),\n",
" Match(words=[Word('little/JJ')]),\n",
" Match(words=[Word('possible/JJ')]),\n",
" Match(words=[Word('worthwhile/JJ')]),\n",
" Match(words=[Word('obvious/JJ')]),\n",
" Match(words=[Word('first/JJ')]),\n",
" Match(words=[Word('other/JJ')]),\n",
" Match(words=[Word('behavioral/JJ')]),\n",
" Match(words=[Word('earliest/JJ')]),\n",
" Match(words=[Word('normal/JJ')]),\n",
" Match(words=[Word('correct/JJ')]),\n",
" Match(words=[Word('crucial/JJ')]),\n",
" Match(words=[Word('in-group/JJ')]),\n",
" Match(words=[Word('social/JJ')]),\n",
" Match(words=[Word('else/JJ')]),\n",
" Match(words=[Word('own/JJ')]),\n",
" Match(words=[Word('own/JJ')]),\n",
" Match(words=[Word('familiar/JJ')]),\n",
" Match(words=[Word('fit/JJ')]),\n",
" Match(words=[Word('several/JJ')]),\n",
" Match(words=[Word('Pirahã/JJ')]),\n",
" Match(words=[Word('full/JJ')]),\n",
" Match(words=[Word('noticed/JJ')]),\n",
" Match(words=[Word('old/JJ')]),\n",
" Match(words=[Word('sharp/JJ')]),\n",
" Match(words=[Word('30cm/JJ')]),\n",
" Match(words=[Word('dangerous/JJ')]),\n",
" Match(words=[Word('handed/JJ')]),\n",
" Match(words=[Word('non-life-threatening/JJ')]),\n",
" Match(words=[Word('necessary/JJ')]),\n",
" Match(words=[Word('Dutch/JJ')]),\n",
" Match(words=[Word('sharp/JJ')]),\n",
" Match(words=[Word('other/JJ')]),\n",
" Match(words=[Word('m)otherness/JJ')]),\n",
" Match(words=[Word('sure/JJ')]),\n",
" Match(words=[Word('able/JJ')]),\n",
" Match(words=[Word('occasional/JJ')]),\n",
" Match(words=[Word('interesting/JJ')]),\n",
" Match(words=[Word('crooked/JJ')]),\n",
" Match(words=[Word('straight/JJ')]),\n",
" Match(words=[Word('bizarre/JJ')]),\n",
" Match(words=[Word('American/JJ')]),\n",
" Match(words=[Word('missionary/JJ')]),\n",
" Match(words=[Word('excited/JJ')]),\n",
" Match(words=[Word('other/JJ')]),\n",
" Match(words=[Word('Pirahã/JJ')]),\n",
" Match(words=[Word('native/JJ')]),\n",
" Match(words=[Word('native/JJ')]),\n",
" Match(words=[Word('other/JJ')]),\n",
" Match(words=[Word('local/JJ')]),\n",
" Match(words=[Word('comfortable/JJ')]),\n",
" Match(words=[Word('similar/JJ')]),\n",
" Match(words=[Word('many/JJ')]),\n",
" Match(words=[Word('favorite/JJ')]),\n",
" Match(words=[Word('American/JJ')]),\n",
" Match(words=[Word('different/JJ')]),\n",
" Match(words=[Word('irrelevant/JJ')]),\n",
" Match(words=[Word('brilliant/JJ')]),\n",
" Match(words=[Word('boring/JJ')]),\n",
" Match(words=[Word('adjacent/JJ')]),\n",
" Match(words=[Word('full/JJ')]),\n",
" Match(words=[Word('brilliant/JJ')]),\n",
" Match(words=[Word('good/JJ')]),\n",
" Match(words=[Word('human/JJ')]),\n",
" Match(words=[Word('independent/JJ')]),\n",
" Match(words=[Word('natural/JJ')]),\n",
" Match(words=[Word('solitary/JJ')]),\n",
" Match(words=[Word('strange/JJ')]),\n",
" Match(words=[Word('slow/JJ')]),\n",
" Match(words=[Word('otherness/JJ')]),\n",
" Match(words=[Word('original/JJ')]),\n",
" Match(words=[Word('paradoxical/JJ')]),\n",
" Match(words=[Word('otherness/JJ')]),\n",
" Match(words=[Word('otherness/JJ')]),\n",
" Match(words=[Word('panoramic/JJ')]),\n",
" Match(words=[Word('own/JJ')]),\n",
" Match(words=[Word('unique/JJ')]),\n",
" Match(words=[Word('important/JJ')]),\n",
" Match(words=[Word('other/JJ')]),\n",
" Match(words=[Word('individual/JJ')]),\n",
" Match(words=[Word('s/JJ')]),\n",
" Match(words=[Word('small/JJ')]),\n",
" Match(words=[Word('read/JJ')]),\n",
" Match(words=[Word('possible/JJ')]),\n",
" Match(words=[Word('own/JJ')]),\n",
" Match(words=[Word('i]The/JJ')]),\n",
" Match(words=[Word('poor/JJ')]),\n",
" Match(words=[Word('good/JJ')]),\n",
" Match(words=[Word('measurable/JJ')]),\n",
" Match(words=[Word('daily/JJ')]),\n",
" Match(words=[Word('social/JJ')]),\n",
" Match(words=[Word('little/JJ')]),\n",
" Match(words=[Word('new/JJ')]),\n",
" Match(words=[Word('conceptual/JJ')]),\n",
" Match(words=[Word('cultural/JJ')]),\n",
" Match(words=[Word('social/JJ')]),\n",
" Match(words=[Word('predictable/JJ')]),\n",
" Match(words=[Word('predictable/JJ')]),\n",
" Match(words=[Word('other/JJ')]),\n",
" Match(words=[Word('thinking/JJ')]),\n",
" Match(words=[Word('strong/JJ')]),\n",
" Match(words=[Word('desirable/JJ')]),\n",
" Match(words=[Word('unexpected/JJ')]),\n",
" Match(words=[Word('constant/JJ')]),\n",
" Match(words=[Word('useful/JJ')]),\n",
" Match(words=[Word('biological/JJ')]),\n",
" Match(words=[Word('cognitive/JJ')]),\n",
" Match(words=[Word('cultural/JJ')]),\n",
" Match(words=[Word('unsuccessful/JJ')]),\n",
" Match(words=[Word('strange/JJ')]),\n",
" Match(words=[Word('successful/JJ')]),\n",
" Match(words=[Word('own/JJ')]),\n",
" Match(words=[Word('little/JJ')]),\n",
" Match(words=[Word('such/JJ')]),\n",
" Match(words=[Word('political/JJ')]),\n",
" Match(words=[Word('important/JJ')]),\n",
" Match(words=[Word('new/JJ')]),\n",
" Match(words=[Word('new/JJ')]),\n",
" Match(words=[Word('unable/JJ')]),\n",
" Match(words=[Word('little/JJ')]),\n",
" Match(words=[Word('little/JJ')]),\n",
" Match(words=[Word('18th/JJ')]),\n",
" Match(words=[Word('identical/JJ')]),\n",
" Match(words=[Word('light/JJ')]),\n",
" Match(words=[Word('multiple/JJ')]),\n",
" Match(words=[Word('familiar/JJ')]),\n",
" Match(words=[Word('other/JJ')]),\n",
" Match(words=[Word('new/JJ')]),\n",
" Match(words=[Word('new/JJ')]),\n",
" Match(words=[Word('same/JJ')]),\n",
" Match(words=[Word('same/JJ')]),\n",
" Match(words=[Word('same/JJ')]),\n",
" Match(words=[Word('same/JJ')]),\n",
" Match(words=[Word('same/JJ')]),\n",
" Match(words=[Word('same/JJ')]),\n",
" Match(words=[Word('new/JJ')]),\n",
" Match(words=[Word('anti-immigration/JJ')]),\n",
" Match(words=[Word('political/JJ')]),\n",
" Match(words=[Word('otherness/JJ')]),\n",
" Match(words=[Word('otherness/JJ')]),\n",
" Match(words=[Word('motivated/JJ')]),\n",
" Match(words=[Word('ultimate/JJ')]),\n",
" Match(words=[Word('otherness/JJ')]),\n",
" Match(words=[Word('cognitive/JJ')]),\n",
" Match(words=[Word('new/JJ')]),\n",
" Match(words=[Word('new/JJ')]),\n",
" Match(words=[Word('own/JJ')]),\n",
" Match(words=[Word('only/JJ')]),\n",
" Match(words=[Word('otherness/JJ')]),\n",
" Match(words=[Word('otherness/JJ')]),\n",
" Match(words=[Word('invented/JJ')]),\n",
" Match(words=[Word('communal/JJ')]),\n",
" Match(words=[Word('cultural/JJ')]),\n",
" Match(words=[Word('cultural/JJ')]),\n",
" Match(words=[Word('human/JJ')]),\n",
" Match(words=[Word('other/JJ')]),\n",
" Match(words=[Word('distinct/JJ')]),\n",
" Match(words=[Word('other/JJ')]),\n",
" Match(words=[Word('s/JJ')]),\n",
" Match(words=[Word('other/JJ')]),\n",
" Match(words=[Word('Amazonian/JJ')]),\n",
" Match(words=[Word('doomed/JJ')]),\n",
" Match(words=[Word('Greek/JJ')]),\n",
" Match(words=[Word('repetitive/JJ')]),\n",
" Match(words=[Word('daily/JJ')]),\n",
" Match(words=[Word('huge/JJ')]),\n",
" Match(words=[Word('only/JJ')]),\n",
" Match(words=[Word('same/JJ')]),\n",
" Match(words=[Word('next/JJ')])]"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"search(\"JJ\", tree)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Match(words=[Word('study/VB'), Word('the/DT'), Word('language/NN')]),\n",
" Match(words=[Word('be/VB'), Word('a/DT'), Word('prostitute/NN')]),\n",
" Match(words=[Word('seem/VB'), Word('that/DT'), Word('way/NN')]),\n",
" Match(words=[Word('conduct/VB'), Word('a/DT'), Word('pilot/NN')]),\n",
" Match(words=[Word('let/VB'), Word('the/DT'), Word('stick/NN')]),\n",
" Match(words=[Word('remove/VB'), Word('the/DT'), Word('otherness/NN')]),\n",
" Match(words=[Word('occupy/VB'), Word('a/DT'), Word('part/NN')])]"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"search('VB DT NN', tree)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"study the language\n",
"be a prostitute\n",
"seem that way\n",
"conduct a pilot\n",
"let the stick\n",
"remove the otherness\n",
"occupy a part\n"
]
}
],
"source": [
"# Print only the matched text of every verb + determiner + noun sequence.\n",
"for m in search(\"VB DT NN\", tree):\n",
"    print(m.string)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'occupy a part'"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"m.string"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"My body\n",
"my brain\n",
"My task\n",
"my life\n",
"our species\n",
"our child\n",
"their differences\n",
"my belief\n",
"my encounter\n",
"my own\n",
"my silly beliefs\n",
"my life\n",
"my first day\n",
"his hut\n",
"its tongue\n",
"our mother\n",
"our mother\n",
"our father\n",
"our first experiences\n",
"our values\n",
"our mother and the select\n",
"our subsequent lives\n",
"Our earliest associations\n",
"our narrow range\n",
"our in-group\n",
"my own writings.[1\n",
"our family or our village\n",
"our own identity\n",
"our identity\n",
"our family\n",
"our norm\n",
"our experience\n",
"our expectations\n",
"its occupants\n",
"their beliefs and children\n",
"his face\n",
"his mother\n",
"her toddler\n",
"her child\n",
"his quasi-stabbing\n",
"her child\n",
"her child a sharp knife\n",
"its contribution\n",
"our lives\n",
"their language\n",
"their translations\n",
"their comments\n",
"my request\n",
"their language\n",
"our behavior\n",
"their language\n",
"their culture\n",
"their language\n",
"Our sense\n",
"our enveloping\n",
"our childhood development\n",
"our conversations and the structures\n",
"our interactions\n",
"their phrases\n",
"my favorite book\n",
"his year\n",
"His year\n",
"its institutions\n",
"our senses\n",
"our sense\n",
"his lessons\n",
"our sense\n",
"our own unique identity\n",
"our oneness\n",
"my life\n",
"his own question\n",
"his essay\n",
"our behavior\n",
"His example\n",
"his case\n",
"our lives\n",
"our lives\n",
"our expectations\n",
"our environments\n",
"our own\n",
"our Homo ancestors\n",
"our environment\n",
"their language\n",
"their culture and language\n",
"our familiar environment\n",
"our world\n",
"Our preference\n",
"our fear then itself\n",
"Our languages and cognitive abilities\n",
"their relationships\n",
"our own\n",
"our species ability\n",
"our human\n",
"our greatest fears\n",
"our greatest treasure\n",
"his efforts\n"
]
}
],
"source": [
"# Print each possessive pronoun (PRP$) together with the words matched by `*`.\n",
"for m in search(\"PRP$ *\", tree):\n",
"    print(m.string)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"from pattern.en import wordnet"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"sense = wordnet.synsets(\"language\")[0]"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Synset('communication.n.02')"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sense.hypernym"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"matching neighbor\n",
"matching friend\n",
"matching child\n",
"matching woman\n",
"matching prostitute\n",
"matching man\n",
"matching man\n",
"matching guest\n",
"matching host\n",
"matching mother\n",
"matching mother\n",
"matching father\n",
"matching mother\n",
"matching professor\n",
"matching cowboy\n",
"matching psychologist\n",
"matching pilot\n",
"matching toddler\n",
"matching mother\n",
"matching mother\n",
"matching toddler\n",
"matching woman\n",
"matching baby\n",
"matching child\n",
"matching mother\n",
"matching baby\n",
"matching mother\n",
"matching child\n",
"matching mother\n",
"matching child\n",
"matching child\n",
"matching child\n",
"matching speaker\n",
"matching speaker\n",
"matching foreigner\n",
"matching tourist\n",
"matching friend\n",
"matching man\n",
"matching foreigner\n",
"matching handyman\n",
"matching stranger\n",
"matching Homo\n",
"matching Homo\n",
"matching Homo\n"
]
}
],
"source": [
"# Collect every word in the text whose first WordNet sense has one of the\n",
"# '|'-separated search terms among its recursive hypernyms.\n",
"output = []\n",
"search_word = \"person\"\n",
"for term in search_word.split(\"|\"):\n",
"    term_synsets = wordnet.synsets(term)\n",
"    if not term_synsets:\n",
"        # Unknown to WordNet: report it rather than failing with IndexError.\n",
"        print(f\"no WordNet entry for {term}\")\n",
"        continue\n",
"    # Only words tagged with the term's own part of speech are candidates.\n",
"    pos = term_synsets[0].pos\n",
"    for match in search(pos, tree):\n",
"        word = match[0].string\n",
"        synsets = wordnet.synsets(word)\n",
"        if not synsets:\n",
"            continue\n",
"        # Walk up from the word's first sense and look for the search term.\n",
"        hypernyms = synsets[0].hypernyms(recursive=True)\n",
"        if any(term == h.senses[0] for h in hypernyms):\n",
"            print(f\"matching {word}\")\n",
"            output.append(word)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['phrase']"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"output"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}