tutorials 1 and 2
parent 249c976532
commit 7691a59fa9
@@ -0,0 +1,11 @@
click==8.1.3
colorama==0.4.4
Cython==0.29.28
gensim==4.2.0
joblib==1.1.0
nltk==3.7
numpy==1.22.4
regex==2022.4.24
scipy==1.8.1
smart-open==6.0.0
tqdm==4.64.0
@@ -0,0 +1,346 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"[Gensim tutorial 01](https://radimrehurek.com/gensim/auto_examples/core/run_core_concepts.html#sphx-glr-auto-examples-core-run-core-concepts-py)\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"import pprint"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- Document: some text.\n",
"- Corpus: a collection of documents.\n",
"- Vector: a mathematically convenient representation of a document.\n",
"- Model: an algorithm for transforming vectors from one representation to another.\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"document = 'Lorem ipsum dolor sit amet eheh 123 gelato'\n",
"\n",
"text_corpus = [\n",
" \"Human machine interface for lab abc computer applications\",\n",
" \"A survey of user opinion of computer system response time\",\n",
" \"The EPS user interface management system\",\n",
" \"System and human system engineering testing of EPS\",\n",
" \"Relation of user perceived response time to error measurement\",\n",
" \"The generation of random binary unordered trees\",\n",
" \"The intersection graph of paths in trees\",\n",
" \"Graph minors IV Widths of trees and well quasi ordering\",\n",
" \"Graph minors A survey\",\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Cleaning the corpus\n",
"\n",
"# Create a set of frequent words\n",
"stoplist = set('for a of the and to in'.split(' '))\n",
"\n",
"# Lowercase each document, split it by white space and filter out stopwords\n",
"texts = [[word for word in document.lower().split() if word not in stoplist] for document in text_corpus]\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The list comprehension above looks dense, but it reads like:\n",
"- for every document in the list text_corpus:\n",
"- create a list of words by splitting the document\n",
"- and keep each word only if it is not in the stoplist\n",
"\n",
"So the result is a list of lists of words, one for each document."
]
},
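{
"cell_type": "markdown",
"metadata": {},
"source": [
"For illustration, here is a sketch of the same cleaning step written with explicit nested loops (the name `texts_loop` is not part of the original tutorial; it is only used to compare against `texts`):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# the same cleaning step as the comprehension above, written as nested for-loops\n",
"texts_loop = []\n",
"for doc in text_corpus:\n",
"    words = []\n",
"    for word in doc.lower().split():\n",
"        if word not in stoplist:\n",
"            words.append(word)\n",
"    texts_loop.append(words)\n",
"\n",
"# both approaches should produce the same list of lists\n",
"print(texts_loop == texts)"
]
},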
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[['human', 'interface', 'computer'],\n",
" ['survey', 'user', 'computer', 'system', 'response', 'time'],\n",
" ['eps', 'user', 'interface', 'system'],\n",
" ['system', 'human', 'system', 'eps'],\n",
" ['user', 'response', 'time'],\n",
" ['trees'],\n",
" ['graph', 'trees'],\n",
" ['graph', 'minors', 'trees'],\n",
" ['graph', 'minors', 'survey']]\n"
]
}
],
"source": [
"# Count word frequencies\n",
"\n",
"# we use defaultdict instead of a plain dict because it returns\n",
"# a default value (here 0) instead of raising a KeyError when a key is missing\n",
"from collections import defaultdict\n",
"\n",
"frequency = defaultdict(int)\n",
"for text in texts:\n",
"    for token in text:\n",
"        frequency[token] += 1\n",
"\n",
"# Only keep words that appear more than once\n",
"processed_corpus = [[token for token in text if frequency[token] > 1] for text in texts]\n",
"pprint.pprint(processed_corpus)"
]
},
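{
"cell_type": "markdown",
"metadata": {},
"source": [
"A tiny illustration of that point (not part of the upstream tutorial): a `defaultdict(int)` yields 0 for keys it has never seen, which is what makes the `+= 1` pattern above work without any membership checks."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from collections import defaultdict\n",
"\n",
"demo = defaultdict(int)\n",
"demo['seen'] += 1\n",
"print(demo['seen'])     # 1\n",
"print(demo['missing'])  # 0, where a plain dict would raise KeyError"
]
},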
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...>\n"
]
}
],
"source": [
"# To associate each word with a unique integer ID we use the Dictionary class provided by gensim.\n",
"# This dictionary defines the vocabulary of all words that our processing knows about.\n",
"\n",
"from gensim import corpora\n",
"\n",
"dictionary = corpora.Dictionary(processed_corpus)\n",
"print(dictionary)\n",
"# Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...>\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'computer': 0,\n",
" 'eps': 8,\n",
" 'graph': 10,\n",
" 'human': 1,\n",
" 'interface': 2,\n",
" 'minors': 11,\n",
" 'response': 3,\n",
" 'survey': 4,\n",
" 'system': 5,\n",
" 'time': 6,\n",
" 'trees': 9,\n",
" 'user': 7}\n"
]
}
],
"source": [
"# print the id for each word\n",
"pprint.pprint(dictionary.token2id)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(0, 1), (1, 1)]\n"
]
}
],
"source": [
"# create a bag-of-words vector for a new document based on our dictionary\n",
"new_doc = \"Human computer interaction\"\n",
"new_vec = dictionary.doc2bow(new_doc.lower().split())\n",
"print(new_vec)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The first entry in each tuple is the ID of the token in the dictionary, the second is the count of that token. Note that \"interaction\" does not appear in the dictionary built from the processed corpus, so it is simply left out of the vector."
]
},
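{
"cell_type": "markdown",
"metadata": {},
"source": [
"To make the vector easier to read, the ids can be mapped back to their tokens (a small illustrative snippet, not part of the upstream tutorial):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# translate (token_id, count) pairs back into (token, count) pairs\n",
"print([(dictionary[token_id], count) for token_id, count in new_vec])"
]
},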
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[(0, 1), (1, 1), (2, 1)],\n",
" [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],\n",
" [(2, 1), (5, 1), (7, 1), (8, 1)],\n",
" [(1, 1), (5, 2), (8, 1)],\n",
" [(3, 1), (6, 1), (7, 1)],\n",
" [(9, 1)],\n",
" [(9, 1), (10, 1)],\n",
" [(9, 1), (10, 1), (11, 1)],\n",
" [(4, 1), (10, 1), (11, 1)]]\n"
]
}
],
"source": [
"bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]\n",
"pprint.pprint(bow_corpus)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now we can use models, i.e. ways of transforming one document representation into another. One simple example is tf-idf. The tf-idf model transforms vectors from the bag-of-words representation to a vector space where the frequency counts are weighted according to the relative rarity of each word in the corpus (a manual check of the resulting weights follows the code below)."
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(5, 0.5898341626740045), (11, 0.8075244024440723)]\n"
]
}
],
"source": [
"from gensim import models\n",
"\n",
"# train the model\n",
"tfidf = models.TfidfModel(bow_corpus)\n",
"\n",
"# transform the 'system minors' string\n",
"words = \"system minors\".lower().split()\n",
"print(tfidf[dictionary.doc2bow(words)])"
]
},
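{
"cell_type": "markdown",
"metadata": {},
"source": [
"A manual sanity check of those two weights, assuming gensim's default TfidfModel settings (raw term frequency, idf = log2 of N/df, then L2 normalisation). This cell is only an illustration added to the tutorial; the variable names are arbitrary."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import math\n",
"\n",
"n_docs = len(bow_corpus)  # 9 documents\n",
"\n",
"# document frequencies: in how many documents does each word occur?\n",
"df_system = sum(1 for text in processed_corpus if 'system' in text)\n",
"df_minors = sum(1 for text in processed_corpus if 'minors' in text)\n",
"\n",
"# raw tf-idf weights (the query contains each word once, so tf = 1)\n",
"w_system = math.log2(n_docs / df_system)\n",
"w_minors = math.log2(n_docs / df_minors)\n",
"\n",
"# L2-normalise, which is what gensim does by default\n",
"norm = math.sqrt(w_system ** 2 + w_minors ** 2)\n",
"print(w_system / norm, w_minors / norm)  # should be close to the weights printed above"
]
},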
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can save trained models and load them back later, either to continue training or to transform new documents, so training does not have to happen all at once.\n"
]
},
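{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of persisting and reloading the model (the filename tfidf.model is just an example path, not part of the original tutorial):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# save the trained model to disk and load it back\n",
"tfidf.save('tfidf.model')\n",
"loaded_tfidf = models.TfidfModel.load('tfidf.model')\n",
"\n",
"# the reloaded model transforms bag-of-words vectors just like the original\n",
"print(loaded_tfidf[dictionary.doc2bow(\"system minors\".lower().split())])"
]
},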
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(0, 0.0), (1, 0.32448703), (2, 0.41707572), (3, 0.7184812), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 0.0)]\n"
]
}
],
"source": [
"from gensim import similarities\n",
"\n",
"# build a similarity index over the tf-idf corpus (num_features = number of unique tokens in the dictionary)\n",
"index = similarities.SparseMatrixSimilarity(tfidf[bow_corpus], num_features=12)\n",
"\n",
"# query: similarity of 'system engineering' against every document in the corpus\n",
"query_document = 'system engineering'.lower().split()\n",
"query_bow = dictionary.doc2bow(query_document)\n",
"sims = index[tfidf[query_bow]]\n",
"print(list(enumerate(sims)))"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"3 0.7184812\n",
"2 0.41707572\n",
"1 0.32448703\n",
"0 0.0\n",
"4 0.0\n",
"5 0.0\n",
"6 0.0\n",
"7 0.0\n",
"8 0.0\n"
]
}
],
"source": [
"# sorting the similarities by score\n",
"for document_number, score in sorted(enumerate(sims), key=lambda x: x[1], reverse=True):\n",
"    print(document_number, score)"
]
},
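{
"cell_type": "markdown",
"metadata": {},
"source": [
"To make the ranking easier to read, the same loop can print the original documents next to their scores (a small addition, not in the upstream tutorial):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# print each similarity score next to the original document text\n",
"for document_number, score in sorted(enumerate(sims), key=lambda x: x[1], reverse=True):\n",
"    print(f'{score:.4f}  {text_corpus[document_number]}')"
]
},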
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"interpreter": {
"hash": "a991b7e5a58af45663279ce1606e861d35361e78ec04a120e3cc987f7e474d97"
},
"kernelspec": {
"display_name": "Python 3.10.2 ('venv': venv)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.2"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
@@ -1,29 +0,0 @@
# https://radimrehurek.com/gensim/auto_examples/core/run_core_concepts.html#sphx-glr-auto-examples-core-run-core-concepts-py

import pprint

# Document: some text.
# Corpus: a collection of documents.
# Vector: a mathematically convenient representation of a document.
# Model: an algorithm for transforming vectors from one representation to another.

document = 'Lorem ipsum dolor sit amet eheh 123 gelato'

text_corpus = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

# Cleaning the corpus

# Create a set of frequent words
stoplist = set('for a of the and to in'.split(' '))

# Lowercase each document, split it by white space and filter out stopwords