From 7691a59fa9239a7d2ac6b4da6eb480a7a4d81719 Mon Sep 17 00:00:00 2001
From: Francesco Luzzana
Date: Mon, 23 May 2022 16:12:20 +0200
Subject: [PATCH] tutorials 1 and 2

---
 requirements.txt                            |  11 +
 tutorials/gensim_core.ipynb                 | 346 +++++++++++++++
 tutorials/gensim_core.py                    |  29 --
 tutorials/gensim_corpora_vector_space.ipynb | 447 ++++++++++++++++++++
 4 files changed, 804 insertions(+), 29 deletions(-)
 create mode 100644 requirements.txt
 create mode 100644 tutorials/gensim_core.ipynb
 delete mode 100644 tutorials/gensim_core.py
 create mode 100644 tutorials/gensim_corpora_vector_space.ipynb

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..1ee2fc8
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,11 @@
+click==8.1.3
+colorama==0.4.4
+Cython==0.29.28
+gensim==4.2.0
+joblib==1.1.0
+nltk==3.7
+numpy==1.22.4
+regex==2022.4.24
+scipy==1.8.1
+smart-open==6.0.0
+tqdm==4.64.0
diff --git a/tutorials/gensim_core.ipynb b/tutorials/gensim_core.ipynb
new file mode 100644
index 0000000..c3da2fb
--- /dev/null
+++ b/tutorials/gensim_core.ipynb
@@ -0,0 +1,346 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "[Gensim tutorial 01](https://radimrehurek.com/gensim/auto_examples/core/run_core_concepts.html#sphx-glr-auto-examples-core-run-core-concepts-py)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pprint"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "- Document: some text.\n",
+ "- Corpus: a collection of documents.\n",
+ "- Vector: a mathematically convenient representation of a document.\n",
+ "- Model: an algorithm for transforming vectors from one representation to another.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "document = 'Lorem ipsum dolor sit amet eheh 123 gelato'\n",
+ "\n",
+ "text_corpus = [\n",
+ " \"Human machine interface for lab abc computer applications\",\n",
+ " \"A survey of user opinion of computer system response time\",\n",
+ " \"The EPS user interface management system\",\n",
+ " \"System and human system engineering testing of EPS\",\n",
+ " \"Relation of user perceived response time to error measurement\",\n",
+ " \"The generation of random binary unordered trees\",\n",
+ " \"The intersection graph of paths in trees\",\n",
+ " \"Graph minors IV Widths of trees and well quasi ordering\",\n",
+ " \"Graph minors A survey\",\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Cleaning the corpus\n",
+ "\n",
+ "# Create a set of frequent words to filter out\n",
+ "stoplist = set('for a of the and to in'.split(' '))\n",
+ "\n",
+ "# Lowercase each document, split it by white space and filter out stopwords\n",
+ "texts = [[word for word in document.lower().split() if word not in stoplist] for document in text_corpus]\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "the list comprehension above looks dense, but it reads like:\n",
+ "- for every document in the list text_corpus:\n",
+ "- build a list of words by splitting the document on whitespace\n",
+ "- and keep each word only if it's not in the stoplist\n",
+ "\n",
+ "so the result is a list of lists of words, one per document"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[['human', 'interface', 'computer'],\n",
+ " ['survey', 'user', 'computer', 'system', 'response', 'time'],\n",
+ " ['eps', 'user', 'interface', 'system'],\n",
+ " ['system', 'human', 'system', 'eps'],\n",
+ " ['user', 'response', 'time'],\n",
+ " ['trees'],\n",
+ " ['graph', 'trees'],\n",
+ " ['graph', 'minors', 'trees'],\n",
+ " ['graph', 'minors', 'survey']]\n"
+ ]
+ }
+ ],
+ "source": [
+ "\n",
+ "# Count word frequencies\n",
+ "\n",
+ "# we are using defaultdict instead of a normal dictionary\n",
+ "# because it returns a default value instead of raising an error when a key is missing\n",
+ "from collections import defaultdict\n",
+ "\n",
+ "frequency = defaultdict(int)\n",
+ "for text in texts:\n",
+ " for token in text:\n",
+ " frequency[token] += 1\n",
+ "\n",
+ "# Only keep words that appear more than once\n",
+ "\n",
+ "processed_corpus = [[token for token in text if frequency[token] > 1] for text in texts]\n",
+ "pprint.pprint(processed_corpus)\n",
+ "\n"
+ ]
+ },
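+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The stopword pass and the frequency filter can be bundled into a single helper. A minimal sketch (the `preprocess` function is our own, not part of gensim):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from collections import defaultdict\n",
+ "\n",
+ "def preprocess(docs, stoplist, min_freq=2):\n",
+ "    # lowercase, split on whitespace, drop stopwords\n",
+ "    texts = [[word for word in doc.lower().split() if word not in stoplist] for doc in docs]\n",
+ "    # count token frequencies across the whole corpus\n",
+ "    freq = defaultdict(int)\n",
+ "    for text in texts:\n",
+ "        for token in text:\n",
+ "            freq[token] += 1\n",
+ "    # keep only tokens that reach the frequency threshold\n",
+ "    return [[token for token in text if freq[token] >= min_freq] for text in texts]\n",
+ "\n",
+ "# should reproduce the corpus we just built by hand\n",
+ "assert preprocess(text_corpus, stoplist) == processed_corpus"
+ ]
+ },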
" ['survey', 'user', 'computer', 'system', 'response', 'time'],\n", + " ['eps', 'user', 'interface', 'system'],\n", + " ['system', 'human', 'system', 'eps'],\n", + " ['user', 'response', 'time'],\n", + " ['trees'],\n", + " ['graph', 'trees'],\n", + " ['graph', 'minors', 'trees'],\n", + " ['graph', 'minors', 'survey']]\n" + ] + } + ], + "source": [ + "\n", + "# Count word frequencies\n", + "\n", + "# we are using defaultdict instead of a normal dictionary \n", + "# bc with this you can return a default value instead of an error if the key is missing in the dictionary\n", + "from collections import defaultdict\n", + "\n", + "frequency = defaultdict(int)\n", + "for text in texts:\n", + " for token in text:\n", + " frequency[token] += 1\n", + "\n", + "# Only keep words that appear more than once\n", + "\n", + "processed_corpus = [[token for token in text if frequency[token]>1] for text in texts]\n", + "pprint.pprint(processed_corpus)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...>\n" + ] + } + ], + "source": [ + "# to associate each word with an unique integer ID we use the dictionary class provided by gensim. This dictionary defines the vocabulary of all words that our processing knows about.\n", + "\n", + "from gensim import corpora\n", + "\n", + "dictionary = corpora.Dictionary(processed_corpus)\n", + "print(dictionary)\n", + "# Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...>\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'computer': 0,\n", + " 'eps': 8,\n", + " 'graph': 10,\n", + " 'human': 1,\n", + " 'interface': 2,\n", + " 'minors': 11,\n", + " 'response': 3,\n", + " 'survey': 4,\n", + " 'system': 5,\n", + " 'time': 6,\n", + " 'trees': 9,\n", + " 'user': 7}\n" + ] + } + ], + "source": [ + "# print the id for each word\n", + "pprint.pprint(dictionary.token2id)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[(0, 1), (1, 1)]\n" + ] + } + ], + "source": [ + "# create a bag of word for a new document based on our corpus\n", + "new_doc = \"Human computer interaction\"\n", + "new_vec = dictionary.doc2bow(new_doc.lower().split())\n", + "print(new_vec)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The first entry in each tuple corresponds to the ID of the token in the dictionary, the second corresponds to the count of this token." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[(0, 1), (1, 1), (2, 1)],\n", + " [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],\n", + " [(2, 1), (5, 1), (7, 1), (8, 1)],\n", + " [(1, 1), (5, 2), (8, 1)],\n", + " [(3, 1), (6, 1), (7, 1)],\n", + " [(9, 1)],\n", + " [(9, 1), (10, 1)],\n", + " [(9, 1), (10, 1), (11, 1)],\n", + " [(4, 1), (10, 1), (11, 1)]]\n" + ] + } + ], + "source": [ + "bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]\n", + "pprint.pprint(bow_corpus)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we can use models aka way to represent documents. 
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "One simple example of a model is `tf-idf`. The tf-idf model transforms vectors from the bag-of-words representation to a vector space where the frequency counts are weighted according to the relative rarity of each word in the corpus."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[(5, 0.5898341626740045), (11, 0.8075244024440723)]\n"
+ ]
+ }
+ ],
+ "source": [
+ "from gensim import models\n",
+ "\n",
+ "# train the model\n",
+ "tfidf = models.TfidfModel(bow_corpus)\n",
+ "\n",
+ "# transform the 'system minors' string\n",
+ "words = \"system minors\".lower().split()\n",
+ "print(tfidf[dictionary.doc2bow(words)])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We can save models and load them back later, either to continue training or to transform new documents, so training doesn't have to happen in one sitting.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[(0, 0.0), (1, 0.32448703), (2, 0.41707572), (3, 0.7184812), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 0.0)]\n"
+ ]
+ }
+ ],
+ "source": [
+ "from gensim import similarities\n",
+ "\n",
+ "index = similarities.SparseMatrixSimilarity(tfidf[bow_corpus], num_features=12)\n",
+ "\n",
+ "query_document = 'system engineering'.lower().split()\n",
+ "query_bow = dictionary.doc2bow(query_document)\n",
+ "sims = index[tfidf[query_bow]]\n",
+ "print(list(enumerate(sims)))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "3 0.7184812\n",
+ "2 0.41707572\n",
+ "1 0.32448703\n",
+ "0 0.0\n",
+ "4 0.0\n",
+ "5 0.0\n",
+ "6 0.0\n",
+ "7 0.0\n",
+ "8 0.0\n"
+ ]
+ }
+ ],
+ "source": [
+ "# sort the similarities by score, highest first\n",
+ "\n",
+ "for document_number, score in sorted(enumerate(sims), key=lambda x: x[1], reverse=True):\n",
+ " print(document_number, score)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "interpreter": {
+ "hash": "a991b7e5a58af45663279ce1606e861d35361e78ec04a120e3cc987f7e474d97"
+ },
+ "kernelspec": {
+ "display_name": "Python 3.10.2 ('venv': venv)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.2"
+ },
+ "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/tutorials/gensim_core.py b/tutorials/gensim_core.py
deleted file mode 100644
index 428365a..0000000
--- a/tutorials/gensim_core.py
+++ /dev/null
@@ -1,29 +0,0 @@
-# https://radimrehurek.com/gensim/auto_examples/core/run_core_concepts.html#sphx-glr-auto-examples-core-run-core-concepts-py
-
-import pprint
-
-# Document: some text.
-# Corpus: a collection of documents.
-# Vector: a mathematically convenient representation of a document.
-# Model: an algorithm for transforming vectors from one representation to another.
- -document = 'Lorem ipsum dolor sit amet eheh 123 gelato' - -text_corpus = [ - "Human machine interface for lab abc computer applications", - "A survey of user opinion of computer system response time", - "The EPS user interface management system", - "System and human system engineering testing of EPS", - "Relation of user perceived response time to error measurement", - "The generation of random binary unordered trees", - "The intersection graph of paths in trees", - "Graph minors IV Widths of trees and well quasi ordering", - "Graph minors A survey", -] - -# Cleaning the corpus - -# Create a set of frequent words -stoplist = set('for a of the and to in'.split(' ')) - -# Lowercase each document, split it by white space and filter out stopwords diff --git a/tutorials/gensim_corpora_vector_space.ipynb b/tutorials/gensim_corpora_vector_space.ipynb new file mode 100644 index 0000000..06193ec --- /dev/null +++ b/tutorials/gensim_corpora_vector_space.ipynb @@ -0,0 +1,447 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Corpora and Vector Spaces" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[Corpora and Vector Space tutorial](https://radimrehurek.com/gensim/auto_examples/core/run_corpora_and_vector_spaces.html)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# start from documents as strings\n", + "\n", + "documents = [\n", + " \"Human machine interface for lab abc computer applications\",\n", + " \"A survey of user opinion of computer system response time\",\n", + " \"The EPS user interface management system\",\n", + " \"System and human system engineering testing of EPS\",\n", + " \"Relation of user perceived response time to error measurement\",\n", + " \"The generation of random binary unordered trees\",\n", + " \"The intersection graph of paths in trees\",\n", + " \"Graph minors IV Widths of trees and well quasi ordering\",\n", + " \"Graph minors A survey\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[['human', 'interface', 'computer'],\n", + " ['survey', 'user', 'computer', 'system', 'response', 'time'],\n", + " ['eps', 'user', 'interface', 'system'],\n", + " ['system', 'human', 'system', 'eps'],\n", + " ['user', 'response', 'time'],\n", + " ['trees'],\n", + " ['graph', 'trees'],\n", + " ['graph', 'minors', 'trees'],\n", + " ['graph', 'minors', 'survey']]\n" + ] + } + ], + "source": [ + "# tokenize the documents, remove common words using the stoplist as well as words that only appear once\n", + "\n", + "from pprint import pprint\n", + "from collections import defaultdict\n", + "\n", + "# remove common words and tokenize\n", + "\n", + "stoplist = set('for a of the and to in'.split())\n", + "texts = [\n", + " [word for word in document.lower().split() if word not in stoplist]\n", + " for document in documents\n", + "]\n", + "\n", + "# remove words that appear only once\n", + "\n", + "frequency = defaultdict(int)\n", + "for text in texts:\n", + " for token in text:\n", + " frequency[token] += 1\n", + "\n", + "texts = [\n", + " [token for token in text if frequency[token] > 1]\n", + " for text in texts\n", + 
"]\n", + "\n", + "pprint(texts)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To convert documents to vectors, we’ll use a document representation called bag-of-words. " + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-05-23 15:36:12,400 : INFO : adding document #0 to Dictionary<0 unique tokens: []>\n", + "2022-05-23 15:36:12,400 : INFO : built Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...> from 9 documents (total 29 corpus positions)\n", + "2022-05-23 15:36:12,401 : INFO : Dictionary lifecycle event {'msg': \"built Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...> from 9 documents (total 29 corpus positions)\", 'datetime': '2022-05-23T15:36:12.401796', 'gensim': '4.2.0', 'python': '3.10.2 (tags/v3.10.2:a58ebcc, Jan 17 2022, 14:12:15) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'created'}\n", + "2022-05-23 15:36:12,401 : INFO : Dictionary lifecycle event {'fname_or_handle': '/tmp/deerwester.dict', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2022-05-23T15:36:12.401796', 'gensim': '4.2.0', 'python': '3.10.2 (tags/v3.10.2:a58ebcc, Jan 17 2022, 14:12:15) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'saving'}\n", + "2022-05-23 15:36:12,402 : INFO : saved /tmp/deerwester.dict\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...>\n" + ] + } + ], + "source": [ + "from gensim import corpora\n", + "dictionary = corpora.Dictionary(texts)\n", + "dictionary.save('/tmp/deerwester.dict') # store the dictionary for future refefence\n", + "print(dictionary)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-05-23 15:36:12,851 : INFO : storing corpus in Matrix Market format to /tmp/deerwester.mm\n", + "2022-05-23 15:36:12,853 : INFO : saving sparse matrix to /tmp/deerwester.mm\n", + "2022-05-23 15:36:12,853 : INFO : PROGRESS: saving document #0\n", + "2022-05-23 15:36:12,854 : INFO : saved 9x12 matrix, density=25.926% (28/108)\n", + "2022-05-23 15:36:12,855 : INFO : saving MmCorpus index to /tmp/deerwester.mm.index\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[(0, 1), (1, 1), (2, 1)], [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(2, 1), (5, 1), (7, 1), (8, 1)], [(1, 1), (5, 2), (8, 1)], [(3, 1), (6, 1), (7, 1)], [(9, 1)], [(9, 1), (10, 1)], [(9, 1), (10, 1), (11, 1)], [(4, 1), (10, 1), (11, 1)]]\n" + ] + } + ], + "source": [ + "corpus = [dictionary.doc2bow(text) for text in texts]\n", + "corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus) # store to disk, for later use\n", + "print(corpus)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Corpus Streaming - One Document at a Time" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is usefull for working with big corpus, since they are not loaded entirely in memory at once. 
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "With smart_open, documents can instead be streamed one at a time, even from a remote file."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from smart_open import open # for transparently opening remote files\n",
+ "\n",
+ "class MyCorpus:\n",
+ " def __iter__(self):\n",
+ " for line in open('https://radimrehurek.com/mycorpus.txt'):\n",
+ " # assume there's one document per line, tokens separated by whitespace\n",
+ " yield dictionary.doc2bow(line.lower().split())\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "About this yield statement:\n",
+ "https://stackoverflow.com/a/231855"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "About this MyCorpus class:\n",
+ "\n",
+ "_The assumption that each document occupies one line in a single file is not important; you can mold the \\_\\_iter\\_\\_ function to fit your input format, whatever it is. Walking directories, parsing XML, accessing the network… Just parse your input to retrieve a clean list of tokens in each document, then convert the tokens via a dictionary to their ids and yield the resulting sparse vector inside \\_\\_iter\\_\\_._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "<__main__.MyCorpus object at 0x000002362CFCA530>\n"
+ ]
+ }
+ ],
+ "source": [
+ "corpus_memory_friendly = MyCorpus() # doesn't load the corpus into memory!\n",
+ "print(corpus_memory_friendly)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[(0, 1), (1, 1), (2, 1)]\n",
+ "[(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]\n",
+ "[(2, 1), (5, 1), (7, 1), (8, 1)]\n",
+ "[(1, 1), (5, 2), (8, 1)]\n",
+ "[(3, 1), (6, 1), (7, 1)]\n",
+ "[(9, 1)]\n",
+ "[(9, 1), (10, 1)]\n",
+ "[(9, 1), (10, 1), (11, 1)]\n",
+ "[(4, 1), (10, 1), (11, 1)]\n"
+ ]
+ }
+ ],
+ "source": [
+ "for vector in corpus_memory_friendly: # load one vector into memory at a time\n",
+ " print(vector)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Although the output is the same as for the plain Python list, the corpus is now much more memory friendly, because at most one vector resides in RAM at a time. Your corpus can now be as large as you want."
+ ]
+ },
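+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The same pattern works for any source. As a sketch (assuming a local mycorpus.txt with one document per line), a variant of MyCorpus that reads from disk and skips blank lines:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "class LocalCorpus:\n",
+ "    # hypothetical variant of MyCorpus: reads a local file instead of a URL\n",
+ "    def __init__(self, path):\n",
+ "        self.path = path\n",
+ "\n",
+ "    def __iter__(self):\n",
+ "        with open(self.path) as fh:  # smart_open's open handles local paths too\n",
+ "            for line in fh:\n",
+ "                tokens = line.lower().split()\n",
+ "                if tokens:  # skip blank lines\n",
+ "                    yield dictionary.doc2bow(tokens)\n",
+ "\n",
+ "# usage: for vector in LocalCorpus('./mycorpus.txt'): print(vector)"
+ ]
+ },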
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2022-05-23 15:42:28,778 : INFO : adding document #0 to Dictionary<0 unique tokens: []>\n",
+ "2022-05-23 15:42:28,779 : INFO : built Dictionary<42 unique tokens: ['abc', 'applications', 'computer', 'for', 'human']...> from 9 documents (total 69 corpus positions)\n",
+ "2022-05-23 15:42:28,779 : INFO : Dictionary lifecycle event {'msg': \"built Dictionary<42 unique tokens: ['abc', 'applications', 'computer', 'for', 'human']...> from 9 documents (total 69 corpus positions)\", 'datetime': '2022-05-23T15:42:28.779876', 'gensim': '4.2.0', 'python': '3.10.2 (tags/v3.10.2:a58ebcc, Jan 17 2022, 14:12:15) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'created'}\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...>\n"
+ ]
+ }
+ ],
+ "source": [
+ "# We can also construct the dictionary without loading all texts into memory\n",
+ "\n",
+ "# collect statistics about all tokens\n",
+ "\n",
+ "dictionary = corpora.Dictionary(line.lower().split() for line in open('https://radimrehurek.com/mycorpus.txt'))\n",
+ "\n",
+ "\n",
+ "\n",
+ "stop_ids = [\n",
+ " dictionary.token2id[stopword]\n",
+ " for stopword in stoplist\n",
+ " if stopword in dictionary.token2id\n",
+ "]\n",
+ "\n",
+ "once_ids = [\n",
+ " tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1\n",
+ "]\n",
+ "\n",
+ "dictionary.filter_tokens(stop_ids + once_ids) # remove stopwords and words that appear only once\n",
+ "dictionary.compactify() # remove gaps in id sequence after words that were removed\n",
+ "print(dictionary)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Corpus Formats"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "There exist several file formats for serializing a Vector Space corpus (~sequence of vectors) to disk. Gensim implements them via the streaming corpus interface mentioned earlier: documents are read from (resp. stored to) disk in a lazy fashion, one document at a time, without the whole corpus being read into main memory at once.\n",
+ "\n",
+ "One of the more notable file formats is the Matrix Market format, a plain-text format that stores the corpus as sparse (document, term, value) triplets."
+ ]
+ },
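+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Because the format is plain text, we can peek at the file serialized earlier (a small sketch of ours, reusing the /tmp/deerwester.mm written above):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# print the raw Matrix Market file written by MmCorpus.serialize above;\n",
+ "# expect a header line plus one (document, term, value) entry per line\n",
+ "with open('/tmp/deerwester.mm') as fh:\n",
+ "    print(fh.read())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Below, a tiny corpus is saved in Matrix Market format and loaded back lazily. 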
" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-05-23 15:50:56,704 : INFO : storing corpus in Matrix Market format to /tmp/corpus.mm\n", + "2022-05-23 15:50:56,705 : INFO : saving sparse matrix to /tmp/corpus.mm\n", + "2022-05-23 15:50:56,706 : INFO : PROGRESS: saving document #0\n", + "2022-05-23 15:50:56,707 : INFO : saved 2x2 matrix, density=25.000% (1/4)\n", + "2022-05-23 15:50:56,708 : INFO : saving MmCorpus index to /tmp/corpus.mm.index\n", + "2022-05-23 15:50:56,711 : INFO : loaded corpus index from /tmp/corpus.mm.index\n", + "2022-05-23 15:50:56,711 : INFO : initializing cython corpus reader from /tmp/corpus.mm\n", + "2022-05-23 15:50:56,714 : INFO : accepted corpus with 2 documents, 2 features, 1 non-zero entries\n" + ] + } + ], + "source": [ + "corpus = [[(1, 0.5)], []] # two documents (one is empty!)\n", + "\n", + "\n", + "# To save\n", + "corpora.MmCorpus.serialize('/tmp/corpus.mm', corpus)\n", + "\n", + "# To load\n", + "corpus = corpora.MmCorpus('/tmp/corpus.mm')" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MmCorpus(2 documents, 2 features, 1 non-zero entries)\n" + ] + } + ], + "source": [ + "# Corpus objects are streams, so typically you won’t be able to print them directly:\n", + "print(corpus)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[(1, 0.5)], []]\n" + ] + } + ], + "source": [ + "# one way of printing a corpus: load it entirely into memory\n", + "\n", + "print(list(corpus)) # calling list() will convert any sequence to a plain Python list" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[(1, 0.5)]\n", + "[]\n" + ] + } + ], + "source": [ + "# another way of doing it: print one document at a time, making use of the streaming interface\n", + "# (more memory friendly)\n", + "for doc in corpus:\n", + " print(doc)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "interpreter": { + "hash": "a991b7e5a58af45663279ce1606e861d35361e78ec04a120e3cc987f7e474d97" + }, + "kernelspec": { + "display_name": "Python 3.10.2 ('venv': venv)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.2" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +}