From d427ad65288557178ae66df62ee1528ac6070d35 Mon Sep 17 00:00:00 2001 From: Francesco Luzzana Date: Mon, 30 May 2022 11:38:08 +0200 Subject: [PATCH] two tutorials --- tutorials/gensim_similarity_queries.ipynb | 226 +++++++++++++++++++ tutorials/gensim_topic_transformations.ipynb | 98 +++++--- 2 files changed, 298 insertions(+), 26 deletions(-) create mode 100644 tutorials/gensim_similarity_queries.ipynb diff --git a/tutorials/gensim_similarity_queries.ipynb b/tutorials/gensim_similarity_queries.ipynb new file mode 100644 index 0000000..8746823 --- /dev/null +++ b/tutorials/gensim_similarity_queries.ipynb @@ -0,0 +1,226 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Similarity Queries\n", + "[GENSIM tutorial](https://radimrehurek.com/gensim/auto_examples/core/run_similarity_queries.html#sphx-glr-auto-examples-core-run-similarity-queries-py)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-05-30 11:17:30,012 : INFO : adding document #0 to Dictionary<0 unique tokens: []>\n", + "2022-05-30 11:17:30,013 : INFO : built Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...> from 9 documents (total 29 corpus positions)\n", + "2022-05-30 11:17:30,014 : INFO : Dictionary lifecycle event {'msg': \"built Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...> from 9 documents (total 29 corpus positions)\", 'datetime': '2022-05-30T11:17:30.014843', 'gensim': '4.2.0', 'python': '3.10.2 (tags/v3.10.2:a58ebcc, Jan 17 2022, 14:12:15) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'created'}\n" + ] + } + ], + "source": [ + "# Creating the corpus\n", + "\n", + "from collections import defaultdict\n", + "from gensim import corpora\n", + "\n", + "documents = [\n", + " \"Human machine interface for lab abc computer applications\",\n", + " \"A survey of user opinion of computer system response time\",\n", + " \"The EPS user interface management system\",\n", + " \"System and human system engineering testing of EPS\",\n", + " \"Relation of user perceived response time to error measurement\",\n", + " \"The generation of random binary unordered trees\",\n", + " \"The intersection graph of paths in trees\",\n", + " \"Graph minors IV Widths of trees and well quasi ordering\",\n", + " \"Graph minors A survey\",\n", + "]\n", + "\n", + "# remove common words and tokenize\n", + "\n", + "stoplist = set('for a of the and to in'.split())\n", + "texts = [\n", + " [word for word in document.lower().split() if word not in stoplist]\n", + " for document in documents\n", + "]\n", + "\n", + "# remove words that appear only once\n", + "frequency = defaultdict(int)\n", + "for text in texts:\n", + " for token in text:\n", + " frequency[token]+=1\n", + "\n", + "texts = [\n", + " [token for token in text if frequency[token]>1]\n", + " for text in texts\n", + "]\n", + "\n", + "dictionary = corpora.Dictionary(texts)\n", + "corpus = [dictionary.doc2bow(text) for text in texts]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-05-30 11:18:04,795 : INFO : using serial LSI version on this node\n", + "2022-05-30 11:18:04,796 : INFO : updating model with new documents\n", + "2022-05-30 11:18:04,797 : INFO : preparing a new chunk of documents\n", + "2022-05-30 11:18:04,798 : INFO : using 100 extra samples and 2 power iterations\n", + "2022-05-30 11:18:04,798 : INFO : 1st phase: constructing (12, 102) action matrix\n", + "2022-05-30 11:18:04,800 : INFO : orthonormalizing (12, 102) action matrix\n", + "2022-05-30 11:18:04,803 : INFO : 2nd phase: running dense svd on (12, 9) matrix\n", + "2022-05-30 11:18:04,803 : INFO : computing the final decomposition\n", + "2022-05-30 11:18:04,804 : INFO : keeping 2 factors (discarding 43.156% of energy spectrum)\n", + "2022-05-30 11:18:04,804 : INFO : processed documents up to #9\n", + "2022-05-30 11:18:04,805 : INFO : topic #0(3.341): -0.644*\"system\" + -0.404*\"user\" + -0.301*\"eps\" + -0.265*\"response\" + -0.265*\"time\" + -0.240*\"computer\" + -0.221*\"human\" + -0.206*\"survey\" + -0.198*\"interface\" + -0.036*\"graph\"\n", + "2022-05-30 11:18:04,806 : INFO : topic #1(2.542): 0.623*\"graph\" + 0.490*\"trees\" + 0.451*\"minors\" + 0.274*\"survey\" + -0.167*\"system\" + -0.141*\"eps\" + -0.113*\"human\" + 0.107*\"response\" + 0.107*\"time\" + -0.072*\"interface\"\n", + "2022-05-30 11:18:04,806 : INFO : LsiModel lifecycle event {'msg': 'trained LsiModel in 0.01s', 'datetime': '2022-05-30T11:18:04.806885', 'gensim': '4.2.0', 'python': '3.10.2 (tags/v3.10.2:a58ebcc, Jan 17 2022, 14:12:15) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'created'}\n" + ] + } + ], + "source": [ + "from gensim import models\n", + "lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[(0, -0.461821004532716), (1, -0.07002766527900031)]\n" + ] + } + ], + "source": [ + "# Prepare the query\n", + "\n", + "doc = \"Human computer interaction\"\n", + "vec_bow = dictionary.doc2bow(doc.lower().split())\n", + "vec_lsi = lsi[vec_bow] # convert the query to LSI space\n", + "print(vec_lsi)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-05-30 11:33:41,625 : WARNING : scanning corpus to determine the number of features (consider setting `num_features` explicitly)\n", + "2022-05-30 11:33:41,626 : INFO : creating matrix with 9 documents and 2 features\n" + ] + } + ], + "source": [ + "from gensim import similarities\n", + "index = similarities.MatrixSimilarity(lsi[corpus])" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[(0, 0.998093), (1, 0.93748635), (2, 0.9984453), (3, 0.9865886), (4, 0.90755945), (5, -0.12416792), (6, -0.10639259), (7, -0.09879464), (8, 0.050041765)]\n" + ] + } + ], + "source": [ + "sims = index[vec_lsi]\n", + "print(list(enumerate(sims)))" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.9984453 The EPS user interface management system\n", + "0.998093 Human machine interface for lab abc computer applications\n", + "0.9865886 System and human system engineering testing of EPS\n", + "0.93748635 A survey of user opinion of computer system response time\n", + "0.90755945 Relation of user perceived response time to error measurement\n", + "0.050041765 Graph minors A survey\n", + "-0.09879464 Graph minors IV Widths of trees and well quasi ordering\n", + "-0.10639259 The intersection graph of paths in trees\n", + "-0.12416792 The generation of random binary unordered trees\n" + ] + } + ], + "source": [ + "sims = sorted(enumerate(sims), key=lambda item: -item[1])\n", + "for doc_position, doc_score in sims:\n", + " print(doc_score, documents[doc_position])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "interpreter": { + "hash": "a991b7e5a58af45663279ce1606e861d35361e78ec04a120e3cc987f7e474d97" + }, + "kernelspec": { + "display_name": "Python 3.10.2 ('venv': venv)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.2" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tutorials/gensim_topic_transformations.ipynb b/tutorials/gensim_topic_transformations.ipynb index 6140d4b..e367c3b 100644 --- a/tutorials/gensim_topic_transformations.ipynb +++ b/tutorials/gensim_topic_transformations.ipynb @@ -1,5 +1,14 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Topics and Transformations\n", + "\n", + "[GENSIM tutorial](https://radimrehurek.com/gensim/auto_examples/core/run_topics_and_transformations.html)" + ] + }, { "cell_type": "code", "execution_count": 1, @@ -19,9 +28,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-05-23 18:15:06,938 : INFO : adding document #0 to Dictionary<0 unique tokens: []>\n", - "2022-05-23 18:15:06,939 : INFO : built Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...> from 9 documents (total 29 corpus positions)\n", - "2022-05-23 18:15:06,939 : INFO : Dictionary lifecycle event {'msg': \"built Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...> from 9 documents (total 29 corpus positions)\", 'datetime': '2022-05-23T18:15:06.939467', 'gensim': '4.2.0', 'python': '3.10.2 (tags/v3.10.2:a58ebcc, Jan 17 2022, 14:12:15) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'created'}\n" + "2022-05-30 10:36:44,407 : INFO : adding document #0 to Dictionary<0 unique tokens: []>\n", + "2022-05-30 10:36:44,407 : INFO : built Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...> from 9 documents (total 29 corpus positions)\n", + "2022-05-30 10:36:44,407 : INFO : Dictionary lifecycle event {'msg': \"built Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...> from 9 documents (total 29 corpus positions)\", 'datetime': '2022-05-30T10:36:44.407538', 'gensim': '4.2.0', 'python': '3.10.2 (tags/v3.10.2:a58ebcc, Jan 17 2022, 14:12:15) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'created'}\n" ] } ], @@ -72,9 +81,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-05-23 18:16:10,608 : INFO : collecting document frequencies\n", - "2022-05-23 18:16:10,609 : INFO : PROGRESS: processing document #0\n", - "2022-05-23 18:16:10,609 : INFO : TfidfModel lifecycle event {'msg': 'calculated IDF weights for 9 documents and 12 features (28 matrix non-zeros)', 'datetime': '2022-05-23T18:16:10.609938', 'gensim': '4.2.0', 'python': '3.10.2 (tags/v3.10.2:a58ebcc, Jan 17 2022, 14:12:15) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'initialize'}\n" + "2022-05-30 10:36:46,911 : INFO : collecting document frequencies\n", + "2022-05-30 10:36:46,911 : INFO : PROGRESS: processing document #0\n", + "2022-05-30 10:36:46,915 : INFO : TfidfModel lifecycle event {'msg': 'calculated IDF weights for 9 documents and 12 features (28 matrix non-zeros)', 'datetime': '2022-05-30T10:36:46.915854', 'gensim': '4.2.0', 'python': '3.10.2 (tags/v3.10.2:a58ebcc, Jan 17 2022, 14:12:15) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'initialize'}\n" ] } ], @@ -86,7 +95,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -104,7 +113,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -131,26 +140,26 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "2022-05-23 18:40:30,354 : INFO : using serial LSI version on this node\n", - "2022-05-23 18:40:30,354 : INFO : updating model with new documents\n", - "2022-05-23 18:40:30,355 : INFO : preparing a new chunk of documents\n", - "2022-05-23 18:40:30,358 : INFO : using 100 extra samples and 2 power iterations\n", - "2022-05-23 18:40:30,359 : INFO : 1st phase: constructing (12, 102) action matrix\n", - "2022-05-23 18:40:30,360 : INFO : orthonormalizing (12, 102) action matrix\n", - "2022-05-23 18:40:30,362 : INFO : 2nd phase: running dense svd on (12, 9) matrix\n", - "2022-05-23 18:40:30,363 : INFO : computing the final decomposition\n", - "2022-05-23 18:40:30,364 : INFO : keeping 2 factors (discarding 47.565% of energy spectrum)\n", - "2022-05-23 18:40:30,365 : INFO : processed documents up to #9\n", - "2022-05-23 18:40:30,365 : INFO : topic #0(1.594): 0.703*\"trees\" + 0.538*\"graph\" + 0.402*\"minors\" + 0.187*\"survey\" + 0.061*\"system\" + 0.060*\"time\" + 0.060*\"response\" + 0.058*\"user\" + 0.049*\"computer\" + 0.035*\"interface\"\n", - "2022-05-23 18:40:30,365 : INFO : topic #1(1.476): 0.460*\"system\" + 0.373*\"user\" + 0.332*\"eps\" + 0.328*\"interface\" + 0.320*\"response\" + 0.320*\"time\" + 0.293*\"computer\" + 0.280*\"human\" + 0.171*\"survey\" + -0.161*\"trees\"\n", - "2022-05-23 18:40:30,365 : INFO : LsiModel lifecycle event {'msg': 'trained LsiModel in 0.01s', 'datetime': '2022-05-23T18:40:30.365269', 'gensim': '4.2.0', 'python': '3.10.2 (tags/v3.10.2:a58ebcc, Jan 17 2022, 14:12:15) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'created'}\n" + "2022-05-30 10:36:49,649 : INFO : using serial LSI version on this node\n", + "2022-05-30 10:36:49,653 : INFO : updating model with new documents\n", + "2022-05-30 10:36:49,653 : INFO : preparing a new chunk of documents\n", + "2022-05-30 10:36:49,653 : INFO : using 100 extra samples and 2 power iterations\n", + "2022-05-30 10:36:49,653 : INFO : 1st phase: constructing (12, 102) action matrix\n", + "2022-05-30 10:36:49,657 : INFO : orthonormalizing (12, 102) action matrix\n", + "2022-05-30 10:36:49,665 : INFO : 2nd phase: running dense svd on (12, 9) matrix\n", + "2022-05-30 10:36:49,669 : INFO : computing the final decomposition\n", + "2022-05-30 10:36:49,669 : INFO : keeping 2 factors (discarding 47.565% of energy spectrum)\n", + "2022-05-30 10:36:49,673 : INFO : processed documents up to #9\n", + "2022-05-30 10:36:49,673 : INFO : topic #0(1.594): 0.703*\"trees\" + 0.538*\"graph\" + 0.402*\"minors\" + 0.187*\"survey\" + 0.061*\"system\" + 0.060*\"time\" + 0.060*\"response\" + 0.058*\"user\" + 0.049*\"computer\" + 0.035*\"interface\"\n", + "2022-05-30 10:36:49,677 : INFO : topic #1(1.476): -0.460*\"system\" + -0.373*\"user\" + -0.332*\"eps\" + -0.328*\"interface\" + -0.320*\"time\" + -0.320*\"response\" + -0.293*\"computer\" + -0.280*\"human\" + -0.171*\"survey\" + 0.161*\"trees\"\n", + "2022-05-30 10:36:49,677 : INFO : LsiModel lifecycle event {'msg': 'trained LsiModel in 0.02s', 'datetime': '2022-05-30T10:36:49.677361', 'gensim': '4.2.0', 'python': '3.10.2 (tags/v3.10.2:a58ebcc, Jan 17 2022, 14:12:15) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'created'}\n" ] } ], @@ -161,22 +170,59 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 8, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-05-30 11:00:13,864 : INFO : topic #0(1.594): 0.703*\"trees\" + 0.538*\"graph\" + 0.402*\"minors\" + 0.187*\"survey\" + 0.061*\"system\" + 0.060*\"time\" + 0.060*\"response\" + 0.058*\"user\" + 0.049*\"computer\" + 0.035*\"interface\"\n", + "2022-05-30 11:00:13,865 : INFO : topic #1(1.476): -0.460*\"system\" + -0.373*\"user\" + -0.332*\"eps\" + -0.328*\"interface\" + -0.320*\"time\" + -0.320*\"response\" + -0.293*\"computer\" + -0.280*\"human\" + -0.171*\"survey\" + 0.161*\"trees\"\n" + ] + }, { "data": { "text/plain": [ - "''" + "[(0,\n", + " '0.703*\"trees\" + 0.538*\"graph\" + 0.402*\"minors\" + 0.187*\"survey\" + 0.061*\"system\" + 0.060*\"time\" + 0.060*\"response\" + 0.058*\"user\" + 0.049*\"computer\" + 0.035*\"interface\"'),\n", + " (1,\n", + " '-0.460*\"system\" + -0.373*\"user\" + -0.332*\"eps\" + -0.328*\"interface\" + -0.320*\"time\" + -0.320*\"response\" + -0.293*\"computer\" + -0.280*\"human\" + -0.171*\"survey\" + 0.161*\"trees\"')]" ] }, - "execution_count": 12, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "lsi_model.print_topic(2)" + "lsi_model.print_topics(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[(0, 0.06600783396090393), (1, -0.5200703306361851)] Human machine interface for lab abc computer applications\n", + "[(0, 0.19667592859142538), (1, -0.7609563167700045)] A survey of user opinion of computer system response time\n", + "[(0, 0.08992639972446498), (1, -0.7241860626752512)] The EPS user interface management system\n", + "[(0, 0.07585847652178208), (1, -0.6320551586003431)] System and human system engineering testing of EPS\n", + "[(0, 0.10150299184980154), (1, -0.5737308483002954)] Relation of user perceived response time to error measurement\n", + "[(0, 0.7032108939378313), (1, 0.16115180214025876)] The generation of random binary unordered trees\n", + "[(0, 0.877478767311983), (1, 0.16758906864659515)] The intersection graph of paths in trees\n", + "[(0, 0.9098624686818577), (1, 0.14086553628719123)] Graph minors IV Widths of trees and well quasi ordering\n", + "[(0, 0.6165825350569277), (1, -0.05392907566389303)] Graph minors A survey\n" + ] + } + ], + "source": [ + "for doc, as_text in zip(corpus_lsi, documents):\n", + " print(doc, as_text)" ] }, {