{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Topics and Transformations\n",
    "\n",
    "[GENSIM tutorial](https://radimrehurek.com/gensim/auto_examples/core/run_topics_and_transformations.html)\n",
    "\n",
    "A small bag-of-words corpus is built from nine short documents, re-weighted with TF-IDF, and then projected into a 2-topic LSI space."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import logging\n",
    "from collections import defaultdict\n",
    "\n",
    "from gensim import corpora, models\n",
    "\n",
    "# Surface gensim's INFO-level progress messages in the cell outputs.\n",
    "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Build the corpus\n",
    "\n",
    "Tokenize the documents, drop stop words and words that occur only once, then convert each document to a bag-of-words vector."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2022-05-30 10:36:44,407 : INFO : adding document #0 to Dictionary<0 unique tokens: []>\n",
      "2022-05-30 10:36:44,407 : INFO : built Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...> from 9 documents (total 29 corpus positions)\n",
      "2022-05-30 10:36:44,407 : INFO : Dictionary lifecycle event {'msg': \"built Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...> from 9 documents (total 29 corpus positions)\", 'datetime': '2022-05-30T10:36:44.407538', 'gensim': '4.2.0', 'python': '3.10.2 (tags/v3.10.2:a58ebcc, Jan 17 2022, 14:12:15) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'created'}\n"
     ]
    }
   ],
   "source": [
    "documents = [\n",
    "    \"Human machine interface for lab abc computer applications\",\n",
    "    \"A survey of user opinion of computer system response time\",\n",
    "    \"The EPS user interface management system\",\n",
    "    \"System and human system engineering testing of EPS\",\n",
    "    \"Relation of user perceived response time to error measurement\",\n",
    "    \"The generation of random binary unordered trees\",\n",
    "    \"The intersection graph of paths in trees\",\n",
    "    \"Graph minors IV Widths of trees and well quasi ordering\",\n",
    "    \"Graph minors A survey\",\n",
    "]\n",
    "\n",
    "# remove common words and tokenize\n",
    "stoplist = set('for a of the and to in'.split())\n",
    "tokenized_docs = [\n",
    "    [word for word in document.lower().split() if word not in stoplist]\n",
    "    for document in documents\n",
    "]\n",
    "\n",
    "# remove words that appear only once\n",
    "frequency = defaultdict(int)\n",
    "for tokens in tokenized_docs:\n",
    "    for token in tokens:\n",
    "        frequency[token] += 1\n",
    "\n",
    "texts = [\n",
    "    [token for token in tokens if frequency[token] > 1]\n",
    "    for tokens in tokenized_docs\n",
    "]\n",
    "\n",
    "dictionary = corpora.Dictionary(texts)\n",
    "corpus = [dictionary.doc2bow(text) for text in texts]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## TF-IDF transformation\n",
    "\n",
    "Fit a TF-IDF model on the bag-of-words corpus, then apply it to a single vector and to the whole corpus."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2022-05-30 10:36:46,911 : INFO : collecting document frequencies\n",
      "2022-05-30 10:36:46,911 : INFO : PROGRESS: processing document #0\n",
      "2022-05-30 10:36:46,915 : INFO : TfidfModel lifecycle event {'msg': 'calculated IDF weights for 9 documents and 12 features (28 matrix non-zeros)', 'datetime': '2022-05-30T10:36:46.915854', 'gensim': '4.2.0', 'python': '3.10.2 (tags/v3.10.2:a58ebcc, Jan 17 2022, 14:12:15) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'initialize'}\n"
     ]
    }
   ],
   "source": [
    "tfidf = models.TfidfModel(corpus)  # 1. initialize the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(0, 0.7071067811865476), (1, 0.7071067811865476)]\n"
     ]
    }
   ],
   "source": [
    "doc_bow = [(0, 1), (1, 1)]\n",
    "print(tfidf[doc_bow])  # 2. use the model to transform vectors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)]\n",
      "[(0, 0.44424552527467476), (3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.3244870206138555), (6, 0.44424552527467476), (7, 0.3244870206138555)]\n",
      "[(2, 0.5710059809418182), (5, 0.4170757362022777), (7, 0.4170757362022777), (8, 0.5710059809418182)]\n",
      "[(1, 0.49182558987264147), (5, 0.7184811607083769), (8, 0.49182558987264147)]\n",
      "[(3, 0.6282580468670046), (6, 0.6282580468670046), (7, 0.45889394536615247)]\n",
      "[(9, 1.0)]\n",
      "[(9, 0.7071067811865475), (10, 0.7071067811865475)]\n",
      "[(9, 0.5080429008916749), (10, 0.5080429008916749), (11, 0.695546419520037)]\n",
      "[(4, 0.6282580468670046), (10, 0.45889394536615247), (11, 0.6282580468670046)]\n"
     ]
    }
   ],
   "source": [
    "corpus_tfidf = tfidf[corpus]\n",
    "for doc in corpus_tfidf:\n",
    "    print(doc)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## LSI transformation\n",
    "\n",
    "Train a 2-topic LSI model on the TF-IDF corpus, inspect the topics, and show each document's coordinates in the LSI space."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2022-05-30 10:36:49,649 : INFO : using serial LSI version on this node\n",
      "2022-05-30 10:36:49,653 : INFO : updating model with new documents\n",
      "2022-05-30 10:36:49,653 : INFO : preparing a new chunk of documents\n",
      "2022-05-30 10:36:49,653 : INFO : using 100 extra samples and 2 power iterations\n",
      "2022-05-30 10:36:49,653 : INFO : 1st phase: constructing (12, 102) action matrix\n",
      "2022-05-30 10:36:49,657 : INFO : orthonormalizing (12, 102) action matrix\n",
      "2022-05-30 10:36:49,665 : INFO : 2nd phase: running dense svd on (12, 9) matrix\n",
      "2022-05-30 10:36:49,669 : INFO : computing the final decomposition\n",
      "2022-05-30 10:36:49,669 : INFO : keeping 2 factors (discarding 47.565% of energy spectrum)\n",
      "2022-05-30 10:36:49,673 : INFO : processed documents up to #9\n",
      "2022-05-30 10:36:49,673 : INFO : topic #0(1.594): 0.703*\"trees\" + 0.538*\"graph\" + 0.402*\"minors\" + 0.187*\"survey\" + 0.061*\"system\" + 0.060*\"time\" + 0.060*\"response\" + 0.058*\"user\" + 0.049*\"computer\" + 0.035*\"interface\"\n",
      "2022-05-30 10:36:49,677 : INFO : topic #1(1.476): -0.460*\"system\" + -0.373*\"user\" + -0.332*\"eps\" + -0.328*\"interface\" + -0.320*\"time\" + -0.320*\"response\" + -0.293*\"computer\" + -0.280*\"human\" + -0.171*\"survey\" + 0.161*\"trees\"\n",
      "2022-05-30 10:36:49,677 : INFO : LsiModel lifecycle event {'msg': 'trained LsiModel in 0.02s', 'datetime': '2022-05-30T10:36:49.677361', 'gensim': '4.2.0', 'python': '3.10.2 (tags/v3.10.2:a58ebcc, Jan 17 2022, 14:12:15) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'created'}\n"
     ]
    }
   ],
   "source": [
    "lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)  # initialize an LSI transformation\n",
    "corpus_lsi = lsi_model[corpus_tfidf]  # create a double wrapper over the original corpus: bow -> tfidf -> fold-in-lsi"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2022-05-30 11:00:13,864 : INFO : topic #0(1.594): 0.703*\"trees\" + 0.538*\"graph\" + 0.402*\"minors\" + 0.187*\"survey\" + 0.061*\"system\" + 0.060*\"time\" + 0.060*\"response\" + 0.058*\"user\" + 0.049*\"computer\" + 0.035*\"interface\"\n",
      "2022-05-30 11:00:13,865 : INFO : topic #1(1.476): -0.460*\"system\" + -0.373*\"user\" + -0.332*\"eps\" + -0.328*\"interface\" + -0.320*\"time\" + -0.320*\"response\" + -0.293*\"computer\" + -0.280*\"human\" + -0.171*\"survey\" + 0.161*\"trees\"\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[(0,\n",
       "  '0.703*\"trees\" + 0.538*\"graph\" + 0.402*\"minors\" + 0.187*\"survey\" + 0.061*\"system\" + 0.060*\"time\" + 0.060*\"response\" + 0.058*\"user\" + 0.049*\"computer\" + 0.035*\"interface\"'),\n",
       " (1,\n",
       "  '-0.460*\"system\" + -0.373*\"user\" + -0.332*\"eps\" + -0.328*\"interface\" + -0.320*\"time\" + -0.320*\"response\" + -0.293*\"computer\" + -0.280*\"human\" + -0.171*\"survey\" + 0.161*\"trees\"')]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lsi_model.print_topics(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(0, 0.06600783396090393), (1, -0.5200703306361851)] Human machine interface for lab abc computer applications\n",
      "[(0, 0.19667592859142538), (1, -0.7609563167700045)] A survey of user opinion of computer system response time\n",
      "[(0, 0.08992639972446498), (1, -0.7241860626752512)] The EPS user interface management system\n",
      "[(0, 0.07585847652178208), (1, -0.6320551586003431)] System and human system engineering testing of EPS\n",
      "[(0, 0.10150299184980154), (1, -0.5737308483002954)] Relation of user perceived response time to error measurement\n",
      "[(0, 0.7032108939378313), (1, 0.16115180214025876)] The generation of random binary unordered trees\n",
      "[(0, 0.877478767311983), (1, 0.16758906864659515)] The intersection graph of paths in trees\n",
      "[(0, 0.9098624686818577), (1, 0.14086553628719123)] Graph minors IV Widths of trees and well quasi ordering\n",
      "[(0, 0.6165825350569277), (1, -0.05392907566389303)] Graph minors A survey\n"
     ]
    }
   ],
   "source": [
    "for doc, as_text in zip(corpus_lsi, documents):\n",
    "    print(doc, as_text)"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "a991b7e5a58af45663279ce1606e861d35361e78ec04a120e3cc987f7e474d97"
  },
  "kernelspec": {
   "display_name": "Python 3.10.2 ('venv': venv)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.2"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}