glueberry/tutorials/gensim_topic_transformation...

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import logging\n",
    "\n",
    "# Surface INFO-level log records (gensim reports its progress through logging).\n",
    "logging.basicConfig(\n",
    "    format='%(asctime)s : %(levelname)s : %(message)s',\n",
    "    level=logging.INFO,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2022-05-23 18:15:06,938 : INFO : adding document #0 to Dictionary<0 unique tokens: []>\n",
      "2022-05-23 18:15:06,939 : INFO : built Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...> from 9 documents (total 29 corpus positions)\n",
      "2022-05-23 18:15:06,939 : INFO : Dictionary lifecycle event {'msg': \"built Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...> from 9 documents (total 29 corpus positions)\", 'datetime': '2022-05-23T18:15:06.939467', 'gensim': '4.2.0', 'python': '3.10.2 (tags/v3.10.2:a58ebcc, Jan 17 2022, 14:12:15) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'created'}\n"
     ]
    }
   ],
   "source": [
    "from collections import defaultdict\n",
    "from gensim import corpora\n",
    "\n",
    "# Toy corpus of nine short titles.\n",
    "documents = [\n",
    "    \"Human machine interface for lab abc computer applications\",\n",
    "    \"A survey of user opinion of computer system response time\",\n",
    "    \"The EPS user interface management system\",\n",
    "    \"System and human system engineering testing of EPS\",\n",
    "    \"Relation of user perceived response time to error measurement\",\n",
    "    \"The generation of random binary unordered trees\",\n",
    "    \"The intersection graph of paths in trees\",\n",
    "    \"Graph minors IV Widths of trees and well quasi ordering\",\n",
    "    \"Graph minors A survey\",\n",
    "]\n",
    "\n",
    "# Lowercase, split on whitespace, and drop the stop words.\n",
    "stoplist = set('for a of the and to in'.split())\n",
    "texts = [\n",
    "    [term for term in title.lower().split() if term not in stoplist]\n",
    "    for title in documents\n",
    "]\n",
    "\n",
    "# Count how often each remaining token occurs across the whole corpus.\n",
    "frequency = defaultdict(int)\n",
    "for text in texts:\n",
    "    for token in text:\n",
    "        frequency[token] += 1\n",
    "\n",
    "# Keep only the tokens that were seen more than once.\n",
    "texts = [\n",
    "    [term for term in tokens if frequency[term] > 1]\n",
    "    for tokens in texts\n",
    "]\n",
    "\n",
    "# Build the gensim dictionary, then convert every document with doc2bow.\n",
    "dictionary = corpora.Dictionary(texts)\n",
    "corpus = [dictionary.doc2bow(tokens) for tokens in texts]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2022-05-23 18:16:10,608 : INFO : collecting document frequencies\n",
      "2022-05-23 18:16:10,609 : INFO : PROGRESS: processing document #0\n",
      "2022-05-23 18:16:10,609 : INFO : TfidfModel lifecycle event {'msg': 'calculated IDF weights for 9 documents and 12 features (28 matrix non-zeros)', 'datetime': '2022-05-23T18:16:10.609938', 'gensim': '4.2.0', 'python': '3.10.2 (tags/v3.10.2:a58ebcc, Jan 17 2022, 14:12:15) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'initialize'}\n"
     ]
    }
   ],
   "source": [
    "from gensim import models\n",
    "\n",
    "# Step 1: initialize the TF-IDF model from the bag-of-words corpus.\n",
    "tfidf = models.TfidfModel(corpus)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(0, 0.7071067811865476), (1, 0.7071067811865476)]\n"
     ]
    }
   ],
   "source": [
    "# Step 2: use the model to transform a single vector.\n",
    "doc_bow = [(0, 1), (1, 1)]\n",
    "print(tfidf[doc_bow])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)]\n",
      "[(0, 0.44424552527467476), (3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.3244870206138555), (6, 0.44424552527467476), (7, 0.3244870206138555)]\n",
      "[(2, 0.5710059809418182), (5, 0.4170757362022777), (7, 0.4170757362022777), (8, 0.5710059809418182)]\n",
      "[(1, 0.49182558987264147), (5, 0.7184811607083769), (8, 0.49182558987264147)]\n",
      "[(3, 0.6282580468670046), (6, 0.6282580468670046), (7, 0.45889394536615247)]\n",
      "[(9, 1.0)]\n",
      "[(9, 0.7071067811865475), (10, 0.7071067811865475)]\n",
      "[(9, 0.5080429008916749), (10, 0.5080429008916749), (11, 0.695546419520037)]\n",
      "[(4, 0.6282580468670046), (10, 0.45889394536615247), (11, 0.6282580468670046)]\n"
     ]
    }
   ],
   "source": [
    "# Apply the transformation to the entire corpus and print one vector per document.\n",
    "corpus_tfidf = tfidf[corpus]\n",
    "for vector in corpus_tfidf:\n",
    "    print(vector)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2022-05-23 18:40:30,354 : INFO : using serial LSI version on this node\n",
      "2022-05-23 18:40:30,354 : INFO : updating model with new documents\n",
      "2022-05-23 18:40:30,355 : INFO : preparing a new chunk of documents\n",
      "2022-05-23 18:40:30,358 : INFO : using 100 extra samples and 2 power iterations\n",
      "2022-05-23 18:40:30,359 : INFO : 1st phase: constructing (12, 102) action matrix\n",
      "2022-05-23 18:40:30,360 : INFO : orthonormalizing (12, 102) action matrix\n",
      "2022-05-23 18:40:30,362 : INFO : 2nd phase: running dense svd on (12, 9) matrix\n",
      "2022-05-23 18:40:30,363 : INFO : computing the final decomposition\n",
      "2022-05-23 18:40:30,364 : INFO : keeping 2 factors (discarding 47.565% of energy spectrum)\n",
      "2022-05-23 18:40:30,365 : INFO : processed documents up to #9\n",
      "2022-05-23 18:40:30,365 : INFO : topic #0(1.594): 0.703*\"trees\" + 0.538*\"graph\" + 0.402*\"minors\" + 0.187*\"survey\" + 0.061*\"system\" + 0.060*\"time\" + 0.060*\"response\" + 0.058*\"user\" + 0.049*\"computer\" + 0.035*\"interface\"\n",
      "2022-05-23 18:40:30,365 : INFO : topic #1(1.476): 0.460*\"system\" + 0.373*\"user\" + 0.332*\"eps\" + 0.328*\"interface\" + 0.320*\"response\" + 0.320*\"time\" + 0.293*\"computer\" + 0.280*\"human\" + 0.171*\"survey\" + -0.161*\"trees\"\n",
      "2022-05-23 18:40:30,365 : INFO : LsiModel lifecycle event {'msg': 'trained LsiModel<num_terms=12, num_topics=2, decay=1.0, chunksize=20000> in 0.01s', 'datetime': '2022-05-23T18:40:30.365269', 'gensim': '4.2.0', 'python': '3.10.2 (tags/v3.10.2:a58ebcc, Jan 17 2022, 14:12:15) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'created'}\n"
     ]
    }
   ],
   "source": [
    "lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2) # initialize an LSI transformation\n",
    "corpus_lsi = lsi_model[corpus_tfidf] # create a double wrapper over the original corpus: bow -> tfidf -> lsi"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The model was built with num_topics=2, so the only valid topic ids are 0 and 1.\n",
    "# print_topics(2) reports both of them.\n",
    "lsi_model.print_topics(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "a991b7e5a58af45663279ce1606e861d35361e78ec04a120e3cc987f7e474d97"
  },
  "kernelspec": {
   "display_name": "Python 3.10.2 ('venv': venv)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.2"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
tut 3 3 years ago			`{`
			`"cells": [`
			`{`
			`"cell_type": "code",`
			`"execution_count": 1,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"import logging\n",`
			`"logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO )"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 2,`
			`"metadata": {},`
			`"outputs": [`
			`{`
			`"name": "stderr",`
			`"output_type": "stream",`
			`"text": [`
			`"2022-05-23 18:15:06,938 : INFO : adding document #0 to Dictionary<0 unique tokens: []>\n",`
			`"2022-05-23 18:15:06,939 : INFO : built Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...> from 9 documents (total 29 corpus positions)\n",`
			`"2022-05-23 18:15:06,939 : INFO : Dictionary lifecycle event {'msg': \"built Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...> from 9 documents (total 29 corpus positions)\", 'datetime': '2022-05-23T18:15:06.939467', 'gensim': '4.2.0', 'python': '3.10.2 (tags/v3.10.2:a58ebcc, Jan 17 2022, 14:12:15) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'created'}\n"`
			`]`
			`}`
			`],`
			`"source": [`
			`"from collections import defaultdict\n",`
			`"from gensim import corpora\n",`
			`"\n",`
			`"documents = [\n",`
			`" \"Human machine interface for lab abc computer applications\",\n",`
			`" \"A survey of user opinion of computer system response time\",\n",`
			`" \"The EPS user interface management system\",\n",`
			`" \"System and human system engineering testing of EPS\",\n",`
			`" \"Relation of user perceived response time to error measurement\",\n",`
			`" \"The generation of random binary unordered trees\",\n",`
			`" \"The intersection graph of paths in trees\",\n",`
			`" \"Graph minors IV Widths of trees and well quasi ordering\",\n",`
			`" \"Graph minors A survey\",\n",`
			`"]\n",`
			`"\n",`
			`"# remove common words and tokenize\n",`
			`"stoplist = set('for a of the and to in'.split())\n",`
			`"texts = [\n",`
			`" [word for word in document.lower().split() if word not in stoplist]\n",`
			`" for document in documents\n",`
			`"]\n",`
			`"\n",`
			`"# remove words that appear only once\n",`
			`"frequency = defaultdict(int)\n",`
			`"for text in texts:\n",`
			`"    for token in text:\n",`
			`"        frequency[token] += 1\n",`
			`"\n",`
			`"texts = [\n",`
			`" [token for token in text if frequency[token] > 1]\n",`
			`" for text in texts\n",`
			`"]\n",`
			`"\n",`
			`"dictionary = corpora.Dictionary(texts)\n",`
			`"corpus = [dictionary.doc2bow(text) for text in texts]"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 3,`
			`"metadata": {},`
			`"outputs": [`
			`{`
			`"name": "stderr",`
			`"output_type": "stream",`
			`"text": [`
			`"2022-05-23 18:16:10,608 : INFO : collecting document frequencies\n",`
			`"2022-05-23 18:16:10,609 : INFO : PROGRESS: processing document #0\n",`
			`"2022-05-23 18:16:10,609 : INFO : TfidfModel lifecycle event {'msg': 'calculated IDF weights for 9 documents and 12 features (28 matrix non-zeros)', 'datetime': '2022-05-23T18:16:10.609938', 'gensim': '4.2.0', 'python': '3.10.2 (tags/v3.10.2:a58ebcc, Jan 17 2022, 14:12:15) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'initialize'}\n"`
			`]`
			`}`
			`],`
			`"source": [`
			`"from gensim import models\n",`
			`"\n",`
			`"tfidf = models.TfidfModel(corpus) # 1. initialize the model"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 5,`
			`"metadata": {},`
			`"outputs": [`
			`{`
			`"name": "stdout",`
			`"output_type": "stream",`
			`"text": [`
			`"[(0, 0.7071067811865476), (1, 0.7071067811865476)]\n"`
			`]`
			`}`
			`],`
			`"source": [`
			`"doc_bow = [(0,1), (1,1)]\n",`
			`"print(tfidf[doc_bow]) # 2. use the model to transform vectors"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 6,`
			`"metadata": {},`
			`"outputs": [`
			`{`
			`"name": "stdout",`
			`"output_type": "stream",`
			`"text": [`
			`"[(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)]\n",`
			`"[(0, 0.44424552527467476), (3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.3244870206138555), (6, 0.44424552527467476), (7, 0.3244870206138555)]\n",`
			`"[(2, 0.5710059809418182), (5, 0.4170757362022777), (7, 0.4170757362022777), (8, 0.5710059809418182)]\n",`
			`"[(1, 0.49182558987264147), (5, 0.7184811607083769), (8, 0.49182558987264147)]\n",`
			`"[(3, 0.6282580468670046), (6, 0.6282580468670046), (7, 0.45889394536615247)]\n",`
			`"[(9, 1.0)]\n",`
			`"[(9, 0.7071067811865475), (10, 0.7071067811865475)]\n",`
			`"[(9, 0.5080429008916749), (10, 0.5080429008916749), (11, 0.695546419520037)]\n",`
			`"[(4, 0.6282580468670046), (10, 0.45889394536615247), (11, 0.6282580468670046)]\n"`
			`]`
			`}`
			`],`
			`"source": [`
			`"corpus_tfidf = tfidf[corpus]\n",`
			`"for doc in corpus_tfidf:\n",`
			`" print(doc)"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 9,`
			`"metadata": {},`
			`"outputs": [`
			`{`
			`"name": "stderr",`
			`"output_type": "stream",`
			`"text": [`
			`"2022-05-23 18:40:30,354 : INFO : using serial LSI version on this node\n",`
			`"2022-05-23 18:40:30,354 : INFO : updating model with new documents\n",`
			`"2022-05-23 18:40:30,355 : INFO : preparing a new chunk of documents\n",`
			`"2022-05-23 18:40:30,358 : INFO : using 100 extra samples and 2 power iterations\n",`
			`"2022-05-23 18:40:30,359 : INFO : 1st phase: constructing (12, 102) action matrix\n",`
			`"2022-05-23 18:40:30,360 : INFO : orthonormalizing (12, 102) action matrix\n",`
			`"2022-05-23 18:40:30,362 : INFO : 2nd phase: running dense svd on (12, 9) matrix\n",`
			`"2022-05-23 18:40:30,363 : INFO : computing the final decomposition\n",`
			`"2022-05-23 18:40:30,364 : INFO : keeping 2 factors (discarding 47.565% of energy spectrum)\n",`
			`"2022-05-23 18:40:30,365 : INFO : processed documents up to #9\n",`
			`"2022-05-23 18:40:30,365 : INFO : topic #0(1.594): 0.703*\"trees\" + 0.538*\"graph\" + 0.402*\"minors\" + 0.187*\"survey\" + 0.061*\"system\" + 0.060*\"time\" + 0.060*\"response\" + 0.058*\"user\" + 0.049*\"computer\" + 0.035*\"interface\"\n",`
			`"2022-05-23 18:40:30,365 : INFO : topic #1(1.476): 0.460*\"system\" + 0.373*\"user\" + 0.332*\"eps\" + 0.328*\"interface\" + 0.320*\"response\" + 0.320*\"time\" + 0.293*\"computer\" + 0.280*\"human\" + 0.171*\"survey\" + -0.161*\"trees\"\n",`
			`"2022-05-23 18:40:30,365 : INFO : LsiModel lifecycle event {'msg': 'trained LsiModel<num_terms=12, num_topics=2, decay=1.0, chunksize=20000> in 0.01s', 'datetime': '2022-05-23T18:40:30.365269', 'gensim': '4.2.0', 'python': '3.10.2 (tags/v3.10.2:a58ebcc, Jan 17 2022, 14:12:15) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'created'}\n"`
			`]`
			`}`
			`],`
			`"source": [`
			`"lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2) # initialize an LSI transformation\n",`
			`"corpus_lsi = lsi_model[corpus_tfidf] # create a double wrapper over the original corpus: bow -> tfidf -> lsi"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 12,`
			`"metadata": {},`
			`"outputs": [`
			`{`
			`"data": {`
			`"text/plain": [`
			`"''"`
			`]`
			`},`
			`"execution_count": 12,`
			`"metadata": {},`
			`"output_type": "execute_result"`
			`}`
			`],`
			`"source": [`
			`"lsi_model.print_topic(2)"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": null,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": []`
			`}`
			`],`
			`"metadata": {`
			`"interpreter": {`
			`"hash": "a991b7e5a58af45663279ce1606e861d35361e78ec04a120e3cc987f7e474d97"`
			`},`
			`"kernelspec": {`
			`"display_name": "Python 3.10.2 ('venv': venv)",`
			`"language": "python",`
			`"name": "python3"`
			`},`
			`"language_info": {`
			`"codemirror_mode": {`
			`"name": "ipython",`
			`"version": 3`
			`},`
			`"file_extension": ".py",`
			`"mimetype": "text/x-python",`
			`"name": "python",`
			`"nbconvert_exporter": "python",`
			`"pygments_lexer": "ipython3",`
			`"version": "3.10.2"`
			`},`
			`"orig_nbformat": 4`
			`},`
			`"nbformat": 4,`
			`"nbformat_minor": 2`
			`}`