You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
glueberry/tutorials/gensim_similarity_queries.i...

227 lines
7.9 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Similarity Queries\n",
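"\n",
"This notebook follows the [Gensim similarity queries tutorial](https://radimrehurek.com/gensim/auto_examples/core/run_similarity_queries.html#sphx-glr-auto-examples-core-run-similarity-queries-py): it builds a small corpus, projects it into LSI space, and ranks the documents by their similarity to a query."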
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import logging\n",
"\n",
"# Emit INFO-level log records (timestamp : level : message) in the cell output.\n",
"logging.basicConfig(\n",
"    format='%(asctime)s : %(levelname)s : %(message)s',\n",
"    level=logging.INFO,\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2022-05-30 11:17:30,012 : INFO : adding document #0 to Dictionary<0 unique tokens: []>\n",
"2022-05-30 11:17:30,013 : INFO : built Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...> from 9 documents (total 29 corpus positions)\n",
"2022-05-30 11:17:30,014 : INFO : Dictionary lifecycle event {'msg': \"built Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...> from 9 documents (total 29 corpus positions)\", 'datetime': '2022-05-30T11:17:30.014843', 'gensim': '4.2.0', 'python': '3.10.2 (tags/v3.10.2:a58ebcc, Jan 17 2022, 14:12:15) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'created'}\n"
]
}
],
"source": [
"# Creating the corpus\n",
"\n",
"from collections import Counter\n",
"from gensim import corpora\n",
"\n",
"documents = [\n",
"    \"Human machine interface for lab abc computer applications\",\n",
"    \"A survey of user opinion of computer system response time\",\n",
"    \"The EPS user interface management system\",\n",
"    \"System and human system engineering testing of EPS\",\n",
"    \"Relation of user perceived response time to error measurement\",\n",
"    \"The generation of random binary unordered trees\",\n",
"    \"The intersection graph of paths in trees\",\n",
"    \"Graph minors IV Widths of trees and well quasi ordering\",\n",
"    \"Graph minors A survey\",\n",
"]\n",
"\n",
"# Lowercase each document, split it on whitespace, and drop common stop words.\n",
"stoplist = set('for a of the and to in'.split())\n",
"tokenized_docs = [\n",
"    [word for word in document.lower().split() if word not in stoplist]\n",
"    for document in documents\n",
"]\n",
"\n",
"# Count how often each token occurs across all documents.\n",
"frequency = Counter(token for tokens in tokenized_docs for token in tokens)\n",
"\n",
"# Keep only the tokens that appear more than once overall.\n",
"texts = [\n",
"    [token for token in tokens if frequency[token] > 1]\n",
"    for tokens in tokenized_docs\n",
"]\n",
"\n",
"# Build the token dictionary from the filtered texts, then convert every\n",
"# document with doc2bow to obtain the corpus used by the later cells.\n",
"dictionary = corpora.Dictionary(texts)\n",
"corpus = [dictionary.doc2bow(text) for text in texts]\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2022-05-30 11:18:04,795 : INFO : using serial LSI version on this node\n",
"2022-05-30 11:18:04,796 : INFO : updating model with new documents\n",
"2022-05-30 11:18:04,797 : INFO : preparing a new chunk of documents\n",
"2022-05-30 11:18:04,798 : INFO : using 100 extra samples and 2 power iterations\n",
"2022-05-30 11:18:04,798 : INFO : 1st phase: constructing (12, 102) action matrix\n",
"2022-05-30 11:18:04,800 : INFO : orthonormalizing (12, 102) action matrix\n",
"2022-05-30 11:18:04,803 : INFO : 2nd phase: running dense svd on (12, 9) matrix\n",
"2022-05-30 11:18:04,803 : INFO : computing the final decomposition\n",
"2022-05-30 11:18:04,804 : INFO : keeping 2 factors (discarding 43.156% of energy spectrum)\n",
"2022-05-30 11:18:04,804 : INFO : processed documents up to #9\n",
"2022-05-30 11:18:04,805 : INFO : topic #0(3.341): -0.644*\"system\" + -0.404*\"user\" + -0.301*\"eps\" + -0.265*\"response\" + -0.265*\"time\" + -0.240*\"computer\" + -0.221*\"human\" + -0.206*\"survey\" + -0.198*\"interface\" + -0.036*\"graph\"\n",
"2022-05-30 11:18:04,806 : INFO : topic #1(2.542): 0.623*\"graph\" + 0.490*\"trees\" + 0.451*\"minors\" + 0.274*\"survey\" + -0.167*\"system\" + -0.141*\"eps\" + -0.113*\"human\" + 0.107*\"response\" + 0.107*\"time\" + -0.072*\"interface\"\n",
"2022-05-30 11:18:04,806 : INFO : LsiModel lifecycle event {'msg': 'trained LsiModel<num_terms=12, num_topics=2, decay=1.0, chunksize=20000> in 0.01s', 'datetime': '2022-05-30T11:18:04.806885', 'gensim': '4.2.0', 'python': '3.10.2 (tags/v3.10.2:a58ebcc, Jan 17 2022, 14:12:15) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'created'}\n"
]
}
],
"source": [
"from gensim import models\n",
"\n",
"# Train an LSI model with two topics on the bag-of-words corpus.\n",
"lsi = models.LsiModel(\n",
"    corpus,\n",
"    id2word=dictionary,\n",
"    num_topics=2,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(0, -0.461821004532716), (1, -0.07002766527900031)]\n"
]
}
],
"source": [
"# Prepare the query\n",
"\n",
"doc = \"Human computer interaction\"\n",
"query_tokens = doc.lower().split()\n",
"vec_bow = dictionary.doc2bow(query_tokens)\n",
"\n",
"# Convert the bag-of-words query to LSI space using the trained model.\n",
"vec_lsi = lsi[vec_bow]\n",
"print(vec_lsi)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2022-05-30 11:33:41,625 : WARNING : scanning corpus to determine the number of features (consider setting `num_features` explicitly)\n",
"2022-05-30 11:33:41,626 : INFO : creating matrix with 9 documents and 2 features\n"
]
}
],
"source": [
"from gensim import similarities\n",
"\n",
"# Pass num_features explicitly so gensim does not have to scan the corpus to\n",
"# determine it (it logs a warning when it has to); the LSI vectors have one\n",
"# feature per topic.\n",
"index = similarities.MatrixSimilarity(lsi[corpus], num_features=lsi.num_topics)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(0, 0.998093), (1, 0.93748635), (2, 0.9984453), (3, 0.9865886), (4, 0.90755945), (5, -0.12416792), (6, -0.10639259), (7, -0.09879464), (8, 0.050041765)]\n"
]
}
],
"source": [
"# Query the index with the LSI-space query vector and show (position, score) pairs.\n",
"sims = index[vec_lsi]\n",
"indexed_sims = list(enumerate(sims))\n",
"print(indexed_sims)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.9984453 The EPS user interface management system\n",
"0.998093 Human machine interface for lab abc computer applications\n",
"0.9865886 System and human system engineering testing of EPS\n",
"0.93748635 A survey of user opinion of computer system response time\n",
"0.90755945 Relation of user perceived response time to error measurement\n",
"0.050041765 Graph minors A survey\n",
"-0.09879464 Graph minors IV Widths of trees and well quasi ordering\n",
"-0.10639259 The intersection graph of paths in trees\n",
"-0.12416792 The generation of random binary unordered trees\n"
]
}
],
"source": [
"# Sort into a new name instead of rebinding `sims`: overwriting it with\n",
"# (position, score) tuples would make this cell fail when run a second time.\n",
"ranked_sims = sorted(enumerate(sims), key=lambda item: -item[1])\n",
"\n",
"# Print the documents from most to least similar to the query.\n",
"for doc_position, doc_score in ranked_sims:\n",
"    print(doc_score, documents[doc_position])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"interpreter": {
"hash": "a991b7e5a58af45663279ce1606e861d35361e78ec04a120e3cc987f7e474d97"
},
"kernelspec": {
"display_name": "Python 3.10.2 ('venv': venv)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.2"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}