{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"[Gensim tutorial 01](https://radimrehurek.com/gensim/auto_examples/core/run_core_concepts.html#sphx-glr-auto-examples-core-run-core-concepts-py)\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"import pprint"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- Document: some text.\n",
"- Corpus: a collection of documents.\n",
"- Vector: a mathematically convenient representation of a document.\n",
"- Model: an algorithm for transforming vectors from one representation to another.\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"document = 'Lorem ipsum dolor sit amet eheh 123 gelato'\n",
"\n",
"text_corpus = [\n",
"    \"Human machine interface for lab abc computer applications\",\n",
"    \"A survey of user opinion of computer system response time\",\n",
"    \"The EPS user interface management system\",\n",
"    \"System and human system engineering testing of EPS\",\n",
"    \"Relation of user perceived response time to error measurement\",\n",
"    \"The generation of random binary unordered trees\",\n",
"    \"The intersection graph of paths in trees\",\n",
"    \"Graph minors IV Widths of trees and well quasi ordering\",\n",
"    \"Graph minors A survey\",\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Cleaning the corpus\n",
"\n",
"# Create a set of frequent words\n",
"stoplist = set('for a of the and to in'.split(' '))\n",
"\n",
"# Lowercase each document, split it by white space and filter out stopwords\n",
"texts = [[word for word in document.lower().split() if word not in stoplist] for document in text_corpus]\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
|
|
"this next line seems crazy but it reads like:\n",
|
|
"- for every document in the list text_corpus do this:\n",
|
|
"- create a list of words by splitting the document \n",
|
|
"- and keep the word if it's not in the stoplist \n",
|
|
"\n",
|
|
"so the result should be a list of lists of words, one for each document "
|
|
]
|
|
},
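{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a sketch, here is the same comprehension written as explicit loops (`texts_loop` and `kept_words` are just illustrative names):\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# equivalent loop form of the comprehension above\n",
"texts_loop = []\n",
"for doc in text_corpus:\n",
"    kept_words = []\n",
"    for word in doc.lower().split():\n",
"        if word not in stoplist:\n",
"            kept_words.append(word)\n",
"    texts_loop.append(kept_words)\n",
"\n",
"assert texts_loop == texts\n"
]
},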
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[['human', 'interface', 'computer'],\n",
" ['survey', 'user', 'computer', 'system', 'response', 'time'],\n",
" ['eps', 'user', 'interface', 'system'],\n",
" ['system', 'human', 'system', 'eps'],\n",
" ['user', 'response', 'time'],\n",
" ['trees'],\n",
" ['graph', 'trees'],\n",
" ['graph', 'minors', 'trees'],\n",
" ['graph', 'minors', 'survey']]\n"
]
}
],
"source": [
"\n",
"# Count word frequencies\n",
"\n",
"# we are using defaultdict instead of a normal dictionary \n",
|
|
"# bc with this you can return a default value instead of an error if the key is missing in the dictionary\n",
|
|
"from collections import defaultdict\n",
|
|
"\n",
|
|
"frequency = defaultdict(int)\n",
|
|
"for text in texts:\n",
|
|
" for token in text:\n",
|
|
" frequency[token] += 1\n",
|
|
"\n",
|
|
"# Only keep words that appear more than once\n",
|
|
"\n",
|
|
"processed_corpus = [[token for token in text if frequency[token]>1] for text in texts]\n",
|
|
"pprint.pprint(processed_corpus)\n",
|
|
"\n"
|
|
]
|
|
},
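{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick illustration of the defaultdict behaviour mentioned above (`'nonexistent-token'` is a made-up key):\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# a missing key yields int() == 0 instead of raising a KeyError\n",
"# (note: the lookup also inserts the key with value 0)\n",
"print(frequency['nonexistent-token'])\n"
]
},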
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...>\n"
]
}
],
"source": [
"# to associate each word with an unique integer ID we use the dictionary class provided by gensim. This dictionary defines the vocabulary of all words that our processing knows about.\n",
|
|
"\n",
|
|
"from gensim import corpora\n",
|
|
"\n",
|
|
"dictionary = corpora.Dictionary(processed_corpus)\n",
|
|
"print(dictionary)\n",
|
|
"# Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...>\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"{'computer': 0,\n",
|
|
" 'eps': 8,\n",
|
|
" 'graph': 10,\n",
|
|
" 'human': 1,\n",
|
|
" 'interface': 2,\n",
|
|
" 'minors': 11,\n",
|
|
" 'response': 3,\n",
|
|
" 'survey': 4,\n",
|
|
" 'system': 5,\n",
|
|
" 'time': 6,\n",
|
|
" 'trees': 9,\n",
|
|
" 'user': 7}\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# print the id for each word\n",
|
|
"pprint.pprint(dictionary.token2id)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"[(0, 1), (1, 1)]\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# create a bag of word for a new document based on our corpus\n",
|
|
"new_doc = \"Human computer interaction\"\n",
|
|
"new_vec = dictionary.doc2bow(new_doc.lower().split())\n",
|
|
"print(new_vec)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"The first entry in each tuple corresponds to the ID of the token in the dictionary, the second corresponds to the count of this token."
|
|
]
|
|
},
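{
"cell_type": "markdown",
"metadata": {},
"source": [
"To see that mapping concretely, we can translate the IDs in `new_vec` back into words (a small sketch):\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# map each (id, count) pair back to (word, count) for readability\n",
"print([(dictionary[token_id], count) for token_id, count in new_vec])\n",
"# expected: [('computer', 1), ('human', 1)]\n"
]
},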
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[(0, 1), (1, 1), (2, 1)],\n",
" [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],\n",
" [(2, 1), (5, 1), (7, 1), (8, 1)],\n",
" [(1, 1), (5, 2), (8, 1)],\n",
" [(3, 1), (6, 1), (7, 1)],\n",
" [(9, 1)],\n",
" [(9, 1), (10, 1)],\n",
" [(9, 1), (10, 1), (11, 1)],\n",
" [(4, 1), (10, 1), (11, 1)]]\n"
]
}
],
"source": [
"bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]\n",
"pprint.pprint(bow_corpus)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now we can use models aka way to represent documents. One simple example of a model is the `tf-idf`. The tf-idf model transforms vectors from the bag-of-words representation to a vector space where the frequency counts are weighted according to the relative rarity of each word in the corpus."
|
|
]
|
|
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(5, 0.5898341626740045), (11, 0.8075244024440723)]\n"
]
}
],
"source": [
"from gensim import models\n",
"\n",
"# train the model\n",
"tfidf = models.TfidfModel(bow_corpus)\n",
"\n",
"# transform the 'system minors' string\n",
"words = \"system minors\".lower().split()\n",
"print(tfidf[dictionary.doc2bow(words)])"
]
},
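{
"cell_type": "markdown",
"metadata": {},
"source": [
"A rough sanity check of those numbers: with default settings gensim weights each term as roughly `tf * log2(N / df)` and then normalizes the vector to unit length, where `N` is the number of documents and `df` is how many documents contain the term. Here `'system'` appears in 3 of the 9 documents (log2(9/3) ≈ 1.585) and `'minors'` in 2 (log2(9/2) ≈ 2.170); normalizing the vector (1.585, 2.170) gives approximately (0.590, 0.807), which matches the output above."
]
},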
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can save the model and later load them back, to continue training or transform new documents. So the training is something that could be done through time.\n"
|
|
]
|
|
},
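{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of that save/load round trip (the file name `tfidf.model` is just an example):\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import tempfile\n",
"\n",
"# persist the trained model to disk, then load it back\n",
"model_path = os.path.join(tempfile.gettempdir(), 'tfidf.model')\n",
"tfidf.save(model_path)\n",
"loaded_tfidf = models.TfidfModel.load(model_path)\n",
"print(loaded_tfidf[dictionary.doc2bow('system minors'.split())])\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Finally, we can index the tf-idf corpus and query it for document similarity.\n"
]
},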
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(0, 0.0), (1, 0.32448703), (2, 0.41707572), (3, 0.7184812), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 0.0)]\n"
]
}
],
"source": [
"from gensim import similarities\n",
"\n",
"index = similarities.SparseMatrixSimilarity(tfidf[bow_corpus], num_features=12)\n",
"\n",
"query_document = 'system engineering'.lower().split()\n",
"query_bow = dictionary.doc2bow(query_document)\n",
"sims = index[tfidf[query_bow]]\n",
"print(list(enumerate(sims)))"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"3 0.7184812\n",
"2 0.41707572\n",
"1 0.32448703\n",
"0 0.0\n",
"4 0.0\n",
"5 0.0\n",
"6 0.0\n",
"7 0.0\n",
"8 0.0\n"
]
}
],
"source": [
"# sorting the similarities by score\n",
"\n",
"for document_number, score in sorted(enumerate(sims), key=lambda x: x[1], reverse=True):\n",
"    print(document_number, score)"
]
}
],
"metadata": {
"interpreter": {
"hash": "a991b7e5a58af45663279ce1606e861d35361e78ec04a120e3cc987f7e474d97"
},
"kernelspec": {
"display_name": "Python 3.10.2 ('venv': venv)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.2"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}