tutorials 1 and 2
parent 249c976532
commit 7691a59fa9
@@ -0,0 +1,11 @@
click==8.1.3
colorama==0.4.4
Cython==0.29.28
gensim==4.2.0
joblib==1.1.0
nltk==3.7
numpy==1.22.4
regex==2022.4.24
scipy==1.8.1
smart-open==6.0.0
tqdm==4.64.0
@@ -0,0 +1,346 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"[Gensim tutorial 01](https://radimrehurek.com/gensim/auto_examples/core/run_core_concepts.html#sphx-glr-auto-examples-core-run-core-concepts-py)\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"import pprint"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- Document: some text.\n",
"- Corpus: a collection of documents.\n",
"- Vector: a mathematically convenient representation of a document.\n",
"- Model: an algorithm for transforming vectors from one representation to another.\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"document = 'Lorem ipsum dolor sit amet eheh 123 gelato'\n",
"\n",
"text_corpus = [\n",
" \"Human machine interface for lab abc computer applications\",\n",
" \"A survey of user opinion of computer system response time\",\n",
" \"The EPS user interface management system\",\n",
" \"System and human system engineering testing of EPS\",\n",
" \"Relation of user perceived response time to error measurement\",\n",
" \"The generation of random binary unordered trees\",\n",
" \"The intersection graph of paths in trees\",\n",
" \"Graph minors IV Widths of trees and well quasi ordering\",\n",
" \"Graph minors A survey\",\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Cleaning the corpus\n",
"\n",
"# Create a set of frequent words\n",
"stoplist = set('for a of the and to in'.split(' '))\n",
"\n",
"# Lowercase each document, split it by white space and filter out stopwords\n",
"texts = [[word for word in document.lower().split() if word not in stoplist] for document in text_corpus]\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The list comprehension above looks dense, but it reads like:\n",
"- for every document in the list text_corpus:\n",
"- create a list of words by splitting the document\n",
"- and keep each word only if it is not in the stoplist\n",
"\n",
"So the result is a list of lists of words, one for each document."
]
},
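{
"cell_type": "markdown",
"metadata": {},
"source": [
"For illustration, here is a sketch of the same cleaning step written with explicit nested loops (the name `texts_loop` is not part of the original tutorial; it is only used to compare against `texts`):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# the same cleaning step as the comprehension above, written as nested for-loops\n",
"texts_loop = []\n",
"for doc in text_corpus:\n",
"    words = []\n",
"    for word in doc.lower().split():\n",
"        if word not in stoplist:\n",
"            words.append(word)\n",
"    texts_loop.append(words)\n",
"\n",
"# both approaches should produce the same list of lists\n",
"print(texts_loop == texts)"
]
},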
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[['human', 'interface', 'computer'],\n",
" ['survey', 'user', 'computer', 'system', 'response', 'time'],\n",
" ['eps', 'user', 'interface', 'system'],\n",
" ['system', 'human', 'system', 'eps'],\n",
" ['user', 'response', 'time'],\n",
" ['trees'],\n",
" ['graph', 'trees'],\n",
" ['graph', 'minors', 'trees'],\n",
" ['graph', 'minors', 'survey']]\n"
]
}
],
"source": [
"# Count word frequencies\n",
"\n",
"# we use defaultdict instead of a plain dict because it returns\n",
"# a default value (here 0) instead of raising a KeyError when a key is missing\n",
"from collections import defaultdict\n",
"\n",
"frequency = defaultdict(int)\n",
"for text in texts:\n",
"    for token in text:\n",
"        frequency[token] += 1\n",
"\n",
"# Only keep words that appear more than once\n",
"processed_corpus = [[token for token in text if frequency[token] > 1] for text in texts]\n",
"pprint.pprint(processed_corpus)"
]
},
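{
"cell_type": "markdown",
"metadata": {},
"source": [
"A tiny illustration of that point (not part of the upstream tutorial): a `defaultdict(int)` yields 0 for keys it has never seen, which is what makes the `+= 1` pattern above work without any membership checks."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from collections import defaultdict\n",
"\n",
"demo = defaultdict(int)\n",
"demo['seen'] += 1\n",
"print(demo['seen'])     # 1\n",
"print(demo['missing'])  # 0, where a plain dict would raise KeyError"
]
},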
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...>\n"
]
}
],
"source": [
"# To associate each word with a unique integer ID we use the Dictionary class provided by gensim.\n",
"# This dictionary defines the vocabulary of all words that our processing knows about.\n",
"\n",
"from gensim import corpora\n",
"\n",
"dictionary = corpora.Dictionary(processed_corpus)\n",
"print(dictionary)\n",
"# Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...>\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'computer': 0,\n",
" 'eps': 8,\n",
" 'graph': 10,\n",
" 'human': 1,\n",
" 'interface': 2,\n",
" 'minors': 11,\n",
" 'response': 3,\n",
" 'survey': 4,\n",
" 'system': 5,\n",
" 'time': 6,\n",
" 'trees': 9,\n",
" 'user': 7}\n"
]
}
],
"source": [
"# print the id for each word\n",
"pprint.pprint(dictionary.token2id)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(0, 1), (1, 1)]\n"
]
}
],
"source": [
"# create a bag-of-words vector for a new document based on our dictionary\n",
"new_doc = \"Human computer interaction\"\n",
"new_vec = dictionary.doc2bow(new_doc.lower().split())\n",
"print(new_vec)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The first entry in each tuple is the ID of the token in the dictionary, the second is the count of that token. Note that \"interaction\" does not appear in the dictionary built from the processed corpus, so it is simply left out of the vector."
]
},
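{
"cell_type": "markdown",
"metadata": {},
"source": [
"To make the vector easier to read, the ids can be mapped back to their tokens (a small illustrative snippet, not part of the upstream tutorial):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# translate (token_id, count) pairs back into (token, count) pairs\n",
"print([(dictionary[token_id], count) for token_id, count in new_vec])"
]
},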
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[(0, 1), (1, 1), (2, 1)],\n",
" [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],\n",
" [(2, 1), (5, 1), (7, 1), (8, 1)],\n",
" [(1, 1), (5, 2), (8, 1)],\n",
" [(3, 1), (6, 1), (7, 1)],\n",
" [(9, 1)],\n",
" [(9, 1), (10, 1)],\n",
" [(9, 1), (10, 1), (11, 1)],\n",
" [(4, 1), (10, 1), (11, 1)]]\n"
]
}
],
"source": [
"bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]\n",
"pprint.pprint(bow_corpus)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now we can use models, i.e. ways of transforming one document representation into another. One simple example is tf-idf. The tf-idf model transforms vectors from the bag-of-words representation to a vector space where the frequency counts are weighted according to the relative rarity of each word in the corpus (a manual check of the resulting weights follows the code below)."
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(5, 0.5898341626740045), (11, 0.8075244024440723)]\n"
]
}
],
"source": [
"from gensim import models\n",
"\n",
"# train the model\n",
"tfidf = models.TfidfModel(bow_corpus)\n",
"\n",
"# transform the 'system minors' string\n",
"words = \"system minors\".lower().split()\n",
"print(tfidf[dictionary.doc2bow(words)])"
]
},
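{
"cell_type": "markdown",
"metadata": {},
"source": [
"A manual sanity check of those two weights, assuming gensim's default TfidfModel settings (raw term frequency, idf = log2 of N/df, then L2 normalisation). This cell is only an illustration added to the tutorial; the variable names are arbitrary."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import math\n",
"\n",
"n_docs = len(bow_corpus)  # 9 documents\n",
"\n",
"# document frequencies: in how many documents does each word occur?\n",
"df_system = sum(1 for text in processed_corpus if 'system' in text)\n",
"df_minors = sum(1 for text in processed_corpus if 'minors' in text)\n",
"\n",
"# raw tf-idf weights (the query contains each word once, so tf = 1)\n",
"w_system = math.log2(n_docs / df_system)\n",
"w_minors = math.log2(n_docs / df_minors)\n",
"\n",
"# L2-normalise, which is what gensim does by default\n",
"norm = math.sqrt(w_system ** 2 + w_minors ** 2)\n",
"print(w_system / norm, w_minors / norm)  # should be close to the weights printed above"
]
},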
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can save trained models and load them back later, either to continue training or to transform new documents, so training does not have to happen all at once.\n"
]
},
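{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of persisting and reloading the model (the filename tfidf.model is just an example path, not part of the original tutorial):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# save the trained model to disk and load it back\n",
"tfidf.save('tfidf.model')\n",
"loaded_tfidf = models.TfidfModel.load('tfidf.model')\n",
"\n",
"# the reloaded model transforms bag-of-words vectors just like the original\n",
"print(loaded_tfidf[dictionary.doc2bow(\"system minors\".lower().split())])"
]
},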
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(0, 0.0), (1, 0.32448703), (2, 0.41707572), (3, 0.7184812), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 0.0)]\n"
]
}
],
"source": [
"from gensim import similarities\n",
"\n",
"# build a similarity index over the tf-idf corpus (num_features = number of unique tokens in the dictionary)\n",
"index = similarities.SparseMatrixSimilarity(tfidf[bow_corpus], num_features=12)\n",
"\n",
"# query: similarity of 'system engineering' against every document in the corpus\n",
"query_document = 'system engineering'.lower().split()\n",
"query_bow = dictionary.doc2bow(query_document)\n",
"sims = index[tfidf[query_bow]]\n",
"print(list(enumerate(sims)))"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"3 0.7184812\n",
"2 0.41707572\n",
"1 0.32448703\n",
"0 0.0\n",
"4 0.0\n",
"5 0.0\n",
"6 0.0\n",
"7 0.0\n",
"8 0.0\n"
]
}
],
"source": [
"# sorting the similarities by score\n",
"for document_number, score in sorted(enumerate(sims), key=lambda x: x[1], reverse=True):\n",
"    print(document_number, score)"
]
},
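{
"cell_type": "markdown",
"metadata": {},
"source": [
"To make the ranking easier to read, the same loop can print the original documents next to their scores (a small addition, not in the upstream tutorial):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# print each similarity score next to the original document text\n",
"for document_number, score in sorted(enumerate(sims), key=lambda x: x[1], reverse=True):\n",
"    print(f'{score:.4f}  {text_corpus[document_number]}')"
]
},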
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"interpreter": {
"hash": "a991b7e5a58af45663279ce1606e861d35361e78ec04a120e3cc987f7e474d97"
},
"kernelspec": {
"display_name": "Python 3.10.2 ('venv': venv)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.2"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
@@ -1,29 +0,0 @@
# https://radimrehurek.com/gensim/auto_examples/core/run_core_concepts.html#sphx-glr-auto-examples-core-run-core-concepts-py

import pprint

# Document: some text.
# Corpus: a collection of documents.
# Vector: a mathematically convenient representation of a document.
# Model: an algorithm for transforming vectors from one representation to another.

document = 'Lorem ipsum dolor sit amet eheh 123 gelato'

text_corpus = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

# Cleaning the corpus

# Create a set of frequent words
stoplist = set('for a of the and to in'.split(' '))

# Lowercase each document, split it by white space and filter out stopwords