first tests
commit 9b34554467
@@ -0,0 +1 @@
venv/
@@ -0,0 +1,29 @@
# https://radimrehurek.com/gensim/auto_examples/core/run_core_concepts.html#sphx-glr-auto-examples-core-run-core-concepts-py

import pprint

# Document: some text.
# Corpus: a collection of documents.
# Vector: a mathematically convenient representation of a document.
# Model: an algorithm for transforming vectors from one representation to another.

document = 'Lorem ipsum dolor sit amet eheh 123 gelato'

text_corpus = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

# Cleaning the corpus

# Create a set of frequent words
stoplist = set('for a of the and to in'.split(' '))

# Lowercase each document, split it by white space and filter out stopwords
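The hunk ends before that comment is implemented. A minimal sketch of the step, following the linked gensim tutorial (the `texts` name comes from the tutorial, not from this commit):

texts = [
    [word for word in document.lower().split() if word not in stoplist]
    for document in text_corpus
]
pprint.pprint(texts)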
@@ -0,0 +1,49 @@
# Importing the modules

# import nltk
# nltk.download('punkt')

from nltk.tokenize import sent_tokenize, word_tokenize
import warnings

warnings.filterwarnings(action='ignore')

import gensim
from gensim.models import Word2Vec

# read the 11-0.txt file (Project Gutenberg's Alice's Adventures in Wonderland)
with open('11-0.txt', 'r') as f:
    text = f.read()

# replace newlines with spaces; reuse `text` rather than shadowing
# the closed file handle `f`
text = text.replace('\n', ' ')

data = []

# iterate through each sentence in the file, collecting lowercased word tokens
for sentence in sent_tokenize(text):
    temp = []
    for word in word_tokenize(sentence):
        temp.append(word.lower())
    data.append(temp)

# create the CBOW model
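# (min_count=1 keeps every token, vector_size=100 sets the embedding
# dimensionality, window=5 is the context width; sg is left at its
# default of 0, which selects CBOW)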
model_cbow = gensim.models.Word2Vec(data, min_count=1, vector_size=100, window=5)

# print the results
print(f"Cosine similarity between 'alice' and 'wonderland' - CBOW : {model_cbow.wv.similarity('alice', 'wonderland')}")
print(f"Cosine similarity between 'alice' and 'machines' - CBOW : {model_cbow.wv.similarity('alice', 'machines')}")


# create the Skip Gram model
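# (sg=1 switches gensim's Word2Vec from the default CBOW objective to skip-gram)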
model_sg = gensim.models.Word2Vec(data, min_count=1, vector_size=100, window=5, sg=1)

print(f"Cosine similarity between 'alice' and 'wonderland' - Skip-Gram : {model_sg.wv.similarity('alice', 'wonderland')}")
print(f"Cosine similarity between 'alice' and 'machines' - Skip-Gram : {model_sg.wv.similarity('alice', 'machines')}")