first tests
commit 9b34554467
@@ -0,0 +1 @@
venv/
@@ -0,0 +1,29 @@
# https://radimrehurek.com/gensim/auto_examples/core/run_core_concepts.html#sphx-glr-auto-examples-core-run-core-concepts-py

import pprint

# Document: some text.
# Corpus: a collection of documents.
# Vector: a mathematically convenient representation of a document.
# Model: an algorithm for transforming vectors from one representation to another.

document = 'Lorem ipsum dolor sit amet eheh 123 gelato'

text_corpus = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

# Cleaning the corpus

# Create a set of frequent words
stoplist = set('for a of the and to in'.split(' '))

# Lowercase each document, split it by white space and filter out stopwords
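The hunk ends at the comment above without the code it announces. A minimal sketch of how that step could continue, following the gensim core-concepts tutorial linked at the top of the file (the `texts` name is an assumption, not part of this commit):

# Lowercase each document, split it by white space and filter out stopwords
# (assumed continuation, per the linked gensim tutorial)
texts = [
    [word for word in document.lower().split() if word not in stoplist]
    for document in text_corpus
]
pprint.pprint(texts)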
@@ -0,0 +1,49 @@
# Importing the modules

# import nltk
# nltk.download('punkt')

from nltk.tokenize import sent_tokenize, word_tokenize
import warnings

warnings.filterwarnings(action='ignore')

import gensim
from gensim.models import Word2Vec


# read the 11-0.txt file
with open('11-0.txt', 'r') as f:
    text = f.read()

# replace newline characters with spaces
text = text.replace('\n', ' ')

data = []

# iterate through each sentence in the file
for i in sent_tokenize(text):
    temp = []

    # tokenize the sentence into lowercase words
    for j in word_tokenize(i):
        temp.append(j.lower())

    data.append(temp)

# create the CBOW model
model_cbow = gensim.models.Word2Vec(data, min_count=1, vector_size=100, window=5)

# print the results
print(f"Cosine similarity between 'alice' and 'wonderland' - CBOW : {model_cbow.wv.similarity('alice', 'wonderland')}")
print(f"Cosine similarity between 'alice' and 'machines' - CBOW : {model_cbow.wv.similarity('alice', 'machines')}")


# create the Skip Gram model
model_sg = gensim.models.Word2Vec(data, min_count=1, vector_size=100, window=5, sg=1)

print(f"Cosine similarity between 'alice' and 'wonderland' - Skip Gram : {model_sg.wv.similarity('alice', 'wonderland')}")
print(f"Cosine similarity between 'alice' and 'machines' - Skip Gram : {model_sg.wv.similarity('alice', 'machines')}")
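Beyond pairwise similarities, the trained vectors can also be queried for nearest neighbours. A small usage sketch (not part of the commit) using gensim's most_similar:

# five nearest neighbours of 'alice' under each model
print(model_cbow.wv.most_similar('alice', topn=5))
print(model_sg.wv.most_similar('alice', topn=5))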