first tests

master
km0 2 years ago
commit 9b34554467

BIN
.DS_Store vendored

Binary file not shown.

1
.gitignore vendored

@ -0,0 +1 @@
venv/

3761
11-0.txt

File diff suppressed because it is too large. (Load Diff)

@ -0,0 +1,29 @@
# https://radimrehurek.com/gensim/auto_examples/core/run_core_concepts.html#sphx-glr-auto-examples-core-run-core-concepts-py
import pprint
# Gensim core concepts:
# Document: some text.
# Corpus: a collection of documents.
# Vector: a mathematically convenient representation of a document.
# Model: an algorithm for transforming vectors from one representation to another.

# A single example document (arbitrary demo text).
document = 'Lorem ipsum dolor sit amet eheh 123 gelato'

# A tiny nine-document corpus, one short document per string.
text_corpus = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

# Cleaning the corpus: a set of frequent words to treat as stopwords.
stoplist = {'for', 'a', 'of', 'the', 'and', 'to', 'in'}
# Lowercase each document, split it by white space and filter out stopwords

@ -0,0 +1,49 @@
# Train CBOW and Skip-Gram Word2Vec models on a plain-text corpus
# (11-0.txt) and report cosine similarities between word pairs.
# import nltk
# nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize
import warnings
warnings.filterwarnings(action='ignore')
import gensim
from gensim.models import Word2Vec

# Read the corpus file; explicit UTF-8 so the text decodes the same way
# on every platform instead of depending on the locale default.
with open('11-0.txt', 'r', encoding='utf-8') as infile:
    text = infile.read()
# Replace newlines with spaces so sentences broken across lines re-join.
# (Kept bound to the name `f` for compatibility with later code.)
f = text.replace('\n', ' ')

# Tokenize into a list of sentences, each a list of lowercase word tokens.
data = []
for sentence in sent_tokenize(f):
    data.append([token.lower() for token in word_tokenize(sentence)])

# CBOW model (gensim's default training algorithm, sg=0).
model_cbow = gensim.models.Word2Vec(data, min_count=1, vector_size=100, window=5)
# Bug fix: the second label said 'machine' but the code queries 'machines'.
print(f"Cosine similarity between 'alice' and 'wonderland' - CBOW : {model_cbow.wv.similarity('alice', 'wonderland')}")
print(f"Cosine similarity between 'alice' and 'machines' - CBOW : {model_cbow.wv.similarity('alice', 'machines')}")

# Skip Gram model (sg=1).
model_sg = gensim.models.Word2Vec(data, min_count=1, vector_size=100, window=5, sg=1)
# Bug fix: these labels said "CBOW" even though they report Skip Gram results.
print(f"Cosine similarity between 'alice' and 'wonderland' - Skip Gram : {model_sg.wv.similarity('alice', 'wonderland')}")
print(f"Cosine similarity between 'alice' and 'machines' - Skip Gram : {model_sg.wv.similarity('alice', 'machines')}")
Loading…
Cancel
Save