You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
49 lines
1.3 KiB
Python
# Imports: NLTK for tokenization, gensim for the Word2Vec models.
#
# One-time setup (uncomment and run once to fetch the sentence tokenizer data):
# import nltk
# nltk.download('punkt')

import warnings

import gensim
from gensim.models import Word2Vec
from nltk.tokenize import sent_tokenize, word_tokenize

# gensim emits noisy deprecation warnings on import/training;
# silence them for this demo script.
warnings.filterwarnings(action='ignore')
# Read the corpus file 11-0.txt (presumably Project Gutenberg's
# "Alice's Adventures in Wonderland" — the plain-text file is UTF-8,
# so decode it explicitly).
with open('11-0.txt', 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# Replace newlines with spaces so sentence tokenization sees one
# continuous stream of text. (The original stored this back into `f`,
# shadowing the closed file handle — renamed for clarity.)
cleaned = text.replace('\n', ' ')

# Tokenize into sentences, then lowercase each word: a list of word
# lists, which is the input format gensim's Word2Vec expects.
data = [
    [word.lower() for word in word_tokenize(sentence)]
    for sentence in sent_tokenize(cleaned)
]
# Create the CBOW model (CBOW is gensim's default, sg=0).
# min_count=1 keeps every token, including ones seen only once.
model_cbow = gensim.models.Word2Vec(
    data, min_count=1, vector_size=100, window=5)

# Print the CBOW results.
# NOTE: the second query uses the token 'machines' (plural); the label
# now matches the word actually queried (the original label said
# 'machine' while the code looked up 'machines').
print(f"Cosine similarity between 'alice' and 'wonderland' - CBOW : "
      f"{model_cbow.wv.similarity('alice', 'wonderland')}")
print(f"Cosine similarity between 'alice' and 'machines' - CBOW : "
      f"{model_cbow.wv.similarity('alice', 'machines')}")
# Create the Skip Gram model (sg=1 selects skip-gram training).
model_sg = gensim.models.Word2Vec(
    data, min_count=1, vector_size=100, window=5, sg=1)

# Print the Skip Gram results.
# Fixed copy-paste bug: the original labels said "CBOW" even though
# these similarities come from the skip-gram model; also the second
# label now matches the queried token 'machines'.
print(f"Cosine similarity between 'alice' and 'wonderland' - Skip Gram : "
      f"{model_sg.wv.similarity('alice', 'wonderland')}")
print(f"Cosine similarity between 'alice' and 'machines' - Skip Gram : "
      f"{model_sg.wv.similarity('alice', 'machines')}")