# Importing the modules
# (uncomment the two lines below on the first run to download NLTK's 'punkt' tokenizer data)
# import nltk
# nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize
import warnings
warnings.filterwarnings(action='ignore')
import gensim
from gensim.models import Word2Vec
# Read the 11-0.txt file (Project Gutenberg's plain-text "Alice's Adventures in Wonderland")
with open('11-0.txt', 'r') as f:
    text = f.read()

# Replace newline characters with spaces so sentences are not broken across lines
text = text.replace('\n', ' ')

data = []
# Iterate through each sentence in the file, collecting one list of
# lowercase word tokens per sentence
for sentence in sent_tokenize(text):
    temp = []
    for word in word_tokenize(sentence):
        temp.append(word.lower())
    data.append(temp)
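# A quick sanity check of the tokenized corpus (an illustrative inspection step,
# not part of the original script): `data` should be a list of sentences, each
# a list of lowercase tokens, which is the input format gensim's Word2Vec expects.
print(f"Number of sentences: {len(data)}")
print(f"First tokenized sentence: {data[0]}")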
# Create the CBOW (Continuous Bag of Words) model; sg=0, i.e. CBOW, is gensim's default
model_cbow = Word2Vec(data, min_count=1, vector_size=100, window=5)

# Print the results
print(f"Cosine similarity between 'alice' and 'wonderland' - CBOW : {model_cbow.wv.similarity('alice', 'wonderland')}")
print(f"Cosine similarity between 'alice' and 'machines' - CBOW : {model_cbow.wv.similarity('alice', 'machines')}")
# Create the Skip Gram model (sg=1 selects the skip-gram architecture)
model_sg = Word2Vec(data, min_count=1, vector_size=100, window=5, sg=1)
print(f"Cosine similarity between 'alice' and 'wonderland' - Skip Gram : {model_sg.wv.similarity('alice', 'wonderland')}")
print(f"Cosine similarity between 'alice' and 'machines' - Skip Gram : {model_sg.wv.similarity('alice', 'machines')}")