You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
49 lines
1.3 KiB
Python
# Imports: NLTK for tokenization, gensim for the Word2Vec models.
#
# One-time setup (uncomment and run once to fetch the sentence tokenizer data):
# import nltk
# nltk.download('punkt')

import warnings

import gensim
from gensim.models import Word2Vec
from nltk.tokenize import sent_tokenize, word_tokenize

# gensim emits noisy deprecation warnings on import/training;
# silence them for this demo script.
warnings.filterwarnings(action='ignore')
# Read the corpus file 11-0.txt (presumably Project Gutenberg's
# "Alice's Adventures in Wonderland" — the plain-text file is UTF-8,
# so decode it explicitly).
with open('11-0.txt', 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# Replace newlines with spaces so sentence tokenization sees one
# continuous stream of text. (The original stored this back into `f`,
# shadowing the closed file handle — renamed for clarity.)
cleaned = text.replace('\n', ' ')

# Tokenize into sentences, then lowercase each word: a list of word
# lists, which is the input format gensim's Word2Vec expects.
data = [
    [word.lower() for word in word_tokenize(sentence)]
    for sentence in sent_tokenize(cleaned)
]
# Create the CBOW model (CBOW is gensim's default, sg=0).
# min_count=1 keeps every token, including ones seen only once.
model_cbow = gensim.models.Word2Vec(
    data, min_count=1, vector_size=100, window=5)

# Print the CBOW results.
# NOTE: the second query uses the token 'machines' (plural); the label
# now matches the word actually queried (the original label said
# 'machine' while the code looked up 'machines').
print(f"Cosine similarity between 'alice' and 'wonderland' - CBOW : "
      f"{model_cbow.wv.similarity('alice', 'wonderland')}")
print(f"Cosine similarity between 'alice' and 'machines' - CBOW : "
      f"{model_cbow.wv.similarity('alice', 'machines')}")
# Create the Skip Gram model (sg=1 selects skip-gram training).
model_sg = gensim.models.Word2Vec(
    data, min_count=1, vector_size=100, window=5, sg=1)

# Print the Skip Gram results.
# Fixed copy-paste bug: the original labels said "CBOW" even though
# these similarities come from the skip-gram model; also the second
# label now matches the queried token 'machines'.
print(f"Cosine similarity between 'alice' and 'wonderland' - Skip Gram : "
      f"{model_sg.wv.similarity('alice', 'wonderland')}")
print(f"Cosine similarity between 'alice' and 'machines' - Skip Gram : "
      f"{model_sg.wv.similarity('alice', 'machines')}")