glueberry/w2v.py

# Importing the modules

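# One-time setup: uncomment the two lines below to download the NLTK 'punkt'
# tokenizer data used by sent_tokenize / word_tokenize.
# import nltk
# nltk.download('punkt')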

from nltk.tokenize import sent_tokenize, word_tokenize
import warnings

warnings.filterwarnings(action='ignore')

import gensim
from gensim.models import Word2Vec


# Read the corpus. The "-0.txt" suffix is Project Gutenberg's naming for a
# UTF-8 file, so decode explicitly instead of relying on the platform default
# encoding. 'utf-8-sig' also drops a leading byte-order mark if one is present,
# so it cannot end up glued to the first token.
with open('11-0.txt', 'r', encoding='utf-8-sig') as f:
    text = f.read()

# Replace newlines with spaces so hard-wrapped lines do not interrupt sentences.
# A separate name is used rather than rebinding `f`, the (closed) file handle.
flat_text = text.replace('\n', ' ')

# Training corpus: one list of lower-cased tokens per sentence.
data = [
    [token.lower() for token in word_tokenize(sentence)]
    for sentence in sent_tokenize(flat_text)
]

# Train the CBOW model (no `sg` argument, so gensim's default architecture).
# min_count=1 keeps every token in the vocabulary, however rare.
model_cbow = gensim.models.Word2Vec(data, min_count=1, vector_size=100, window=5)

# Report cosine similarities. Each label names exactly the word pair that is
# queried (the second label previously said 'machine' while querying 'machines').
# NOTE: similarity() raises KeyError if either word never occurs in `data`.
print(f"Cosine similarity between 'alice' and 'wonderland' - CBOW : {model_cbow.wv.similarity('alice', 'wonderland')}")
print(f"Cosine similarity between 'alice' and 'machines' - CBOW : {model_cbow.wv.similarity('alice', 'machines')}")
     

# Train the Skip-gram model: same hyperparameters as the CBOW run, but sg=1
# selects the skip-gram architecture.
model_sg = gensim.models.Word2Vec(data, min_count=1, vector_size=100, window=5, sg=1)

# Report cosine similarities from the Skip-gram vectors. The labels say
# "Skip Gram" (they previously said "CBOW") and name the exact words queried.
# NOTE: similarity() raises KeyError if either word never occurs in `data`.
print(f"Cosine similarity between 'alice' and 'wonderland' - Skip Gram : {model_sg.wv.similarity('alice', 'wonderland')}")
print(f"Cosine similarity between 'alice' and 'machines' - Skip Gram : {model_sg.wv.similarity('alice', 'machines')}")
first tests 3 years ago			`# Importing the modules`

			`# import nltk`
			`# nltk.download('punkt')`

			`from nltk.tokenize import sent_tokenize, word_tokenize`
			`import warnings`

			`warnings.filterwarnings(action='ignore')`

			`import gensim`
			`from gensim.models import Word2Vec`


			`# read the 11-0.txt file`
			`with open('11-0.txt', 'r') as f:`
			`text = f.read()`

			`# replace escape char with space`
			`f = text.replace('\n', ' ')`

			`data = []`

			`# iterate through each sentence in the file`
			`for i in sent_tokenize(f):`
			`temp = []`

			`for j in word_tokenize(i):`
			`temp.append(j.lower())`

			`data.append(temp)`

			`# create the CBOW model`
			`model_cbow = gensim.models.Word2Vec(data, min_count=1, vector_size=100, window=5)`

			`# print the results`

			`# Print results`
			`print(f"Cosine similarity between 'alice' and 'wonderland' - CBOW : {model_cbow.wv.similarity('alice', 'wonderland')}")`
			`print(f"Cosine similarity between 'alice' and 'machine' - CBOW : {model_cbow.wv.similarity('alice', 'machines')}")`



			`# create the Skip Gram model`
			`model_sg = gensim.models.Word2Vec(data, min_count=1, vector_size=100, window=5, sg=1)`

			`print(f"Cosine similarity between 'alice' and 'wonderland' - CBOW : {model_sg.wv.similarity('alice', 'wonderland')}")`
			`print(f"Cosine similarity between 'alice' and 'machine' - CBOW : {model_sg.wv.similarity('alice', 'machines')}")`