first tests

master
km0 2 years ago
commit 9b34554467

BIN
.DS_Store vendored

Binary file not shown.

1
.gitignore vendored

@ -0,0 +1 @@
venv/

3761
11-0.txt

File diff suppressed because it is too large. (Load Diff)

@ -0,0 +1,29 @@
# https://radimrehurek.com/gensim/auto_examples/core/run_core_concepts.html#sphx-glr-auto-examples-core-run-core-concepts-py
import pprint
# Gensim core concepts:
# Document: some text.
# Corpus: a collection of documents.
# Vector: a mathematically convenient representation of a document.
# Model: an algorithm for transforming vectors from one representation to another.

# A single example document (arbitrary demo text).
document = 'Lorem ipsum dolor sit amet eheh 123 gelato'

# A tiny nine-document corpus, one short document per string.
text_corpus = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

# Cleaning the corpus: a set of frequent words to treat as stopwords.
stoplist = {'for', 'a', 'of', 'the', 'and', 'to', 'in'}
# Lowercase each document, split it by white space and filter out stopwords

@ -0,0 +1,49 @@
# Train CBOW and Skip-Gram Word2Vec models on a plain-text corpus
# (11-0.txt) and report cosine similarities between word pairs.
# import nltk
# nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize
import warnings
warnings.filterwarnings(action='ignore')
import gensim
from gensim.models import Word2Vec

# Read the corpus file; explicit UTF-8 so the text decodes the same way
# on every platform instead of depending on the locale default.
with open('11-0.txt', 'r', encoding='utf-8') as infile:
    text = infile.read()
# Replace newlines with spaces so sentences broken across lines re-join.
# (Kept bound to the name `f` for compatibility with later code.)
f = text.replace('\n', ' ')

# Tokenize into a list of sentences, each a list of lowercase word tokens.
data = []
for sentence in sent_tokenize(f):
    data.append([token.lower() for token in word_tokenize(sentence)])

# CBOW model (gensim's default training algorithm, sg=0).
model_cbow = gensim.models.Word2Vec(data, min_count=1, vector_size=100, window=5)
# Bug fix: the second label said 'machine' but the code queries 'machines'.
print(f"Cosine similarity between 'alice' and 'wonderland' - CBOW : {model_cbow.wv.similarity('alice', 'wonderland')}")
print(f"Cosine similarity between 'alice' and 'machines' - CBOW : {model_cbow.wv.similarity('alice', 'machines')}")

# Skip Gram model (sg=1).
model_sg = gensim.models.Word2Vec(data, min_count=1, vector_size=100, window=5, sg=1)
# Bug fix: these labels said "CBOW" even though they report Skip Gram results.
print(f"Cosine similarity between 'alice' and 'wonderland' - Skip Gram : {model_sg.wv.similarity('alice', 'wonderland')}")
print(f"Cosine similarity between 'alice' and 'machines' - Skip Gram : {model_sg.wv.similarity('alice', 'machines')}")
Loading…
Cancel
Save