first tests
commit 9b34554467
@@ -0,0 +1 @@
venv/
@@ -0,0 +1,29 @@
# https://radimrehurek.com/gensim/auto_examples/core/run_core_concepts.html#sphx-glr-auto-examples-core-run-core-concepts-py

import pprint

# Document: some text.
# Corpus: a collection of documents.
# Vector: a mathematically convenient representation of a document.
# Model: an algorithm for transforming vectors from one representation to another.

document = 'Lorem ipsum dolor sit amet eheh 123 gelato'

text_corpus = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

# Cleaning the corpus

# Create a set of frequent words
stoplist = set('for a of the and to in'.split(' '))

# Lowercase each document, split it by white space and filter out stopwords
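The hunk ends before that comment is implemented. A minimal sketch of the step, following the linked gensim tutorial (the `texts` name comes from the tutorial, not from this commit):

texts = [
    [word for word in document.lower().split() if word not in stoplist]
    for document in text_corpus
]
pprint.pprint(texts)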
@@ -0,0 +1,49 @@
# Importing the modules

# import nltk
# nltk.download('punkt')

from nltk.tokenize import sent_tokenize, word_tokenize
import warnings

warnings.filterwarnings(action='ignore')

import gensim
from gensim.models import Word2Vec

# read the 11-0.txt file (Project Gutenberg's Alice's Adventures in Wonderland)
with open('11-0.txt', 'r') as f:
    text = f.read()

# replace newlines with spaces; reuse `text` rather than shadowing
# the closed file handle `f`
text = text.replace('\n', ' ')

data = []

# iterate through each sentence in the file, collecting lowercased word tokens
for sentence in sent_tokenize(text):
    temp = []
    for word in word_tokenize(sentence):
        temp.append(word.lower())
    data.append(temp)

# create the CBOW model
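# (min_count=1 keeps every token, vector_size=100 sets the embedding
# dimensionality, window=5 is the context width; sg is left at its
# default of 0, which selects CBOW)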
model_cbow = gensim.models.Word2Vec(data, min_count=1, vector_size=100, window=5)

# print the results
print(f"Cosine similarity between 'alice' and 'wonderland' - CBOW : {model_cbow.wv.similarity('alice', 'wonderland')}")
print(f"Cosine similarity between 'alice' and 'machines' - CBOW : {model_cbow.wv.similarity('alice', 'machines')}")


# create the Skip Gram model
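# (sg=1 switches gensim's Word2Vec from the default CBOW objective to skip-gram)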
model_sg = gensim.models.Word2Vec(data, min_count=1, vector_size=100, window=5, sg=1)

print(f"Cosine similarity between 'alice' and 'wonderland' - Skip-Gram : {model_sg.wv.similarity('alice', 'wonderland')}")
print(f"Cosine similarity between 'alice' and 'machines' - Skip-Gram : {model_sg.wv.similarity('alice', 'machines')}")