Word2Vec Tutorial

Gensim Tutorial

In [1]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
In [2]:
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')
2022-05-30 14:30:49,470 : INFO : loading projection weights from C:\Users\Francesco/gensim-data\word2vec-google-news-300\word2vec-google-news-300.gz
2022-05-30 14:31:31,470 : INFO : KeyedVectors lifecycle event {'msg': 'loaded (3000000, 300) matrix of type float32 from C:\\Users\\Francesco/gensim-data\\word2vec-google-news-300\\word2vec-google-news-300.gz', 'binary': True, 'encoding': 'utf8', 'datetime': '2022-05-30T14:31:31.470006', 'gensim': '4.2.0', 'python': '3.10.2 (tags/v3.10.2:a58ebcc, Jan 17 2022, 14:12:15) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'load_word2vec_format'}
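
The full matrix is 3,000,000 x 300 floats, so the load above takes a while and several gigabytes of RAM. If that is a problem, KeyedVectors.load_word2vec_format accepts a limit parameter to read only the most frequent vectors; a minimal sketch (return_path=True asks the downloader for the local file path instead of loading the model):

from gensim.models import KeyedVectors

# download (or just locate) the file, then load only the 500k most frequent vectors
path = api.load('word2vec-google-news-300', return_path=True)
wv_small = KeyedVectors.load_word2vec_format(path, binary=True, limit=500000)
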
In [4]:
for index, word in enumerate(wv.index_to_key):
    if index == 100:
        break
    print(f'word {index} / {len(wv.index_to_key)} is {word}')
word 0 / 3000000 is </s>
word 1 / 3000000 is in
word 2 / 3000000 is for
word 3 / 3000000 is that
word 4 / 3000000 is is
word 5 / 3000000 is on
word 6 / 3000000 is ##
word 7 / 3000000 is The
word 8 / 3000000 is with
word 9 / 3000000 is said
word 10 / 3000000 is was
word 11 / 3000000 is the
word 12 / 3000000 is at
word 13 / 3000000 is not
word 14 / 3000000 is as
word 15 / 3000000 is it
word 16 / 3000000 is be
word 17 / 3000000 is from
word 18 / 3000000 is by
word 19 / 3000000 is are
word 20 / 3000000 is I
word 21 / 3000000 is have
word 22 / 3000000 is he
word 23 / 3000000 is will
word 24 / 3000000 is has
word 25 / 3000000 is ####
word 26 / 3000000 is his
word 27 / 3000000 is an
word 28 / 3000000 is this
word 29 / 3000000 is or
word 30 / 3000000 is their
word 31 / 3000000 is who
word 32 / 3000000 is they
word 33 / 3000000 is but
word 34 / 3000000 is $
word 35 / 3000000 is had
word 36 / 3000000 is year
word 37 / 3000000 is were
word 38 / 3000000 is we
word 39 / 3000000 is more
word 40 / 3000000 is ###
word 41 / 3000000 is up
word 42 / 3000000 is been
word 43 / 3000000 is you
word 44 / 3000000 is its
word 45 / 3000000 is one
word 46 / 3000000 is about
word 47 / 3000000 is would
word 48 / 3000000 is which
word 49 / 3000000 is out
word 50 / 3000000 is can
word 51 / 3000000 is It
word 52 / 3000000 is all
word 53 / 3000000 is also
word 54 / 3000000 is two
word 55 / 3000000 is after
word 56 / 3000000 is first
word 57 / 3000000 is He
word 58 / 3000000 is do
word 59 / 3000000 is time
word 60 / 3000000 is than
word 61 / 3000000 is when
word 62 / 3000000 is We
word 63 / 3000000 is over
word 64 / 3000000 is last
word 65 / 3000000 is new
word 66 / 3000000 is other
word 67 / 3000000 is her
word 68 / 3000000 is people
word 69 / 3000000 is into
word 70 / 3000000 is In
word 71 / 3000000 is our
word 72 / 3000000 is there
word 73 / 3000000 is A
word 74 / 3000000 is she
word 75 / 3000000 is could
word 76 / 3000000 is just
word 77 / 3000000 is years
word 78 / 3000000 is some
word 79 / 3000000 is U.S.
word 80 / 3000000 is three
word 81 / 3000000 is million
word 82 / 3000000 is them
word 83 / 3000000 is what
word 84 / 3000000 is But
word 85 / 3000000 is so
word 86 / 3000000 is no
word 87 / 3000000 is like
word 88 / 3000000 is if
word 89 / 3000000 is only
word 90 / 3000000 is percent
word 91 / 3000000 is get
word 92 / 3000000 is did
word 93 / 3000000 is him
word 94 / 3000000 is game
word 95 / 3000000 is back
word 96 / 3000000 is because
word 97 / 3000000 is now
word 98 / 3000000 is #.#
word 99 / 3000000 is before
In [8]:
vec_queen = wv['queen']
In [9]:
try:
    vec_cameroon = wv['cameroon']
except KeyError:
    print("The word 'cameroon' does not appear in this model")
The word 'cameroon' does not appear in this model
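
Instead of catching the KeyError, you can test membership first; KeyedVectors supports the in operator. A minimal sketch:

if 'cameroon' in wv:
    vec_cameroon = wv['cameroon']
else:
    print("The word 'cameroon' does not appear in this model")
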
In [11]:
pairs = [
    ('car', 'minivan'),
    ('car', 'bicycle'),
    ('car', 'airplane'),
    ('car', 'cereal'),
    ('car', 'communism')
]

for w1, w2 in pairs:
    print('%r\t%r\t%.2f' % (w1, w2, wv.similarity(w1, w2)))
'car'	'minivan'	0.69
'car'	'bicycle'	0.54
'car'	'airplane'	0.42
'car'	'cereal'	0.14
'car'	'communism'	0.06
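
similarity returns the cosine of the angle between the two word vectors. A sketch that reproduces the first number by hand with numpy:

import numpy as np

v1, v2 = wv['car'], wv['minivan']
cos = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
print('%.2f' % cos)  # should match wv.similarity('car', 'minivan')
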
In [16]:
print(wv.most_similar(positive=['car', 'bike'], topn=5))
[('bicycle', 0.7798016667366028), ('scooter', 0.7793240547180176), ('motorcycle', 0.7453441023826599), ('bikes', 0.7298908233642578), ('vehicle', 0.6923801302909851)]
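
most_similar also takes a negative list, which enables the classic analogy queries; a sketch (not run in this session):

# king - man + woman ~ queen
print(wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=3))
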
In [23]:
print(wv.doesnt_match(
    ['fire', 'land', 'water', 'sea', 'air', 'jar']))
jar
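
doesnt_match works roughly by unit-normalizing the vectors, averaging them, and returning the word least similar to that mean. A sketch under that assumption:

import numpy as np

words = ['fire', 'land', 'water', 'sea', 'air', 'jar']
vecs = np.array([wv[w] for w in words])
vecs /= np.linalg.norm(vecs, axis=1, keepdims=True)  # unit-normalize each vector
mean = vecs.mean(axis=0)
print(words[int(np.argmin(vecs @ mean))])  # expected: 'jar'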

Train your own model

In [24]:
from gensim.test.utils import datapath
from gensim import utils

class MyCorpus:
    """An iterator that yields sentences (lists of str)."""

    def __iter__(self):
        corpus_path = datapath('lee_background.cor')
        with open(corpus_path) as fin:
            for line in fin:
                # assume there's one document per line, tokens separated by whitespace
                yield utils.simple_preprocess(line)
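
The iterator streams the corpus one line at a time, so the whole file is never loaded into memory at once. To train on your own data, point the same pattern at a different file (my_corpus.txt below is a hypothetical path):

class MyOwnCorpus:
    """Same streaming idea, over an arbitrary one-document-per-line text file."""

    def __init__(self, path):
        self.path = path  # e.g. 'my_corpus.txt' (hypothetical)

    def __iter__(self):
        with open(self.path, encoding='utf8') as fin:
            for line in fin:
                yield utils.simple_preprocess(line)
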
In [25]:
import gensim.models

sentences = MyCorpus()
model = gensim.models.Word2Vec(sentences=sentences)
2022-05-30 14:47:56,513 : INFO : collecting all words and their counts
2022-05-30 14:47:56,515 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2022-05-30 14:47:56,584 : INFO : collected 6981 word types from a corpus of 58152 raw words and 300 sentences
2022-05-30 14:47:56,585 : INFO : Creating a fresh vocabulary
2022-05-30 14:47:56,590 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 1750 unique words (25.07% of original 6981, drops 5231)', 'datetime': '2022-05-30T14:47:56.590672', 'gensim': '4.2.0', 'python': '3.10.2 (tags/v3.10.2:a58ebcc, Jan 17 2022, 14:12:15) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'prepare_vocab'}
2022-05-30 14:47:56,591 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 leaves 49335 word corpus (84.84% of original 58152, drops 8817)', 'datetime': '2022-05-30T14:47:56.591685', 'gensim': '4.2.0', 'python': '3.10.2 (tags/v3.10.2:a58ebcc, Jan 17 2022, 14:12:15) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'prepare_vocab'}
2022-05-30 14:47:56,599 : INFO : deleting the raw counts dictionary of 6981 items
2022-05-30 14:47:56,602 : INFO : sample=0.001 downsamples 51 most-common words
2022-05-30 14:47:56,602 : INFO : Word2Vec lifecycle event {'msg': 'downsampling leaves estimated 35935.33721568072 word corpus (72.8%% of prior 49335)', 'datetime': '2022-05-30T14:47:56.602641', 'gensim': '4.2.0', 'python': '3.10.2 (tags/v3.10.2:a58ebcc, Jan 17 2022, 14:12:15) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'prepare_vocab'}
2022-05-30 14:47:56,619 : INFO : estimated required memory for 1750 words and 100 dimensions: 2275000 bytes
2022-05-30 14:47:56,621 : INFO : resetting layer weights
2022-05-30 14:47:56,623 : INFO : Word2Vec lifecycle event {'update': False, 'trim_rule': 'None', 'datetime': '2022-05-30T14:47:56.623586', 'gensim': '4.2.0', 'python': '3.10.2 (tags/v3.10.2:a58ebcc, Jan 17 2022, 14:12:15) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'build_vocab'}
2022-05-30 14:47:56,624 : INFO : Word2Vec lifecycle event {'msg': 'training model with 3 workers on 1750 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2022-05-30T14:47:56.624583', 'gensim': '4.2.0', 'python': '3.10.2 (tags/v3.10.2:a58ebcc, Jan 17 2022, 14:12:15) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'train'}
2022-05-30 14:47:56,706 : INFO : EPOCH 0: training on 58152 raw words (35868 effective words) took 0.1s, 453792 effective words/s
2022-05-30 14:47:56,780 : INFO : EPOCH 1: training on 58152 raw words (35915 effective words) took 0.1s, 495146 effective words/s
2022-05-30 14:47:56,860 : INFO : EPOCH 2: training on 58152 raw words (35942 effective words) took 0.1s, 461119 effective words/s
2022-05-30 14:47:56,934 : INFO : EPOCH 3: training on 58152 raw words (35913 effective words) took 0.1s, 496536 effective words/s
2022-05-30 14:47:57,009 : INFO : EPOCH 4: training on 58152 raw words (35893 effective words) took 0.1s, 489118 effective words/s
2022-05-30 14:47:57,010 : INFO : Word2Vec lifecycle event {'msg': 'training on 290760 raw words (179531 effective words) took 0.4s, 465875 effective words/s', 'datetime': '2022-05-30T14:47:57.009845', 'gensim': '4.2.0', 'python': '3.10.2 (tags/v3.10.2:a58ebcc, Jan 17 2022, 14:12:15) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'train'}
2022-05-30 14:47:57,010 : INFO : Word2Vec lifecycle event {'params': 'Word2Vec<vocab=1750, vector_size=100, alpha=0.025>', 'datetime': '2022-05-30T14:47:57.010844', 'gensim': '4.2.0', 'python': '3.10.2 (tags/v3.10.2:a58ebcc, Jan 17 2022, 14:12:15) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'created'}
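
The log above shows the defaults in effect: vector_size=100, window=5, min_count=5 (which keeps 1750 of the 6981 word types), sg=0 (CBOW) with negative=5. The same training with those parameters spelled out, as a sketch:

model = gensim.models.Word2Vec(
    sentences=sentences,
    vector_size=100,  # dimensionality of the embeddings
    window=5,         # max distance between target and context word
    min_count=5,      # drop words with fewer total occurrences
    sg=0,             # 0 = CBOW, 1 = skip-gram
    negative=5,       # number of negative-sampling noise words
    workers=3,        # training threads
)
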
In [29]:
for index, word in enumerate(model.wv.index_to_key):
    if index == 10:
        break
    print('%r\t%r' % (index, word))
0	'the'
1	'to'
2	'of'
3	'in'
4	'and'
5	'he'
6	'is'
7	'for'
8	'on'
9	'said'
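
The trained model can be queried just like the pretrained vectors, and persisted for later use (the file names below are hypothetical):

print(model.wv.most_similar('said', topn=5))

model.save('word2vec.model')            # full model, training can be resumed
model.wv.save('word2vec.wordvectors')   # just the KeyedVectors, smaller on disk
reloaded = gensim.models.Word2Vec.load('word2vec.model')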

Visualizing Word Embeddings

In [38]:
from sklearn.decomposition import IncrementalPCA # initial reduction
from sklearn.manifold import TSNE # final reduction
import numpy as np

def reduce_dimensions(model):
    num_dimensions = 2 # final number of dimensions (2D, 3D, etc.)

    # extract the words & their vectors, as numpy arrays
    vectors = np.asarray(model.wv.vectors)
    labels = np.asarray(model.wv.index_to_key)

    # reducing using t-SNE
    tsne = TSNE(n_components=num_dimensions, random_state=0)
    vectors = tsne.fit_transform(vectors)

    x_vals = [v[0] for v in vectors]
    y_vals = [v[1] for v in vectors]
    return x_vals, y_vals, labels


x_vals, y_vals, labels = reduce_dimensions(model)

def plot_with_plotly(x_vals, y_vals, labels, plot_in_notebook=True):
    from plotly.offline import init_notebook_mode, iplot, plot
    import plotly.graph_objs as go

    trace = go.Scatter(x=x_vals, y=y_vals, mode='text', text=labels)
    data = [trace]

    if plot_in_notebook:
        init_notebook_mode(connected=True)
        iplot(data, filename='word-embedding-plot')
    else:
        plot(data, filename='word-embedding-plot.html')

def plot_with_matplotlib(x_vals, y_vals, labels):
    import matplotlib.pyplot as plt
    import random
    
    # random.seed(0)

    plt.figure(figsize=(12,12))
    plt.scatter(x_vals, y_vals)

    # label a randomly chosen subsample of 25 data points

    indices = list(range(len(labels)))
    selected_indices = random.sample(indices, 25)
    for i in selected_indices:
        plt.annotate(labels[i], (x_vals[i], y_vals[i]))

try:
    get_ipython()
except Exception:
    plot_function = plot_with_matplotlib
else:
    plot_function = plot_with_plotly

# plot_function(x_vals, y_vals, labels)


plot_with_matplotlib(x_vals, y_vals, labels)
c:\Users\Francesco\Documents\XPUB\w2v\venv\lib\site-packages\sklearn\manifold\_t_sne.py:795: FutureWarning:

The default initialization in TSNE will change from 'random' to 'pca' in 1.2.

c:\Users\Francesco\Documents\XPUB\w2v\venv\lib\site-packages\sklearn\manifold\_t_sne.py:805: FutureWarning:

The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.
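
Note that IncrementalPCA is imported above but never used: the intended pattern is presumably a cheap linear reduction before the much slower t-SNE step. A sketch, which also passes init='pca' and learning_rate='auto' explicitly to silence the two FutureWarnings:

def reduce_dimensions_pca_first(model, pca_dims=50):
    vectors = np.asarray(model.wv.vectors)
    labels = np.asarray(model.wv.index_to_key)

    # first-stage linear reduction, so t-SNE works on a smaller matrix
    vectors = IncrementalPCA(n_components=pca_dims).fit_transform(vectors)

    tsne = TSNE(n_components=2, random_state=0, init='pca', learning_rate='auto')
    vectors = tsne.fit_transform(vectors)
    return vectors[:, 0], vectors[:, 1], labels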

In [41]:
plot_with_matplotlib(x_vals, y_vals, labels)
In [42]:
plot_with_plotly(x_vals, y_vals, labels)
In [ ]: