Word2Vec Tutorial
In [1]:
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
In [2]:
import gensim.downloader as api

wv = api.load('word2vec-google-news-300')
2022-05-30 14:30:49,470 : INFO : loading projection weights from C:\Users\Francesco/gensim-data\word2vec-google-news-300\word2vec-google-news-300.gz
2022-05-30 14:31:31,470 : INFO : KeyedVectors lifecycle event {'msg': 'loaded (3000000, 300) matrix of type float32 from C:\\Users\\Francesco/gensim-data\\word2vec-google-news-300\\word2vec-google-news-300.gz', 'binary': True, 'encoding': 'utf8', 'datetime': '2022-05-30T14:31:31.470006', 'gensim': '4.2.0', 'python': '3.10.2 (tags/v3.10.2:a58ebcc, Jan 17 2022, 14:12:15) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'load_word2vec_format'}
In [4]:
for index, word in enumerate(wv.index_to_key):
    if index == 100:
        break
    print(f'word {index} / {len(wv.index_to_key)} is {word}')
word 0 / 3000000 is </s>
word 1 / 3000000 is in
word 2 / 3000000 is for
word 3 / 3000000 is that
word 4 / 3000000 is is
word 5 / 3000000 is on
word 6 / 3000000 is ##
word 7 / 3000000 is The
word 8 / 3000000 is with
word 9 / 3000000 is said
word 10 / 3000000 is was
word 11 / 3000000 is the
word 12 / 3000000 is at
word 13 / 3000000 is not
word 14 / 3000000 is as
word 15 / 3000000 is it
word 16 / 3000000 is be
word 17 / 3000000 is from
word 18 / 3000000 is by
word 19 / 3000000 is are
word 20 / 3000000 is I
word 21 / 3000000 is have
word 22 / 3000000 is he
word 23 / 3000000 is will
word 24 / 3000000 is has
word 25 / 3000000 is ####
word 26 / 3000000 is his
word 27 / 3000000 is an
word 28 / 3000000 is this
word 29 / 3000000 is or
word 30 / 3000000 is their
word 31 / 3000000 is who
word 32 / 3000000 is they
word 33 / 3000000 is but
word 34 / 3000000 is $
word 35 / 3000000 is had
word 36 / 3000000 is year
word 37 / 3000000 is were
word 38 / 3000000 is we
word 39 / 3000000 is more
word 40 / 3000000 is ###
word 41 / 3000000 is up
word 42 / 3000000 is been
word 43 / 3000000 is you
word 44 / 3000000 is its
word 45 / 3000000 is one
word 46 / 3000000 is about
word 47 / 3000000 is would
word 48 / 3000000 is which
word 49 / 3000000 is out
word 50 / 3000000 is can
word 51 / 3000000 is It
word 52 / 3000000 is all
word 53 / 3000000 is also
word 54 / 3000000 is two
word 55 / 3000000 is after
word 56 / 3000000 is first
word 57 / 3000000 is He
word 58 / 3000000 is do
word 59 / 3000000 is time
word 60 / 3000000 is than
word 61 / 3000000 is when
word 62 / 3000000 is We
word 63 / 3000000 is over
word 64 / 3000000 is last
word 65 / 3000000 is new
word 66 / 3000000 is other
word 67 / 3000000 is her
word 68 / 3000000 is people
word 69 / 3000000 is into
word 70 / 3000000 is In
word 71 / 3000000 is our
word 72 / 3000000 is there
word 73 / 3000000 is A
word 74 / 3000000 is she
word 75 / 3000000 is could
word 76 / 3000000 is just
word 77 / 3000000 is years
word 78 / 3000000 is some
word 79 / 3000000 is U.S.
word 80 / 3000000 is three
word 81 / 3000000 is million
word 82 / 3000000 is them
word 83 / 3000000 is what
word 84 / 3000000 is But
word 85 / 3000000 is so
word 86 / 3000000 is no
word 87 / 3000000 is like
word 88 / 3000000 is if
word 89 / 3000000 is only
word 90 / 3000000 is percent
word 91 / 3000000 is get
word 92 / 3000000 is did
word 93 / 3000000 is him
word 94 / 3000000 is game
word 95 / 3000000 is back
word 96 / 3000000 is because
word 97 / 3000000 is now
word 98 / 3000000 is #.#
word 99 / 3000000 is before
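The vocabulary is sorted by descending frequency, so the entries above are the most common tokens in the Google News corpus. As a minimal sketch of the related lookup API (key_to_index, index_to_key and len() are standard KeyedVectors accessors in gensim 4):

In [ ]:
# vocabulary lookups on a gensim 4 KeyedVectors object
print(len(wv))                  # vocabulary size (3,000,000 here)
print(wv.key_to_index['king'])  # frequency rank of 'king'
print(wv.index_to_key[0])       # most frequent key, '</s>' in this model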
In [8]:
vec_king = wv['king']  # retrieve the 300-dimensional vector for 'king'
In [9]:
try:
    vec_cameroon = wv['cameroon']
except KeyError:
    print("The word 'cameroon' does not appear in this model")
The word 'cameroon' does not appear in this model
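Instead of catching the KeyError, you can test membership first. A small sketch using the standard `in` operator that KeyedVectors supports (whether the capitalized form 'Cameroon' is present is not guaranteed, hence the check):

In [ ]:
# membership test instead of try/except
for word in ['cameroon', 'Cameroon']:
    if word in wv:
        print(f"'{word}' is in the vocabulary")
    else:
        print(f"'{word}' is NOT in the vocabulary")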
In [11]:
pairs = [
    ('car', 'minivan'),
    ('car', 'bicycle'),
    ('car', 'airplane'),
    ('car', 'cereal'),
    ('car', 'communism'),
]
for w1, w2 in pairs:
    print('%r\t%r\t%.2f' % (w1, w2, wv.similarity(w1, w2)))
'car'	'minivan'	0.69
'car'	'bicycle'	0.54
'car'	'airplane'	0.42
'car'	'cereal'	0.14
'car'	'communism'	0.06
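wv.similarity is just the cosine similarity between the two word vectors. A minimal sketch that reproduces the 'car'/'minivan' score by hand with numpy:

In [ ]:
import numpy as np

# cosine similarity computed by hand; should match wv.similarity('car', 'minivan')
a, b = wv['car'], wv['minivan']
cosine = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
print(round(float(cosine), 2))  # expected: 0.69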
In [16]:
print(wv.most_similar(positive=['car', 'bike'], topn=5))
[('bicycle', 0.7798016667366028), ('scooter', 0.7793240547180176), ('motorcycle', 0.7453441023826599), ('bikes', 0.7298908233642578), ('vehicle', 0.6923801302909851)]
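most_similar also supports the classic analogy arithmetic by mixing positive and negative terms. A short sketch of the well-known king - man + woman query (with this model the top hit is usually 'queen'):

In [ ]:
# vector arithmetic: king - man + woman ~ queen
print(wv.most_similar(positive=['king', 'woman'], negative=['man'], topn=3))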
In [23]:
print(wv.doesnt_match(['fire', 'land', 'water', 'sea', 'air', 'jar']))
jar
Train your own model
In [24]:
from gensim.test.utils import datapath
from gensim import utils

class MyCorpus:
    """An iterator that yields sentences (lists of str)."""

    def __iter__(self):
        corpus_path = datapath('lee_background.cor')
        for line in open(corpus_path):
            # assume there's one document per line, tokens separated by whitespace
            yield utils.simple_preprocess(line)
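MyCorpus streams the Lee corpus one line at a time, so the whole file never has to fit in memory; utils.simple_preprocess lowercases and tokenizes each line. A quick sketch to peek at what the iterator yields:

In [ ]:
# peek at the first document yielded by the streaming corpus
first_sentence = next(iter(MyCorpus()))
print(first_sentence[:10])  # first ten tokens of the first document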
In [25]:
import gensim.models

sentences = MyCorpus()
model = gensim.models.Word2Vec(sentences=sentences)
2022-05-30 14:47:56,513 : INFO : collecting all words and their counts
2022-05-30 14:47:56,515 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2022-05-30 14:47:56,584 : INFO : collected 6981 word types from a corpus of 58152 raw words and 300 sentences
2022-05-30 14:47:56,585 : INFO : Creating a fresh vocabulary
2022-05-30 14:47:56,590 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 1750 unique words (25.07% of original 6981, drops 5231)', 'datetime': '2022-05-30T14:47:56.590672', 'gensim': '4.2.0', 'python': '3.10.2 (tags/v3.10.2:a58ebcc, Jan 17 2022, 14:12:15) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'prepare_vocab'}
2022-05-30 14:47:56,591 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 leaves 49335 word corpus (84.84% of original 58152, drops 8817)', 'datetime': '2022-05-30T14:47:56.591685', 'gensim': '4.2.0', 'python': '3.10.2 (tags/v3.10.2:a58ebcc, Jan 17 2022, 14:12:15) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'prepare_vocab'}
2022-05-30 14:47:56,599 : INFO : deleting the raw counts dictionary of 6981 items
2022-05-30 14:47:56,602 : INFO : sample=0.001 downsamples 51 most-common words
2022-05-30 14:47:56,602 : INFO : Word2Vec lifecycle event {'msg': 'downsampling leaves estimated 35935.33721568072 word corpus (72.8%% of prior 49335)', 'datetime': '2022-05-30T14:47:56.602641', 'gensim': '4.2.0', 'python': '3.10.2 (tags/v3.10.2:a58ebcc, Jan 17 2022, 14:12:15) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'prepare_vocab'}
2022-05-30 14:47:56,619 : INFO : estimated required memory for 1750 words and 100 dimensions: 2275000 bytes
2022-05-30 14:47:56,621 : INFO : resetting layer weights
2022-05-30 14:47:56,623 : INFO : Word2Vec lifecycle event {'update': False, 'trim_rule': 'None', 'datetime': '2022-05-30T14:47:56.623586', 'gensim': '4.2.0', 'python': '3.10.2 (tags/v3.10.2:a58ebcc, Jan 17 2022, 14:12:15) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'build_vocab'}
2022-05-30 14:47:56,624 : INFO : Word2Vec lifecycle event {'msg': 'training model with 3 workers on 1750 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2022-05-30T14:47:56.624583', 'gensim': '4.2.0', 'python': '3.10.2 (tags/v3.10.2:a58ebcc, Jan 17 2022, 14:12:15) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'train'}
2022-05-30 14:47:56,706 : INFO : EPOCH 0: training on 58152 raw words (35868 effective words) took 0.1s, 453792 effective words/s
2022-05-30 14:47:56,780 : INFO : EPOCH 1: training on 58152 raw words (35915 effective words) took 0.1s, 495146 effective words/s
2022-05-30 14:47:56,860 : INFO : EPOCH 2: training on 58152 raw words (35942 effective words) took 0.1s, 461119 effective words/s
2022-05-30 14:47:56,934 : INFO : EPOCH 3: training on 58152 raw words (35913 effective words) took 0.1s, 496536 effective words/s
2022-05-30 14:47:57,009 : INFO : EPOCH 4: training on 58152 raw words (35893 effective words) took 0.1s, 489118 effective words/s
2022-05-30 14:47:57,010 : INFO : Word2Vec lifecycle event {'msg': 'training on 290760 raw words (179531 effective words) took 0.4s, 465875 effective words/s', 'datetime': '2022-05-30T14:47:57.009845', 'gensim': '4.2.0', 'python': '3.10.2 (tags/v3.10.2:a58ebcc, Jan 17 2022, 14:12:15) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'train'}
2022-05-30 14:47:57,010 : INFO : Word2Vec lifecycle event {'params': 'Word2Vec<vocab=1750, vector_size=100, alpha=0.025>', 'datetime': '2022-05-30T14:47:57.010844', 'gensim': '4.2.0', 'python': '3.10.2 (tags/v3.10.2:a58ebcc, Jan 17 2022, 14:12:15) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'created'}
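Once trained, the model can be persisted to disk and reloaded later; model.save and Word2Vec.load are the standard gensim calls (the file name below is just an example):

In [ ]:
# persist the trained model and load it back; 'word2vec.model' is an arbitrary name
model.save('word2vec.model')
model = gensim.models.Word2Vec.load('word2vec.model')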
In [29]:
for index, word in enumerate(model.wv.index_to_key):
    if index == 10:
        break
    print('%r\t%r' % (index, word))
0	'the'
1	'to'
2	'of'
3	'in'
4	'and'
5	'he'
6	'is'
7	'for'
8	'on'
9	'said'
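The freshly trained model answers the same queries as the pretrained one, though with only ~1,750 words from 300 news documents the neighbours are far noisier. A hedged sketch (the query word is a hypothetical choice; whether it survived min_count=5 depends on the corpus, hence the guard):

In [ ]:
# query the small model; guard against words pruned by min_count=5
query = 'government'  # hypothetical query word, assumed frequent in news text
if query in model.wv:
    print(model.wv.most_similar(query, topn=5))
else:
    print(f"'{query}' was pruned from the vocabulary")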
Visualizing Word Embeddings
In [38]:
from sklearn.decomposition import IncrementalPCA  # initial reduction
from sklearn.manifold import TSNE  # final reduction
import numpy as np

def reduce_dimensions(model):
    num_dimensions = 2  # final number of dimensions (2D, 3D, etc.)

    # extract the words & their vectors, as numpy arrays
    vectors = np.asarray(model.wv.vectors)
    labels = np.asarray(model.wv.index_to_key)

    # reduce using t-SNE
    tsne = TSNE(n_components=num_dimensions, random_state=0)
    vectors = tsne.fit_transform(vectors)

    x_vals = [v[0] for v in vectors]
    y_vals = [v[1] for v in vectors]
    return x_vals, y_vals, labels

x_vals, y_vals, labels = reduce_dimensions(model)

def plot_with_plotly(x_vals, y_vals, labels, plot_in_notebook=True):
    from plotly.offline import init_notebook_mode, iplot, plot
    import plotly.graph_objs as go

    trace = go.Scatter(x=x_vals, y=y_vals, mode='text', text=labels)
    data = [trace]

    if plot_in_notebook:
        init_notebook_mode(connected=True)
        iplot(data, filename='word-embedding-plot')
    else:
        plot(data, filename='word-embedding-plot.html')

def plot_with_matplotlib(x_vals, y_vals, labels):
    import matplotlib.pyplot as plt
    import random
    # random.seed(0)

    plt.figure(figsize=(12, 12))
    plt.scatter(x_vals, y_vals)

    # label 25 randomly subsampled data points
    indices = list(range(len(labels)))
    selected_indices = random.sample(indices, 25)
    for i in selected_indices:
        plt.annotate(labels[i], (x_vals[i], y_vals[i]))

try:
    get_ipython()
except Exception:
    plot_function = plot_with_matplotlib
else:
    plot_function = plot_with_plotly

# plot_function(x_vals, y_vals, labels)
plot_with_matplotlib(x_vals, y_vals, labels)
c:\Users\Francesco\Documents\XPUB\w2v\venv\lib\site-packages\sklearn\manifold\_t_sne.py:795: FutureWarning: The default initialization in TSNE will change from 'random' to 'pca' in 1.2.
c:\Users\Francesco\Documents\XPUB\w2v\venv\lib\site-packages\sklearn\manifold\_t_sne.py:805: FutureWarning: The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.
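The two FutureWarnings can be silenced by opting into the upcoming defaults explicitly; init and learning_rate are standard TSNE parameters in this scikit-learn version. A minimal sketch:

In [ ]:
# pass the future defaults explicitly to silence the FutureWarnings
from sklearn.manifold import TSNE

tsne = TSNE(n_components=2, init='pca', learning_rate='auto', random_state=0)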
In [41]:
plot_with_matplotlib(x_vals, y_vals, labels)
In [42]:
plot_with_plotly(x_vals, y_vals, labels)