bo-graduation/nltk-book/pattern-master/examples/05-vector/02-model.py

from __future__ import print_function
from __future__ import unicode_literals

from builtins import str, bytes, dict, int

import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
import glob

from io import open

from pattern.vector import Document, Model, TF, TFIDF

# A documents is a bag-of-word representations of a text.
# Each word or feature in the document vector has a weight,
# based on how many times the word occurs in the text.
# This weight is called term frequency (TF).

# Another interesting measure is TF-IDF:
# term frequency-inverse document frequency.
# Suppose that "the" is the most frequent word in the text.
# But it also occurs frequently in many other texts,
# so it is not very specific or "unique" in any one document.
# TF-IDF divided term frequency ("how many times in this text?")
# by the document frequency ("how many times in all texts?")
# to represent this.

# A Model is a collection of documents vectors.
# A Model is a matrix (or vector space)
# with features as columns and feature weights as rows.
# We can then do calculations on the matrix,
# for example to compute TF-IDF or similarity between documents.

# Load a model from a folder of text documents:
documents = []
for f in glob.glob(os.path.join(os.path.dirname(__file__), "corpus", "*.txt")):
    text = open(f, encoding="utf-8").read()
    name = os.path.basename(f)[:-4]
    documents.append(Document(text, name=name))

m = Model(documents, weight=TFIDF)

# We can retrieve documents by name:
d = m.document(name="lion")

print(d.keywords(top=10))
print()
print(d.tf("food"))
print(d.tfidf("food")) # TF-IDF is less: "food" is also mentioned with the other animals.
print()

# We can compare how similar two documents are.
# This is done by calculating the distance between the document vectors
# (i.e., finding those that are near to each other).

# For example, say we have two vectors with features "x" and "y".
# We can calculate the distance between two points (x, y) in 2-D space:
# d = sqrt(pow(x2 - x1, 2) + pow(y2 - y1, 2))
# This is the Euclidean distance in 2-D space.
# Similarily, we can calculate the distance in n-D space,
# in other words, for vectors with lots of features.

# For text, a better metric than Euclidean distance
# is called cosine similarity. This is what a Model uses:
d1 = m.document(name="lion")
d2 = m.document(name="tiger")
d3 = m.document(name="dolphin")
d4 = m.document(name="shark")
d5 = m.document(name="parakeet")
print("lion-tiger:", m.similarity(d1, d2))
print("lion-dolphin:", m.similarity(d1, d3))
print("dolphin-shark:", m.similarity(d3, d4))
print("dolphin-parakeet:", m.similarity(d3, d5))
print()

print("Related to tiger:")
print(m.neighbors(d2, top=3)) # Top three most similar.
print()

print("Related to a search query ('water'):")
print(m.search("water", top=10))

# In summary:

# A Document:
# - takes a string of text,
# - counts the words in the text,
# - constructs a vector of words (features) and normalized word count (weight).

# A Model:
# - groups multiple vectors in a matrix,
# - tweaks the weight with TF-IDF to find "unique" words in each document,
# - computes cosine similarity (= distance between vectors),
# - compares documents using cosine similatity.
thrid updates 5 years ago			`from __future__ import print_function`
			`from __future__ import unicode_literals`

			`from builtins import str, bytes, dict, int`

			`import os`
			`import sys`
			`sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))`
			`import glob`

			`from io import open`

			`from pattern.vector import Document, Model, TF, TFIDF`

			`# A documents is a bag-of-word representations of a text.`
			`# Each word or feature in the document vector has a weight,`
			`# based on how many times the word occurs in the text.`
			`# This weight is called term frequency (TF).`

			`# Another interesting measure is TF-IDF:`
			`# term frequency-inverse document frequency.`
			`# Suppose that "the" is the most frequent word in the text.`
			`# But it also occurs frequently in many other texts,`
			`# so it is not very specific or "unique" in any one document.`
			`# TF-IDF divided term frequency ("how many times in this text?")`
			`# by the document frequency ("how many times in all texts?")`
			`# to represent this.`

			`# A Model is a collection of documents vectors.`
			`# A Model is a matrix (or vector space)`
			`# with features as columns and feature weights as rows.`
			`# We can then do calculations on the matrix,`
			`# for example to compute TF-IDF or similarity between documents.`

			`# Load a model from a folder of text documents:`
			`documents = []`
			`for f in glob.glob(os.path.join(os.path.dirname(__file__), "corpus", "*.txt")):`
			`text = open(f, encoding="utf-8").read()`
			`name = os.path.basename(f)[:-4]`
			`documents.append(Document(text, name=name))`

			`m = Model(documents, weight=TFIDF)`

			`# We can retrieve documents by name:`
			`d = m.document(name="lion")`

			`print(d.keywords(top=10))`
			`print()`
			`print(d.tf("food"))`
			`print(d.tfidf("food")) # TF-IDF is less: "food" is also mentioned with the other animals.`
			`print()`

			`# We can compare how similar two documents are.`
			`# This is done by calculating the distance between the document vectors`
			`# (i.e., finding those that are near to each other).`

			`# For example, say we have two vectors with features "x" and "y".`
			`# We can calculate the distance between two points (x, y) in 2-D space:`
			`# d = sqrt(pow(x2 - x1, 2) + pow(y2 - y1, 2))`
			`# This is the Euclidean distance in 2-D space.`
			`# Similarily, we can calculate the distance in n-D space,`
			`# in other words, for vectors with lots of features.`

			`# For text, a better metric than Euclidean distance`
			`# is called cosine similarity. This is what a Model uses:`
			`d1 = m.document(name="lion")`
			`d2 = m.document(name="tiger")`
			`d3 = m.document(name="dolphin")`
			`d4 = m.document(name="shark")`
			`d5 = m.document(name="parakeet")`
			`print("lion-tiger:", m.similarity(d1, d2))`
			`print("lion-dolphin:", m.similarity(d1, d3))`
			`print("dolphin-shark:", m.similarity(d3, d4))`
			`print("dolphin-parakeet:", m.similarity(d3, d5))`
			`print()`

			`print("Related to tiger:")`
			`print(m.neighbors(d2, top=3)) # Top three most similar.`
			`print()`

			`print("Related to a search query ('water'):")`
			`print(m.search("water", top=10))`

			`# In summary:`

			`# A Document:`
			`# - takes a string of text,`
			`# - counts the words in the text,`
			`# - constructs a vector of words (features) and normalized word count (weight).`

			`# A Model:`
			`# - groups multiple vectors in a matrix,`
			`# - tweaks the weight with TF-IDF to find "unique" words in each document,`
			`# - computes cosine similarity (= distance between vectors),`
			`# - compares documents using cosine similatity.`