from __future__ import print_function
from __future__ import unicode_literals
from __future__ import division

from builtins import str, bytes, dict, int

import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
import time

from pattern.vector import Document, Model, KNN
from pattern.db import Datasheet

# Long documents contain lots of words.
# Models with lots of long documents can become slow,
# because calculating cosine similarity then takes a long time.

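# For reference, this is roughly what cosine similarity computes; the cost grows
# with the number of distinct words per document. (A minimal sketch of the idea,
# not pattern's internal implementation.)
from math import sqrt

def cosine(v1, v2):
    """Cosine similarity between two sparse word -> weight dictionaries."""
    dot = sum(w * v2[f] for f, w in v1.items() if f in v2)
    n1 = sqrt(sum(w ** 2 for w in v1.values()))
    n2 = sqrt(sum(w ** 2 for w in v2.values()))
    return dot / ((n1 * n2) or 1.0)

print(cosine({"good": 1, "movie": 2}, {"bad": 1, "movie": 2}))  # ~0.8
print()
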
# Latent Semantic Analysis (LSA) is a statistical machine learning method,
# based on a matrix calculation called "singular value decomposition" (SVD).
# It discovers semantically related words across documents.
# It groups related words into "concepts".
# It then creates a concept vector for each document.
# This reduces the amount of data to work with (for example when clustering),
# and filters out noise, so that semantically related words come out stronger.

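# The idea in a nutshell (a minimal sketch using numpy; pattern's LSA builds
# on the same decomposition): SVD factors a document x term matrix so that
# keeping only the top k singular values yields one k-dimensional concept
# vector per document.
import numpy as np

A = np.array([[1, 1, 0], [1, 1, 1], [0, 0, 1]], dtype=float)  # 3 docs x 3 terms
U, s, Vt = np.linalg.svd(A, full_matrices=False)
k = 2
print(U[:, :k] * s[:k])  # 3 rows = 3 concept vectors of k dimensions each
print()
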
# We'll use the Pang & Lee corpus of movie reviews, included in the testing suite.
# Take 250 positive reviews and 250 negative reviews:
data = os.path.join(os.path.dirname(__file__), "..", "..", "test", "corpora", "polarity-en-pang&lee1.csv")
data = Datasheet.load(data)
data = data[:250] + data[-250:]

# Build a model of movie reviews.
# Each document consists of the top 40 words in the movie review.
documents = []
for score, review in data:
    document = Document(review, stopwords=False, top=40, type=int(score) > 0)
    documents.append(document)

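# A quick peek at the first document: its label and most salient words
# (Document.type and Document.keywords() are part of the pattern.vector API):
print(documents[0].type)            # True = positive review
print(documents[0].keywords(top=5)) # [(weight, word), ...]
print()
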
m = Model(documents)

print("number of documents:", len(m))
print("number of features:", len(m.vector))
print("number of features (average):", sum(len(d.features) for d in m.documents) / float(len(m)))
print()

# 6,337 different features may be too slow for some algorithms (e.g., hierarchical clustering).
# We'll reduce the document vectors to 10 concepts.

# Let's test how our model performs as a classifier.
# A document can have a label (or type, or class).
# For example, in the movie reviews corpus,
# there are positive reviews (score > 0) and negative reviews (score < 0).
# A classifier uses a model as "training" data
# to predict the label (type/class) of unlabeled documents.
# In this case, it can predict whether a new movie review is positive or negative.

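# For example (a sketch, assuming the documented KNN(train=...) constructor
# and Classifier.classify(); the review text below is made up):
knn = KNN(train=m.documents)
print(knn.classify(Document("a delightful, funny and heartwarming film")))
print()
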
# The details are not that important right now; just observe the accuracy.
# Naturally, we want accuracy to stay the same after LSA reduction,
# and hopefully the running time to decrease.

t = time.time()
print("accuracy:", KNN.test(m, folds=10)[-1])
print("time:", time.time() - t)
print()

# Reduce the documents to vectors of 10 concepts (= 1/4 of 40 features).
print("LSA reduction...")
print()
m.reduce(10)

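# Model.reduce(10) replaces each document's feature vector with a
# 10-dimensional concept vector; the mapping is available as m.lsa,
# which we inspect below.
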
t = time.time()
print("accuracy:", KNN.test(m, folds=10)[-1])
print("time:", time.time() - t)
print()

# Accuracy is about the same, but the performance is better: 2x-3x faster,
# because each document is now a "10-word summary" of the original review.

# Let's take a closer look at the concepts.
# The concept vector for the first document:
print(m.lsa.vectors[m[0].id])
print()

# It is a dictionary of concept ids (instead of features).
# This is not very helpful by itself.
# But we can look up the features "bundled" in each concept:
print(len(m.lsa.concepts[0]))

# That's a lot of words.
# In fact, every feature in the model gets a score in each of the ten concepts.

# To make it clearer, let's generate 100 concepts (i.e., semantic categories),
# and then examine the features with the highest score for a concept:

m.lsa = None
m.reduce(100)

for feature, weight in m.lsa.concepts[15].items():  # concept id=15
    if abs(weight) > 0.1:
        print(feature)

# Concept 2 = "truman", "ventura", "ace", "carrey", ... Obviously about Jim Carrey movies.
# Concept 15 = "sixth", "sense", "child", "dead", "willis", ... Clearly about "The Sixth Sense".

# Not all concepts are equally easy to interpret,
# but the technique can be useful to discover synonym sets.
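
# A quick way to scan for candidate synonym sets (a sketch that only reuses
# m.lsa.concepts from above): sort each concept's features by absolute weight
# and print the strongest ones.
for i in range(5):
    concept = m.lsa.concepts[i]
    top = sorted(concept.items(), key=lambda kv: abs(kv[1]), reverse=True)
    print(i, [feature for feature, weight in top[:5]])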