from __future__ import print_function
from __future__ import unicode_literals

from builtins import str, bytes, dict, int
from builtins import range

import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pattern.web import Twitter
from pattern.en import Sentence, parse
from pattern.search import search
from pattern.vector import Document, Model, KNN

# Classification is a supervised machine learning method,
# where labeled documents are used as training material
# to learn how to label unlabeled documents.

# This example trains a simple classifier with Twitter messages.
# The idea is that, if you have a number of texts with a "type"
# (mail/spam, positive/negative, language, author's age, ...),
# you can predict the type of other "unknown" texts.
# The k-Nearest Neighbor algorithm classifies texts according
# to the k documents that are most similar (by cosine similarity)
# to the given input document.
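
# For intuition: cosine similarity measures the angle between two word-count
# vectors, so identical texts score 1.0 and texts sharing no words score 0.0.
# Below is a minimal, self-contained sketch of the measure. It is an
# illustration only; Pattern computes similarity internally on its own
# (tf-idf weighted) document vectors, not via this function.


def cosine_similarity(v1, v2):
    """Returns the cosine of the angle between two sparse {word: count} vectors."""
    dot = sum(v1[w] * v2[w] for w in v1 if w in v2)
    norm1 = sum(n ** 2 for n in v1.values()) ** 0.5
    norm2 = sum(n ** 2 for n in v2.values()) ** 0.5
    return dot / (norm1 * norm2) if norm1 and norm2 else 0.0

#print(cosine_similarity({'awesome': 1, 'sweet': 2}, {'sweet': 1})) # ~0.89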

m = Model()
t = Twitter()

# First, we mine a corpus of roughly 1,000 tweets (9 pages of 100 results).
# We'll use the hashtag (#win or #fail) as the document type, i.e., the label.
for page in range(1, 10):
    for tweet in t.search('#win OR #fail', start=page, count=100, cached=True):
        # If the tweet contains the #win hashtag, we'll set its type to 'WIN':
        s = tweet.text.lower()               # tweet in lowercase
        p = 'WIN' if '#win' in s else 'FAIL' # document label
        s = Sentence(parse(s))               # parse tree with part-of-speech tags
        s = search('JJ', s)                  # adjectives in the tweet
        s = [match[0].string for match in s] # adjectives as a list of strings
        s = " ".join(s)                      # adjectives as a single string
        if len(s) > 0:
            m.append(Document(s, type=p, stemmer=None))

# Train k-Nearest Neighbor on the model.
# Note that this is only a simple example: to build a robust classifier
# you would need a lot more training data (e.g., tens of thousands of tweets).
# The more training data, the more statistically reliable the classifier becomes.
# The only way to really know if your classifier is working correctly is to
# test it on separate, held-out data; see the documentation for Classifier.test()
# and the hand-rolled sketch below.
classifier = KNN(baseline=None) # By default, baseline=MAJORITY
for document in m:              # (classify unknown documents with the most frequent type).
    classifier.train(document)
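
# As noted above, the honest measure of a classifier is its accuracy on
# held-out data it has never seen. Pattern provides Classifier.test() for
# this; the sketch below instead does a simple (unshuffled, hypothetical)
# 75/25 split by hand, using only the train() / classify() calls already
# shown in this example:
documents = list(m)
split = len(documents) * 3 // 4
knn = KNN(baseline=None)
for document in documents[:split]:
    knn.train(document)
heldout = documents[split:]
if heldout:
    correct = sum(1 for document in heldout if knn.classify(document) == document.type)
    print("holdout accuracy: %.2f" % (float(correct) / len(heldout)))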

# These are the adjectives the classifier has learned:
print(sorted(classifier.features))
print()

# We can now ask it to classify documents containing these words.
# Note that you may get different results than the ones below,
# since you will be mining other (more recent) tweets.
# Again, a robust classifier needs lots and lots of training data.
# If None is returned, none of the words were recognized,
# and the classifier returned the default baseline value (see above).
print(classifier.classify('sweet potato burger')) # yields 'WIN'
print(classifier.classify('stupid autocorrect')) # yields 'FAIL'
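
# For instance, an input that contains none of the learned adjectives falls
# back to the baseline we set above ('xyzzy' is just a made-up example word):
print(classifier.classify('xyzzy')) # yields None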

# "What can I do with it?"
# In the scientific community, classifiers have been used to predict:
# - the opinion (positive/negative) in product reviews on blogs,
# - the age of users posting on social networks,
# - the author of medieval poems,
# - spam in e-mail messages,
# - lies & deception in text,
# - doubt & uncertainty in text,
# and to:
# - improve search engine query results (e.g., where "jeans" queries also yield "denim" results),
# - win at Jeopardy!,
# - win at rock-paper-scissors,
# and so on...