from __future__ import print_function
from __future__ import unicode_literals
from builtins import str, bytes, dict, int
from builtins import range
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
from pattern.web import Twitter
from pattern.en import Sentence, parse
from pattern.search import search
from pattern.vector import Document, Model, KNN
# Classification is a supervised machine learning method,
# where labeled documents are used as training material
# to learn how to label unlabeled documents.
# This example trains a simple classifier with Twitter messages.
# The idea is that, if you have a number of texts with a "type"
# (mail/spam, positive/negative, language, author's age, ...),
# you can predict the type of other "unknown" texts.
# The k-Nearest Neighbor algorithm classifies texts according
# to the k documents that are most similar (cosine similarity) to the given input document.
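# As a quick illustration of cosine similarity: documents that share more
# words score higher. This is a minimal sketch, assuming Model.similarity(d1, d2)
# returns the cosine between two document vectors, as described in the
# Pattern documentation:
d1 = Document("sweet tasty burger")
d2 = Document("tasty veggie burger")
d3 = Document("stupid broken autocorrect")
demo = Model(documents=[d1, d2, d3])
print(demo.similarity(d1, d2)) # relatively high: "tasty" and "burger" are shared
print(demo.similarity(d1, d3)) # 0.0: no words in common
print()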
m = Model()
t = Twitter()
# First, we mine a model of about a thousand tweets (9 pages of 100 results).
# We'll use the hashtag as the document type (label).
for page in range(1, 10):
    for tweet in t.search('#win OR #fail', start=page, count=100, cached=True):
        # If the tweet contains the #win hashtag, we'll label it 'WIN':
        s = tweet.text.lower()               # tweet in lowercase
        p = 'WIN' if '#win' in s else 'FAIL' # document label
        s = Sentence(parse(s))               # parse tree with part-of-speech tags
        s = search('JJ', s)                  # adjectives in the tweet
        s = [match[0].string for match in s] # adjectives as a list of strings
        s = " ".join(s)                      # adjectives as a single string
        if len(s) > 0:
            m.append(Document(s, type=p, stemmer=None))
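# Sanity check on the mined data. This assumes Model.documents is the list
# of documents appended so far (tweets without adjectives were skipped above):
print(len(m.documents), "labeled documents in the model")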
# Train k-Nearest Neighbor on the model.
# Note that this is only a simple example: to build a robust classifier
# you would need a lot more training data (e.g., tens of thousands of tweets).
# The more training data, the more statistically reliable the classifier becomes.
# The only way to really know if your classifier is working correctly
# is to test it with held-out testing data; see the documentation for Classifier.test().
classifier = KNN(baseline=None) # By default, baseline=MAJORITY
for document in m:              # (classify unknown documents with the most frequent type).
    classifier.train(document)
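# As noted above, the honest way to evaluate the classifier is with held-out data.
# This is a hedged sketch, assuming Classifier.test() takes a list of labeled
# documents and a number of cross-validation folds, and returns an
# (accuracy, precision, recall, F1)-tuple, as described in the Pattern documentation:
accuracy, precision, recall, f1 = KNN.test(m.documents, folds=10)
print("10-fold cv:", accuracy, precision, recall, f1)
print()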
# These are the adjectives the classifier has learned:
print(sorted(classifier.features))
print()
# We can now ask it to classify documents containing these words.
# Note that you may get different results than the ones below,
# since you will be mining other (more recent) tweets.
# Again, a robust classifier needs lots and lots of training data.
# If None is returned, none of the given words were recognized,
# and the classifier returned the default value (the baseline, see above).
print(classifier.classify('sweet potato burger')) # yields 'WIN'
print(classifier.classify('stupid autocorrect')) # yields 'FAIL'
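print()
# A hedged peek under the hood, assuming Classifier.classes lists the learned
# class labels and Classifier.distribution maps each class to the number of
# training documents seen with that label, as in the Pattern documentation:
print(classifier.classes)       # e.g., ['WIN', 'FAIL']
print(classifier.distribution)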
# "What can I do with it?"
# In the scientific community, classifiers have been used to predict:
# - the opinion (positive/negative) in product reviews on blogs,
# - the age of users posting on social networks,
# - the author of medieval poems,
# - spam in e-mail messages,
# - lies & deception in text,
# - doubt & uncertainty in text,
# and to:
# - improve search engine query results (e.g., where "jeans" queries also yield "denim" results),
# - win at Jeopardy!,
# - win at rock-paper-scissors,
# and so on...