You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
97 lines
2.9 KiB
Python
97 lines
2.9 KiB
Python
from __future__ import print_function
|
|
from __future__ import unicode_literals
|
|
|
|
from builtins import str, bytes, dict, int
|
|
|
|
import os
|
|
import sys
|
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
|
|
|
|
from pattern.web import Wiktionary, DOM
|
|
from pattern.db import csv, pd
|
|
|
|
# This example retrieves male and female given names from Wiktionary (http://en.wiktionary.org).
|
|
# It then trains a classifier that can predict the gender of unknown names (about 78% correct).
|
|
# The classifier is small (80KB) and fast.
|
|
|
|
w = Wiktionary(language="en")
f = csv()  # csv() is a short alias for Datasheet().

# Collect male and female given names from Wiktionary.
# Store the data as (name, gender)-rows in a CSV-file.
# The pd() function returns the parent directory of the current script,
# so pd("given-names.csv") = pattern/examples/01-web/given-names.csv.

for gender in ("male", "female"):
    for ch in ("abcdefghijklmnopqrstuvwxyz"):
        p = w.search("Appendix:%s_given_names/%s" % (gender.capitalize(), ch.capitalize()), cached=True)
        # search() can return None when no article exists for the query
        # (e.g., a letter without an appendix page). Skip such a letter
        # instead of failing on p.links and losing the remaining letters.
        if p is not None:
            for name in p.links:
                # Links that start with "Appendix:" are not kept as names.
                if not name.startswith("Appendix:"):
                    # Each row is (name, first letter of the gender): "m" or "f".
                    f.append((name, gender[0]))
        # Save after every letter, so collected rows are on disk as we go.
        f.save(pd("given-names.csv"))
        print(ch, gender)
|
|
|
|
# Create a classifier that predicts gender based on name.
|
|
|
|
from pattern.vector import SVM, chngrams, count, kfoldcv
|
|
|
|
|
|
class GenderByName(SVM):
    """ An SVM that is trained on, and classifies, names instead of vectors.
        Each name is first converted to a feature dictionary with vector().
    """

    def train(self, name, gender=None):
        """ Trains the SVM with the features of the given name and its gender label.
        """
        features = self.vector(name)
        SVM.train(self, features, gender)

    def classify(self, name):
        """ Returns what SVM.classify() predicts for the features of the given name.
        """
        features = self.vector(name)
        return SVM.classify(self, features)

    def vector(self, name):
        """ Returns a dictionary with character bigrams and suffix.
            For example, "Felix" => {"Fe":1, "el":1, "li":1, "ix":1, "ix$":1, 5:1}
        """
        features = count(chngrams(name, n=2))
        # The last two characters, marked with "$", are an extra suffix feature.
        suffix = name[-2:] + "$"
        features[suffix] = 1
        # The length of the name is also used as a feature (integer key).
        features[len(name)] = 1
        return features
|
|
|
|
data = csv(pd("given-names.csv"))

# Test average (accuracy, precision, recall, F-score, standard deviation).

scores = kfoldcv(GenderByName, data, folds=3)
print(scores)  # (0.81, 0.79, 0.77, 0.78, 0.00)

# Train and save the classifier in the current folder.
# With final=True, discards the original training data (= smaller file).

g = GenderByName(train=data)
g.save(pd("gender-by-name.svm"), final=True)

# Next time, we can simply load the trained classifier.
# Keep in mind that the script that loads the classifier
# must include the code for the GenderByName class description,
# otherwise Python won't know how to load the data.

g = GenderByName.load(pd("gender-by-name.svm"))

# Print the predicted label for a handful of names.
samples = (
    "Felix",
    "Felicia",
    "Rover",
    "Kitty",
    "Legolas",
    "Arwen",
    "Jabba",
    "Leia",
    "Flash",
    "Barbarella",
)
for name in samples:
    prediction = g.classify(name)
    print(name, prediction)
|
|
|
|
# In the example above, Arwen and Jabba are misclassified.
|
|
# We can of course improve the classifier by hand:
|
|
|
|
#g.train("Arwen", gender="f")
|
|
#g.train("Jabba", gender="m")
|
|
#g.save(pd("gender-by-name.svm"), final=True)
|
|
#print(g.classify("Arwen"))
|
|
#print(g.classify("Jabba"))
|