You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

97 lines
2.9 KiB
Python

from __future__ import print_function
from __future__ import unicode_literals
from builtins import str, bytes, dict, int
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
from pattern.web import Wiktionary, DOM
from pattern.db import csv, pd
# This example retrieves male and female given names from Wiktionary (http://en.wiktionary.org).
# It then trains a classifier that can predict the gender of unknown names (about 78% correct).
# The classifier is small (80KB) and fast.
w = Wiktionary(language="en")
f = csv() # csv() is a short alias for Datasheet().
# Collect male and female given names from Wiktionary.
# Store the data as (name, gender)-rows in a CSV-file.
# The pd() function returns the parent directory of the current script,
# so pd("given-names.csv") = pattern/examples/01-web/given-names.csv.
for gender in ("male", "female"):
for ch in ("abcdefghijklmnopqrstuvwxyz"):
p = w.search("Appendix:%s_given_names/%s" % (gender.capitalize(), ch.capitalize()), cached=True)
for name in p.links:
if not name.startswith("Appendix:"):
f.append((name, gender[0]))
f.save(pd("given-names.csv"))
print(ch, gender)
# Create a classifier that predicts gender based on name.
from pattern.vector import SVM, chngrams, count, kfoldcv
class GenderByName(SVM):
def train(self, name, gender=None):
SVM.train(self, self.vector(name), gender)
def classify(self, name):
return SVM.classify(self, self.vector(name))
def vector(self, name):
""" Returns a dictionary with character bigrams and suffix.
For example, "Felix" => {"Fe":1, "el":1, "li":1, "ix":1, "ix$":1, 5:1}
"""
v = chngrams(name, n=2)
v = count(v)
v[name[-2:] + "$"] = 1
v[len(name)] = 1
return v
data = csv(pd("given-names.csv"))
# Test average (accuracy, precision, recall, F-score, standard deviation).
print(kfoldcv(GenderByName, data, folds=3)) # (0.81, 0.79, 0.77, 0.78, 0.00)
# Train and save the classifier in the current folder.
# With final=True, discards the original training data (= smaller file).
g = GenderByName(train=data)
g.save(pd("gender-by-name.svm"), final=True)
# Next time, we can simply load the trained classifier.
# Keep in mind that the script that loads the classifier
# must include the code for the GenderByName class description,
# otherwise Python won't know how to load the data.
g = GenderByName.load(pd("gender-by-name.svm"))
for name in (
"Felix",
"Felicia",
"Rover",
"Kitty",
"Legolas",
"Arwen",
"Jabba",
"Leia",
"Flash",
"Barbarella"):
print(name, g.classify(name))
# In the example above, Arwen and Jabba are misclassified.
# We can of course improve the classifier by hand:
#g.train("Arwen", gender="f")
#g.train("Jabba", gender="m")
#g.save(pd("gender-by-name.svm"), final=True)
#print(g.classify("Arwen"))
#print(g.classify("Jabba"))