from __future__ import print_function from __future__ import unicode_literals from builtins import str, bytes, dict, int import os import sys sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..")) from pattern.web import Wiktionary, DOM from pattern.db import csv, pd # This example retrieves male and female given names from Wiktionary (http://en.wiktionary.org). # It then trains a classifier that can predict the gender of unknown names (about 78% correct). # The classifier is small (80KB) and fast. w = Wiktionary(language="en") f = csv() # csv() is a short alias for Datasheet(). # Collect male and female given names from Wiktionary. # Store the data as (name, gender)-rows in a CSV-file. # The pd() function returns the parent directory of the current script, # so pd("given-names.csv") = pattern/examples/01-web/given-names.csv. for gender in ("male", "female"): for ch in ("abcdefghijklmnopqrstuvwxyz"): p = w.search("Appendix:%s_given_names/%s" % (gender.capitalize(), ch.capitalize()), cached=True) for name in p.links: if not name.startswith("Appendix:"): f.append((name, gender[0])) f.save(pd("given-names.csv")) print(ch, gender) # Create a classifier that predicts gender based on name. from pattern.vector import SVM, chngrams, count, kfoldcv class GenderByName(SVM): def train(self, name, gender=None): SVM.train(self, self.vector(name), gender) def classify(self, name): return SVM.classify(self, self.vector(name)) def vector(self, name): """ Returns a dictionary with character bigrams and suffix. For example, "Felix" => {"Fe":1, "el":1, "li":1, "ix":1, "ix$":1, 5:1} """ v = chngrams(name, n=2) v = count(v) v[name[-2:] + "$"] = 1 v[len(name)] = 1 return v data = csv(pd("given-names.csv")) # Test average (accuracy, precision, recall, F-score, standard deviation). print(kfoldcv(GenderByName, data, folds=3)) # (0.81, 0.79, 0.77, 0.78, 0.00) # Train and save the classifier in the current folder. # With final=True, discards the original training data (= smaller file). g = GenderByName(train=data) g.save(pd("gender-by-name.svm"), final=True) # Next time, we can simply load the trained classifier. # Keep in mind that the script that loads the classifier # must include the code for the GenderByName class description, # otherwise Python won't know how to load the data. g = GenderByName.load(pd("gender-by-name.svm")) for name in ( "Felix", "Felicia", "Rover", "Kitty", "Legolas", "Arwen", "Jabba", "Leia", "Flash", "Barbarella"): print(name, g.classify(name)) # In the example above, Arwen and Jabba are misclassified. # We can of course improve the classifier by hand: #g.train("Arwen", gender="f") #g.train("Jabba", gender="m") #g.save(pd("gender-by-name.svm"), final=True) #print(g.classify("Arwen")) #print(g.classify("Jabba"))