# coding: utf-8
from __future__ import print_function
from __future__ import unicode_literals

from builtins import str, bytes, dict, int

import os
import sys

# Put the directory two levels above this file first on the import path
# (presumably so that the pattern package found there is the one imported).
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

# ---------------------------------------------------------------------------
# 1) Penn Treebank tags (the default).
# ---------------------------------------------------------------------------
# Unless told otherwise, parse() labels words with part-of-speech tags
# from the Penn Treebank tagset:
# http://www.clips.ua.ac.be/pages/penn-treebank-tagset
# It is worth spending a few minutes studying the tagset and its abbreviations.

from pattern.en import parse as parse_en

print(parse_en("the black cats", chunks=False))  # the/DT black/JJ cats/NNS
print("")

# ... where DT = determiner, JJ = adjective, NN = noun.
# The same holds for every language that Pattern supports:

from pattern.de import parse as parse_de
from pattern.es import parse as parse_es
from pattern.fr import parse as parse_fr
from pattern.it import parse as parse_it
from pattern.nl import parse as parse_nl
from pattern.ru import parse as parse_ru

# Each row pairs a language-specific parser with a sample phrase;
# the trailing comment shows the expected tagged output.
for tagger, phrase in (
    (parse_de, "die schwarzen Katzen"),         # die/DT schwarzen/JJ Katzen/NNS
    (parse_es, "los gatos negros"),             # los/DT gatos/NNS negros/JJ
    (parse_fr, "les chats noirs"),              # les/DT chats/NNS noirs/JJ
    (parse_it, "i gatti neri"),                 # i/DT gatti/NNS neri/JJ
    (parse_nl, "de zwarte katten"),             # de/DT zwarte/JJ katten/NNS
    (parse_ru, "какой сегодня хороший день!"),  # какой/DT сегодня/RB хороший/JJ день/NN !/.
):
    print(tagger(phrase, chunks=False))
print("")

# ---------------------------------------------------------------------------
# 2) Native tagsets.
# ---------------------------------------------------------------------------
# In some cases the tags above are the result of mapping the language's
# original tagset to Penn Treebank:
# e.g., for German (STTS), Spanish (PAROLE), Dutch (WOTAN).
# Passing that tagset explicitly requests the original tags instead.

from pattern.de import STTS
from pattern.es import PAROLE
from pattern.nl import WOTAN

for tagger, phrase, native_tagset in (
    (parse_de, "die schwarzen Katzen", STTS),
    (parse_es, "los gatos negros", PAROLE),
    (parse_nl, "de zwarte katten", WOTAN),
):
    print(tagger(phrase, chunks=False, tagset=native_tagset))
print("")

# ---------------------------------------------------------------------------
# 3) The universal tagset.
# ---------------------------------------------------------------------------
# Not every language is equally suited to Penn Treebank,
# which was originally developed for English.
# This becomes more problematic as more languages are added to Pattern.
# It is sometimes difficult to fit determiners, pronouns and prepositions
# of a particular language to Penn Treebank tags (e.g., Italian "che").

# With parse(tagset=UNIVERSAL), a simplified universal tagset is used,
# loosely corresponding to the recommendations of Petrov (2012):
# http://www.petrovi.de/data/lrec.pdf

# This simplified tagset still contains all the information that most users require.

from pattern.text import UNIVERSAL
from pattern.text import NOUN, VERB, ADJ, ADV, PRON, DET, PREP, NUM, CONJ, INTJ, PRT, PUNC, X

# NOUN = "NN" (noun)
# VERB = "VB" (verb)
#  ADJ = "JJ" (adjective)
#  ADV = "RB" (adverb)
# PRON = "PR" (pronoun)
#  DET = "DT" (determiner)
# PREP = "PP" (preposition)
#  NUM = "NO" (number)
# CONJ = "CJ" (conjunction)
# INTJ = "UH" (interjection)
#  PRT = "PT" (particle)
# PUNC = "."  (punctuation)
#    X = "X"  (foreign word, abbreviation)

# Combine this with the multilingual pattern.text.parse() function
# when the code has to handle many languages at once:

from pattern.text import parse

for code, phrase in (
    ("de", "die schwarzen Katzen"),
    ("en", "the black cats"),
    ("es", "los gatos negros"),
    ("fr", "les chats noirs"),
    ("it", "i gatti neri"),
    ("nl", "de zwarte katten"),
):
    print(parse(phrase, chunks=False, language=code, tagset=UNIVERSAL))
print("")

# This comes at the expense of (in this example) losing information about plural nouns (NNS => NN).
# But it may be more comfortable to build multilingual apps
# using the universal constants (e.g., PRON, PREP, CONJ),
# instead of learning the Penn Treebank tagset by heart,
# or wondering why the Italian "che" is tagged "PRP", "IN" or "CC"
# (in the universal tagset it is a PRON or a CONJ).

from pattern.text import parsetree

# Print only the words that the universal tagset labels as pronouns.
italian_tree = parsetree("i gatti neri che sono la mia", language="it", tagset=UNIVERSAL)
for sentence in italian_tree:
    pronouns = (word for word in sentence.words if word.tag == PRON)
    for pronoun in pronouns:
        print(pronoun)

# ---------------------------------------------------------------------------
# 4) Language guessing.
# ---------------------------------------------------------------------------
# The language() function in pattern.text can be used to guess the language of a text.
# It returns a (language code, confidence)-tuple.
# It can guess en, es, de, fr, it, nl.

from pattern.text import language

print("")
for sample in (
    "the cat sat on the mat",              # ("en", 1.00)
    "de kat zat op de mat",                # ("nl", 0.80)
    "le chat s'était assis sur le tapis",  # ("fr", 0.86)
):
    print(language(sample))