You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
283 lines
8.7 KiB
Python
283 lines
8.7 KiB
Python
#### PATTERN | ES ##################################################################################
|
|
# -*- coding: utf-8 -*-
|
|
# Copyright (c) 2012 University of Antwerp, Belgium
|
|
# Author: Tom De Smedt <tom@organisms.be>
|
|
# License: BSD (see LICENSE.txt for details).
|
|
# http://www.clips.ua.ac.be/pages/pattern
|
|
|
|
####################################################################################################
|
|
# Spanish linguistic tools using fast regular expressions.
|
|
|
|
from __future__ import unicode_literals
|
|
from __future__ import division
|
|
|
|
from builtins import str, bytes, dict, int
|
|
from builtins import map, zip, filter
|
|
from builtins import object, range
|
|
|
|
import os
|
|
import sys
|
|
|
|
# Directory that contains this module; used further down to locate the
# es-*.txt data files that sit next to it.
try:
    MODULE = os.path.dirname(os.path.realpath(__file__))
except Exception:
    # Best-effort fallback (e.g. __file__ is not defined): use relative paths.
    # Catching Exception rather than using a bare except keeps SystemExit and
    # KeyboardInterrupt from being silently swallowed here.
    MODULE = ""

# Temporarily put the directory four levels above this module at the front of
# sys.path (presumably so the `pattern` imports that follow resolve to this
# source tree); the entry is popped again once those imports are done.
sys.path.insert(0, os.path.join(MODULE, "..", "..", "..", ".."))
|
|
|
|
# Import parser base classes.
|
|
from pattern.text import (
|
|
Lexicon, Model, Morphology, Context, Parser as _Parser, ngrams, pprint, commandline,
|
|
PUNCTUATION
|
|
)
|
|
# Import parser universal tagset.
|
|
from pattern.text import (
|
|
penntreebank2universal,
|
|
PTB, PENN, UNIVERSAL,
|
|
NOUN, VERB, ADJ, ADV, PRON, DET, PREP, ADP, NUM, CONJ, INTJ, PRT, PUNC, X
|
|
)
|
|
# Import parse tree base classes.
|
|
from pattern.text.tree import (
|
|
Tree, Text, Sentence, Slice, Chunk, PNPChunk, Chink, Word, table,
|
|
SLASH, WORD, POS, CHUNK, PNP, REL, ANCHOR, LEMMA, AND, OR
|
|
)
|
|
# Import spelling base class.
|
|
from pattern.text import (
|
|
Spelling
|
|
)
|
|
# Import verb tenses.
|
|
from pattern.text import (
|
|
INFINITIVE, PRESENT, PAST, FUTURE, CONDITIONAL,
|
|
FIRST, SECOND, THIRD,
|
|
SINGULAR, PLURAL, SG, PL,
|
|
INDICATIVE, IMPERATIVE, SUBJUNCTIVE,
|
|
IMPERFECTIVE, PERFECTIVE, PROGRESSIVE,
|
|
IMPERFECT, PRETERITE,
|
|
PARTICIPLE, GERUND
|
|
)
|
|
# Import inflection functions.
|
|
from pattern.text.es.inflect import (
|
|
article, referenced, DEFINITE, INDEFINITE,
|
|
MASCULINE, MALE, FEMININE, FEMALE, NEUTER, NEUTRAL, PLURAL, M, F, N, PL,
|
|
pluralize, singularize, NOUN, VERB, ADJECTIVE,
|
|
verbs, conjugate, lemma, lexeme, tenses,
|
|
predicative, attributive
|
|
)
|
|
# Import all submodules.
|
|
from pattern.text.es import inflect
|
|
|
|
# Remove the first sys.path entry again (the one inserted before the imports above).
sys.path.pop(0)
|
|
|
|
#--- SPANISH PARSER --------------------------------------------------------------------------------
|
|
# The Spanish parser (accuracy 92%) is based on the Spanish portion of Wikicorpus v.1.0 (FDL license),
|
|
# using 1.5M words from the tagged sections 10000-15000.
|
|
# Samuel Reese, Gemma Boleda, Montse Cuadros, Lluís Padró, German Rigau.
|
|
# Wikicorpus: A Word-Sense Disambiguated Multilingual Wikipedia Corpus.
|
|
# Proceedings of 7th Language Resources and Evaluation Conference (LREC'10),
|
|
# La Valleta, Malta. May, 2010.
|
|
# http://www.lsi.upc.edu/~nlp/wikicorpus/
|
|
|
|
# The lexicon uses the Parole tagset:
|
|
# http://www.lsi.upc.edu/~nlp/SVMTool/parole.html
|
|
# http://nlp.lsi.upc.edu/freeling/doc/tagsets/tagset-es.html
|
|
# Name of the Parole tagset, usable as a `tagset` value.
PAROLE = "parole"

# Lookup table from Parole tags (keys) to Penn Treebank II tags (values).
# The trailing comment on each entry is an example word for that Parole tag.
parole = {
    # Adjectives.
    "AO": "JJ",    # primera
    "AQ": "JJ",    # absurdo
    # Conjunctions.
    "CC": "CC",    # e
    "CS": "IN",    # porque
    # Determiners.
    "DA": "DT",    # el
    "DD": "DT",    # ese
    "DI": "DT",    # mucha
    "DP": "PRP$",  # mi, nuestra
    "DT": "DT",    # cuántos
    # Punctuation.
    "Fa": ".",     # !
    "Fc": ",",     # ,
    "Fd": ":",     # :
    "Fe": "\"",    # "
    "Fg": ".",     # -
    "Fh": ".",     # /
    "Fi": ".",     # ?
    "Fp": ".",     # .
    "Fr": ".",     # >>
    "Fs": ".",     # ...
    "Fpa": "(",    # (
    "Fpt": ")",    # )
    "Fx": ".",     # ;
    "Fz": ".",     #
    # Interjections.
    "I": "UH",     # ehm
    # Nouns.
    "NC": "NN",    # islam
    "NCS": "NN",   # guitarra
    "NCP": "NNS",  # guitarras
    "NP": "NNP",   # Óscar
    # Pronouns.
    "P0": "PRP",   # se
    "PD": "DT",    # ése
    "PI": "DT",    # uno
    "PP": "PRP",   # vos
    "PR": "WP$",   # qué
    "PT": "WP$",   # qué
    "PX": "PRP$",  # mío
    # Adverbs.
    "RG": "RB",    # tecnológicamente
    "RN": "RB",    # no
    # Prepositions.
    "SP": "IN",    # por
    # Verbs.
    # NOTE(review): example words are carried over unchanged; the VMP example
    # ("comparando") looks like a gerund although the tag maps to VBN - confirm.
    "VAG": "VBG",  # habiendo
    "VAI": "MD",   # había
    "VAN": "MD",   # haber
    "VAS": "MD",   # haya
    "VMG": "VBG",  # habiendo
    "VMI": "VB",   # habemos
    "VMM": "VB",   # compare
    "VMN": "VB",   # comparecer
    "VMP": "VBN",  # comparando
    "VMS": "VB",   # compararan
    "VSG": "VBG",  # comparando
    "VSI": "VB",   # será
    "VSN": "VB",   # ser
    "VSP": "VBN",  # sido
    "VSS": "VB",   # sea
    # Dates.
    "W": "NN",     # septiembre
    # Numbers.
    "Z": "CD",     # 1,7
    "Zd": "CD",    # 1,7
    "Zm": "CD",    # £1,7
    "Zp": "CD",    # 1,7%
}
|
|
|
|
|
|
def parole2penntreebank(token, tag):
    """ Maps a Parole tag onto its Penn Treebank II counterpart.
        Returns a (token, tag)-tuple; a tag that is not in the parole table is passed through as-is.
        For example: importantísimo/AQ => importantísimo/JJ
    """
    penn = parole.get(tag, tag)
    return (token, penn)
|
|
|
|
|
|
def parole2universal(token, tag):
    """ Maps a Parole tag onto a universal tag and returns a (token, tag)-tuple.
        For example: importantísimo/AQ => importantísimo/ADJ
    """
    # A few Parole tags are mapped directly instead of going through Penn Treebank.
    if tag in ("P0", "PD", "PI", "PP", "PR", "PT", "PX"):
        universal = PRON
    elif tag == "DP":
        universal = DET
    elif tag == "CS":
        universal = CONJ
    else:
        # Everything else: Parole => Penn Treebank => universal.
        return penntreebank2universal(*parole2penntreebank(token, tag))
    return (token, universal)
|
|
|
|
# Spanish abbreviations; Parser.find_tokens() passes this set as its default "abbreviations" option.
ABBREVIATIONS = {
    "a.C.", "a.m.", "apdo.", "aprox.", "Av.", "Avda.",
    "c.c.", "D.", "Da.", "d.C.", "d.j.C.", "dna.",
    "Dr.", "Dra.", "esq.", "etc.", "Gob.", "h.",
    "m.n.", "no.", "núm.", "pág.", "P.D.", "P.S.",
    "p.ej.", "p.m.", "Profa.", "q.e.p.d.", "S.A.", "S.L.",
    "Sr.", "Sra.", "Srta.", "s.s.s.", "tel.", "Ud.",
    "Vd.", "Uds.", "Vds.", "v.", "vol.", "W.C.",
}
|
|
|
|
|
|
def find_lemmata(tokens):
    """ Appends a lowercase lemma to every token, where each token is a [word, part-of-speech] list.
        Determiners and plural nouns are singularized, adjectives are made predicative,
        verbs are reduced to the infinitive; any other word is its own lemma.
        The tokens are modified in place and the same list is returned.
    """
    for entry in tokens:
        form, tag = entry[0], entry[1]
        if tag.startswith("DT"):
            base = singularize(form, pos="DT")
        elif tag.startswith("JJ"):
            base = predicative(form)
        elif tag == "NNS":
            base = singularize(form)
        elif tag.startswith(("VB", "MD")):
            # Fall back to the word itself if no infinitive is found.
            base = conjugate(form, INFINITIVE) or form
        else:
            base = form
        entry.append(base.lower())
    return tokens
|
|
|
|
|
|
class Parser(_Parser):
    """ Spanish parser: fills in Spanish defaults (abbreviations, lemmatizer, Parole tag mapping)
        before delegating to the base parser.
    """

    def find_tokens(self, tokens, **kwargs):
        """ Tokenizes with the base parser, using the Spanish ABBREVIATIONS and
            an empty "replace" dict unless the caller supplies its own.
        """
        kwargs.setdefault("abbreviations", ABBREVIATIONS)
        kwargs.setdefault("replace", {})
        return _Parser.find_tokens(self, tokens, **kwargs)

    def find_lemmata(self, tokens, **kwargs):
        """ Annotates the tokens with lemmata using the module-level find_lemmata().
            Keyword arguments are accepted but not used.
        """
        return find_lemmata(tokens)

    def find_tags(self, tokens, **kwargs):
        """ Tags with the base parser, choosing a default "map" function from the "tagset" option:
            PENN (or no tagset) => Penn Treebank tags, UNIVERSAL => universal tags,
            PAROLE => the lexicon's Parole tags unchanged.
            A "map" passed by the caller always takes precedence.
        """
        tagset = kwargs.get("tagset")
        if tagset in (PENN, None):
            kwargs.setdefault("map", parole2penntreebank)
        elif tagset == UNIVERSAL:
            kwargs.setdefault("map", parole2universal)
        elif tagset == PAROLE:
            # Compare by value: an identity check would miss an equal string
            # that is a different object.
            kwargs.setdefault("map", lambda token, tag: (token, tag))
        return _Parser.find_tags(self, tokens, **kwargs)
|
|
|
|
# Module-wide Spanish parser; its data files are looked up next to this module.
parser = Parser(
    lexicon=os.path.join(MODULE, "es-lexicon.txt"),
    frequency=os.path.join(MODULE, "es-frequency.txt"),
    morphology=os.path.join(MODULE, "es-morphology.txt"),
    context=os.path.join(MODULE, "es-context.txt"),
    default=("NCS", "NP", "Z"),
    language="es",
)

# Expose the parser's lexicon at module level.
lexicon = parser.lexicon

# Spelling model used by suggest().
spelling = Spelling(path=os.path.join(MODULE, "es-spelling.txt"))
|
|
|
|
|
|
def tokenize(s, *args, **kwargs):
    """ Splits punctuation marks from words in the given string and returns a list of sentences.
        All arguments are forwarded to the module-level parser's find_tokens().
    """
    sentences = parser.find_tokens(s, *args, **kwargs)
    return sentences
|
|
|
|
|
|
def parse(s, *args, **kwargs):
    """ Parses the given string with the module-level Spanish parser and returns a tagged Unicode string.
        All arguments are forwarded to parser.parse().
    """
    tagged = parser.parse(s, *args, **kwargs)
    return tagged
|
|
|
|
|
|
def parsetree(s, *args, **kwargs):
    """ Parses the given string with parse() and wraps the result in a Text object.
        All arguments are forwarded to parse().
    """
    tagged = parse(s, *args, **kwargs)
    return Text(tagged)
|
|
|
|
|
|
def tree(s, token=None):
    """ Returns a parsed Text from the given parsed string.
        The token parameter lists the tag format of the parsed string;
        if omitted (None), it is [WORD, POS, CHUNK, PNP, REL, LEMMA].
    """
    if token is None:
        # Build the default per call, so no list object is shared between calls.
        token = [WORD, POS, CHUNK, PNP, REL, LEMMA]
    return Text(s, token)
|
|
|
|
|
|
def tag(s, tokenize=True, encoding="utf-8", **kwargs):
    """ Parses the given string and returns a flat list of (token, tag)-tuples,
        built from the first two fields of every token in every sentence.
    """
    # The positional flags are passed to parse() as-is
    # (presumably tags on; chunks, relations and lemmata off - confirm against Parser.parse).
    tagged = parse(s, tokenize, True, False, False, False, encoding, **kwargs)
    return [(token[0], token[1]) for sentence in tagged.split() for token in sentence]
|
|
|
|
|
|
def keywords(s, top=10, **kwargs):
    """ Returns a sorted list of keywords in the given string, as found by parser.find_keywords().
        Defaults: the parser's frequency data, the given top, pos=("NN",) and ignore=("rt",);
        keyword arguments override these defaults.
    """
    options = {
        "frequency": parser.frequency,
        "top": top,
        "pos": ("NN",),
        "ignore": ("rt",),
    }
    options.update(kwargs)
    return parser.find_keywords(s, **options)
|
|
|
|
|
|
def suggest(w):
    """ Returns spelling corrections for the given word as a list of (word, confidence)-tuples,
        taken from the module-level spelling model.
    """
    candidates = spelling.suggest(w)
    return candidates
|
|
|
|
# Alias: split() is the same function object as tree().
split = tree # Backwards compatibility.
|
|
|
|
#---------------------------------------------------------------------------------------------------
|
|
# python -m pattern.es xml -s "A quien se hace de miel las moscas le comen." -OTCL
|
|
|
|
if __name__ == "__main__":
    # Run as a script: hand parse() to the imported commandline() helper.
    commandline(parse)
|