#### PATTERN | ES ##################################################################################
# -*- coding: utf-8 -*-
# Copyright (c) 2012 University of Antwerp, Belgium
# Author: Tom De Smedt <tom@organisms.be>
# License: BSD (see LICENSE.txt for details).
# http://www.clips.ua.ac.be/pages/pattern

####################################################################################################
# Spanish linguistic tools using fast regular expressions.

from __future__ import unicode_literals
from __future__ import division

from builtins import str, bytes, dict, int
from builtins import map, zip, filter
from builtins import object, range

import os
import sys

try:
    MODULE = os.path.dirname(os.path.realpath(__file__))
except:
    MODULE = ""

sys.path.insert(0, os.path.join(MODULE, "..", "..", "..", ".."))

# Import parser base classes.
from pattern.text import (
    Lexicon, Model, Morphology, Context, Parser as _Parser, ngrams, pprint, commandline,
    PUNCTUATION
)
# Import parser universal tagset.
from pattern.text import (
    penntreebank2universal,
    PTB, PENN, UNIVERSAL,
    NOUN, VERB, ADJ, ADV, PRON, DET, PREP, ADP, NUM, CONJ, INTJ, PRT, PUNC, X
)
# Import parse tree base classes.
from pattern.text.tree import (
    Tree, Text, Sentence, Slice, Chunk, PNPChunk, Chink, Word, table,
    SLASH, WORD, POS, CHUNK, PNP, REL, ANCHOR, LEMMA, AND, OR
)
# Import spelling base class.
from pattern.text import (
    Spelling
)
# Import verb tenses.
from pattern.text import (
    INFINITIVE, PRESENT, PAST, FUTURE, CONDITIONAL,
    FIRST, SECOND, THIRD,
    SINGULAR, PLURAL, SG, PL,
    INDICATIVE, IMPERATIVE, SUBJUNCTIVE,
    IMPERFECTIVE, PERFECTIVE, PROGRESSIVE,
    IMPERFECT, PRETERITE,
    PARTICIPLE, GERUND
)
# Import inflection functions.
from pattern.text.es.inflect import (
    article, referenced, DEFINITE, INDEFINITE,
    MASCULINE, MALE, FEMININE, FEMALE, NEUTER, NEUTRAL, PLURAL, M, F, N, PL,
    pluralize, singularize, NOUN, VERB, ADJECTIVE,
    verbs, conjugate, lemma, lexeme, tenses,
    predicative, attributive
)
# Import all submodules.
from pattern.text.es import inflect

sys.path.pop(0)

#--- SPANISH PARSER --------------------------------------------------------------------------------
# The Spanish parser (accuracy 92%) is based on the Spanish portion of Wikicorpus v1.0 (FDL license),
# using 1.5M words from the tagged sections 10000-15000.
# Samuel Reese, Gemma Boleda, Montse Cuadros, Lluís Padró, German Rigau.
# Wikicorpus: A Word-Sense Disambiguated Multilingual Wikipedia Corpus.
# Proceedings of the 7th Language Resources and Evaluation Conference (LREC'10),
# La Valletta, Malta. May, 2010.
# http://www.lsi.upc.edu/~nlp/wikicorpus/

# The lexicon uses the Parole tagset:
# http://www.lsi.upc.edu/~nlp/SVMTool/parole.html
# http://nlp.lsi.upc.edu/freeling/doc/tagsets/tagset-es.html
PAROLE = "parole"
parole = {
     "AO": "JJ",   # primera
     "AQ": "JJ",   # absurdo
     "CC": "CC",   # e
     "CS": "IN",   # porque
     "DA": "DT",   # el
     "DD": "DT",   # ese
     "DI": "DT",   # mucha
     "DP": "PRP$", # mi, nuestra
     "DT": "DT",   # cuántos
     "Fa": ".",    # !
     "Fc": ",",    # ,
     "Fd": ":",    # :
     "Fe": "\"",   # "
     "Fg": ".",    # -
     "Fh": ".",    # /
     "Fi": ".",    # ?
     "Fp": ".",    # .
     "Fr": ".",    # >>
     "Fs": ".",    # ...
    "Fpa": "(",    # (
    "Fpt": ")",    # )
     "Fx": ".",    # ;
     "Fz": ".",    #
      "I": "UH",   # ehm
     "NC": "NN",   # islam
    "NCS": "NN",   # guitarra
    "NCP": "NNS",  # guitarras
     "NP": "NNP",  # Óscar
     "P0": "PRP",  # se
     "PD": "DT",   # ése
     "PI": "DT",   # uno
     "PP": "PRP",  # vos
     "PR": "WP$",  # qué
     "PT": "WP$",  # qué
     "PX": "PRP$", # mío
     "RG": "RB",   # tecnológicamente
     "RN": "RB",   # no
     "SP": "IN",   # por
    "VAG": "VBG",  # habiendo
    "VAI": "MD",   # había
    "VAN": "MD",   # haber
    "VAS": "MD",   # haya
    "VMG": "VBG",  # habiendo
    "VMI": "VB",   # habemos
    "VMM": "VB",   # compare
    "VMN": "VB",   # comparecer
    "VMP": "VBN",  # comparado
    "VMS": "VB",   # compararan
    "VSG": "VBG",  # siendo
    "VSI": "VB",   # será
    "VSN": "VB",   # ser
    "VSP": "VBN",  # sido
    "VSS": "VB",   # sea
      "W": "NN",   # septiembre
      "Z": "CD",   # 1,7
     "Zd": "CD",   # 1,7
     "Zm": "CD",   # £1,7
     "Zp": "CD",   # 1,7%
}


def parole2penntreebank(token, tag):
    """ Converts a Parole tag to a Penn Treebank II tag.
        For example: importantísimo/AQ => importantísimo/JJ
    """
    return (token, parole.get(tag, tag))
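
# A minimal usage sketch (illustrative; "NCP" maps to "NNS" in the parole dict above,
# and unknown tags pass through unchanged):
#   parole2penntreebank("gatos", "NCP") # => ("gatos", "NNS")
#   parole2penntreebank("y", "CC")      # => ("y", "CC")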


def parole2universal(token, tag):
    """ Converts a Parole tag to a universal tag.
        For example: importantísimo/AQ => importantísimo/ADJ
    """
    if tag == "CS":
        return (token, CONJ)
    if tag == "DP":
        return (token, DET)
    if tag in ("P0", "PD", "PI", "PP", "PR", "PT", "PX"):
        return (token, PRON)
    return penntreebank2universal(*parole2penntreebank(token, tag))
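
# A minimal usage sketch (illustrative; CONJ, DET and ADJ are the universal tag
# constants imported from pattern.text above):
#   parole2universal("porque", "CS")  # => ("porque", CONJ)
#   parole2universal("mi", "DP")      # => ("mi", DET)
#   parole2universal("absurdo", "AQ") # => ("absurdo", ADJ), via parole2penntreebank()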


ABBREVIATIONS = set((
    "a.C.", "a.m.", "apdo.", "aprox.", "Av.", "Avda.", "c.c.", "D.", "Da.", "d.C.",
    "d.j.C.", "dna.", "Dr.", "Dra.", "esq.", "etc.", "Gob.", "h.", "m.n.", "no.",
    "núm.", "pág.", "P.D.", "P.S.", "p.ej.", "p.m.", "Profa.", "q.e.p.d.", "S.A.",
    "S.L.", "Sr.", "Sra.", "Srta.", "s.s.s.", "tel.", "Ud.", "Vd.", "Uds.", "Vds.",
    "v.", "vol.", "W.C."
))


def find_lemmata(tokens):
    """ Annotates the tokens with lemmata for plural nouns and conjugated verbs,
        where each token is a [word, part-of-speech] list.
    """
    for token in tokens:
        word, pos, lemma = token[0], token[1], token[0]
        if pos.startswith(("DT",)):
            lemma = singularize(word, pos="DT")
        if pos.startswith(("JJ",)):
            lemma = predicative(word)
        if pos == "NNS":
            lemma = singularize(word)
        if pos.startswith(("VB", "MD")):
            lemma = conjugate(word, INFINITIVE) or word
        token.append(lemma.lower())
    return tokens
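
# A minimal usage sketch (illustrative; lemmas are produced by singularize() and
# conjugate() from pattern.text.es.inflect):
#   find_lemmata([["gatos", "NNS"], ["duermen", "VB"]])
#   # => [["gatos", "NNS", "gato"], ["duermen", "VB", "dormir"]]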


class Parser(_Parser):

    def find_tokens(self, tokens, **kwargs):
        kwargs.setdefault("abbreviations", ABBREVIATIONS)
        kwargs.setdefault("replace", {})
        return _Parser.find_tokens(self, tokens, **kwargs)

    def find_lemmata(self, tokens, **kwargs):
        return find_lemmata(tokens)

    def find_tags(self, tokens, **kwargs):
        if kwargs.get("tagset") in (PENN, None):
            kwargs.setdefault("map", lambda token, tag: parole2penntreebank(token, tag))
        if kwargs.get("tagset") == UNIVERSAL:
            kwargs.setdefault("map", lambda token, tag: parole2universal(token, tag))
        if kwargs.get("tagset") == PAROLE:
            kwargs.setdefault("map", lambda token, tag: (token, tag))
        return _Parser.find_tags(self, tokens, **kwargs)


parser = Parser(
       lexicon = os.path.join(MODULE, "es-lexicon.txt"),
     frequency = os.path.join(MODULE, "es-frequency.txt"),
    morphology = os.path.join(MODULE, "es-morphology.txt"),
       context = os.path.join(MODULE, "es-context.txt"),
       default = ("NCS", "NP", "Z"),
      language = "es"
)
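
# A minimal usage sketch (illustrative; the tagset keyword switches the output tags,
# see Parser.find_tags() above):
#   parser.parse("El gato duerme.")                    # Penn Treebank II tags (default)
#   parser.parse("El gato duerme.", tagset=UNIVERSAL)  # universal tags
#   parser.parse("El gato duerme.", tagset=PAROLE)     # raw Parole tags from the lexicon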

lexicon = parser.lexicon # Expose lexicon.

spelling = Spelling(
    path = os.path.join(MODULE, "es-spelling.txt")
)


def tokenize(s, *args, **kwargs):
    """ Returns a list of sentences, where punctuation marks have been split from words.
    """
    return parser.find_tokens(s, *args, **kwargs)
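
# A minimal usage sketch (illustrative; "Sr." is kept intact because it is listed
# in ABBREVIATIONS above):
#   tokenize("El Sr. García llegó tarde.")
#   # => [u"El Sr. García llegó tarde ."]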


def parse(s, *args, **kwargs):
    """ Returns a tagged Unicode string.
    """
    return parser.parse(s, *args, **kwargs)
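
# A minimal usage sketch (output is illustrative; exact tags depend on the bundled
# es-lexicon.txt, and each token is formatted as word/POS/chunk/PNP):
#   parse("El gato duerme.")
#   # => u"El/DT/B-NP/O gato/NN/I-NP/O duerme/VB/B-VP/O ././O/O"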


def parsetree(s, *args, **kwargs):
    """ Returns a parsed Text from the given string.
    """
    return Text(parse(s, *args, **kwargs))
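
# A minimal usage sketch (illustrative; Text and Sentence are the parse tree
# classes imported from pattern.text.tree above):
#   for sentence in parsetree("El gato negro duerme."):
#       for chunk in sentence.chunks:
#           print(chunk.type, [w.string for w in chunk.words])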


def tree(s, token=[WORD, POS, CHUNK, PNP, REL, LEMMA]):
    """ Returns a parsed Text from the given parsed string.
    """
    return Text(s, token)


def tag(s, tokenize=True, encoding="utf-8", **kwargs):
    """ Returns a list of (token, tag)-tuples from the given string.
    """
    tags = []
    for sentence in parse(s, tokenize, True, False, False, False, encoding, **kwargs).split():
        for token in sentence:
            tags.append((token[0], token[1]))
    return tags
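
# A minimal usage sketch (illustrative; exact tags depend on the lexicon):
#   tag("El gato duerme.")
#   # => [(u"El", u"DT"), (u"gato", u"NN"), (u"duerme", u"VB"), (u".", u".")]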


def keywords(s, top=10, **kwargs):
    """ Returns a sorted list of keywords in the given string.
    """
    return parser.find_keywords(s, **dict({
        "frequency": parser.frequency,
              "top": top,
              "pos": ("NN",),
           "ignore": ("rt",)}, **kwargs))
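
# A minimal usage sketch (illustrative; by default only nouns ("NN") are counted,
# ranked by relevance):
#   keywords("El gato y el perro juegan con el gato.", top=2)
#   # => [u"gato", u"perro"]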


def suggest(w):
    """ Returns a list of (word, confidence)-tuples of spelling corrections.
    """
    return spelling.suggest(w)
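
# A minimal usage sketch (illustrative; candidates come from es-spelling.txt and
# are sorted by confidence):
#   suggest("gatp")
#   # => [(u"gato", ...)] with (word, confidence)-tuples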

split = tree # Backwards compatibility.

#---------------------------------------------------------------------------------------------------
# python -m pattern.es xml -s "A quien se hace de miel las moscas le comen." -OTCL

if __name__ == "__main__":
    commandline(parse)