#### PATTERN | XX ##################################################################################
# -*- coding: utf-8 -*-
# Copyright (c) year, institute, country
# Author: Name (e-mail)
# License: BSD (see LICENSE.txt for details).
# http://www.clips.ua.ac.be/pages/pattern

####################################################################################################
# Template for pattern.xx, bundling natural language processing tools for language XXXXX.
# The module bundles a shallow parser (part-of-speech tagger, chunker, lemmatizer)
# with functions for word inflection (singularization, pluralization, conjugation)
# and sentiment analysis.

# Base classes for the parser, verb table and sentiment lexicon are inherited from pattern.text.
# The parser can be subclassed with a custom tokenizer (finds sentence boundaries)
# and lemmatizer (uses word inflection to find the base form of words).
# The part-of-speech tagger requires a lexicon of tagged known words and rules for unknown words.

# Tools for word inflection should be bundled in pattern.text.xx.inflect.

from __future__ import unicode_literals
from __future__ import print_function
from __future__ import division

from builtins import str, bytes, dict, int
from builtins import map, zip, filter
from builtins import object, range

import os
import sys

try:
    MODULE = os.path.dirname(os.path.realpath(__file__))
except:
    MODULE = ""

sys.path.insert(0, os.path.join(MODULE, "..", "..", "..", ".."))

# Import parser base classes.
from pattern.text import (
    Lexicon, Model, Morphology, Context, Parser as _Parser, ngrams, pprint, commandline,
    PUNCTUATION
)
# Import parse tree base classes.
from pattern.text.tree import (
    Tree, Text, Sentence, Slice, Chunk, PNPChunk, Chink, Word, table,
    SLASH, WORD, POS, CHUNK, PNP, REL, ANCHOR, LEMMA, AND, OR
)
# Import sentiment analysis base classes.
from pattern.text import (
    Sentiment,
    NOUN, VERB, ADJECTIVE, ADVERB,
    MOOD, IRONY
)
# Import spelling base class.
from pattern.text import (
    Spelling
)
# Import verb tenses.
from pattern.text import (
    INFINITIVE, PRESENT, PAST, FUTURE,
    FIRST, SECOND, THIRD,
    SINGULAR, PLURAL, SG, PL,
    PROGRESSIVE,
    PARTICIPLE
)
# Import inflection functions.
from pattern.text.xx.inflect import (
    article, referenced, DEFINITE, INDEFINITE,
    pluralize, singularize, NOUN, VERB, ADJECTIVE,
    verbs, conjugate, lemma, lexeme, tenses,
    predicative, attributive
)
# Import all submodules.
from pattern.text.xx import inflect

sys.path.pop(0)

#--- PARSER ----------------------------------------------------------------------------------------

# Pattern uses the Penn Treebank II tagset (http://www.clips.ua.ac.be/pages/penn-treebank-tagset).
# The lexicon for pattern.xx may use a different tagset (e.g., PAROLE, WOTAN).
# The following function maps such tags to Penn Treebank II, see Parser.find_tags().

TAGSET = {"??": "NN"} # pattern.xx tagset => Penn Treebank II.


def tagset2penntreebank(tag):
    return TAGSET.get(tag, tag)

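# For example, with the (placeholder) TAGSET above, tagset2penntreebank("??") returns "NN";
# tags without an entry in TAGSET pass through unchanged.
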
# Different languages have different contractions (e.g., English "I've" or French "j'ai")
# and abbreviations. The following definitions list the contractions and abbreviations
# for pattern.xx, see also Parser.find_tokens().

REPLACEMENTS = {"'s": " 's", "'ve": " 've"}
ABBREVIATIONS = set(("e.g.", "etc.", "i.e."))

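# For example, with the replacements above the tokenizer splits "she's" into "she 's",
# so the clitic becomes a separate token; the abbreviations keep "e.g." from being
# mistaken for a sentence boundary.
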
# A lemmatizer can be constructed if we have a pattern.xx.inflect,
# with functions for noun singularization and verb conjugation (i.e., infinitives).


def find_lemmata(tokens):
    """ Annotates the tokens with lemmata for plural nouns and conjugated verbs,
        where each token is a [word, part-of-speech] list.
    """
    for token in tokens:
        word, pos, lemma = token[0], token[1], token[0]
        if pos.startswith("JJ"):
            lemma = predicative(word)
        if pos == "NNS":
            lemma = singularize(word)
        if pos.startswith(("VB", "MD")):
            lemma = conjugate(word, INFINITIVE) or word
        token.append(lemma.lower())
    return tokens

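# Example (illustrative; the actual lemmata depend on pattern.text.xx.inflect):
# find_lemmata([["cats", "NNS"], ["sleeping", "VBG"]])
# => [["cats", "NNS", "cat"], ["sleeping", "VBG", "sleep"]]
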
# Subclass the base parser with the language-specific functionality:


class Parser(_Parser):

    def find_tokens(self, tokens, **kwargs):
        kwargs.setdefault("abbreviations", ABBREVIATIONS)
        kwargs.setdefault("replace", REPLACEMENTS)
        return _Parser.find_tokens(self, tokens, **kwargs)

    def find_tags(self, tokens, **kwargs):
        kwargs.setdefault("map", tagset2penntreebank)
        return _Parser.find_tags(self, tokens, **kwargs)

    def find_chunks(self, tokens, **kwargs):
        return _Parser.find_chunks(self, tokens, **kwargs)

    def find_lemmata(self, tokens, **kwargs):
        return find_lemmata(tokens)

# The parser's part-of-speech tagger requires a lexicon of tagged known words
# and rules for unknown words. See pattern.text.Morphology and pattern.text.Context
# for further details. A tutorial on how to acquire data for the lexicon is here:
# http://www.clips.ua.ac.be/pages/using-wiktionary-to-build-an-italian-part-of-speech-tagger

# Create the parser with default tags for unknown words:
# (noun, proper noun, numeric).

parser = Parser(
    lexicon = os.path.join(MODULE, "xx-lexicon.txt"),       # A dict of known words => most frequent tag.
    frequency = os.path.join(MODULE, "xx-frequency.txt"),   # A dict of word frequency.
    morphology = os.path.join(MODULE, "xx-morphology.txt"), # A set of suffix rules.
    context = os.path.join(MODULE, "xx-context.txt"),       # A set of contextual rules.
    entities = os.path.join(MODULE, "xx-entities.txt"),     # A dict of named entities: John = NNP-PERS.
    default = ("NN", "NNP", "CD"),
    language = "xx"
)

lexicon = parser.lexicon # Expose lexicon.

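# Example (illustrative): with the xx-*.txt data files in place, the parser can be used directly,
# e.g., parser.parse("...") returns a tagged string (see also the parse() helper below).
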
# Create the sentiment lexicon,
# see pattern/text/xx/xx-sentiment.xml for further details.
# We also need to define the tag for modifiers,
# i.e., words that modify the score of the following word
# (e.g., *very* good, *not* good, ...).

sentiment = Sentiment(
    path = os.path.join(MODULE, "xx-sentiment.xml"),
    synset = None,
    negations = ("no", "not", "never"),
    modifiers = ("RB",),
    modifier = lambda w: w.endswith("ly"), # brilliantly, hardly, partially, ...
    language = "xx"
)

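# Example (illustrative; actual scores depend on the entries in xx-sentiment.xml):
# sentiment("not very good") # => a (polarity, subjectivity) tuple, e.g., (-0.3, 0.6).
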
# Nothing should be changed below.


def tokenize(s, *args, **kwargs):
    """ Returns a list of sentences, where punctuation marks have been split from words.
    """
    return parser.find_tokens(s, *args, **kwargs)


def parse(s, *args, **kwargs):
    """ Returns a tagged Unicode string.
    """
    return parser.parse(s, *args, **kwargs)


def parsetree(s, *args, **kwargs):
    """ Returns a parsed Text from the given string.
    """
    return Text(parse(s, *args, **kwargs))


def tree(s, token=[WORD, POS, CHUNK, PNP, REL, LEMMA]):
    """ Returns a parsed Text from the given parsed string.
    """
    return Text(s, token)


def tag(s, tokenize=True, encoding="utf-8", **kwargs):
    """ Returns a list of (token, tag)-tuples from the given string.
    """
    tags = []
    for sentence in parse(s, tokenize, True, False, False, False, encoding, **kwargs).split():
        for token in sentence:
            tags.append((token[0], token[1]))
    return tags


def keywords(s, top=10, **kwargs):
    """ Returns a sorted list of keywords in the given string.
    """
    return parser.find_keywords(s, **dict({
        "frequency": parser.frequency,
        "top": top,
        "pos": ("NN",),
        "ignore": ("rt",)}, **kwargs))


def polarity(s, **kwargs):
    """ Returns the sentence polarity (positive/negative) between -1.0 and 1.0.
    """
    return sentiment(s, **kwargs)[0]


def subjectivity(s, **kwargs):
    """ Returns the sentence subjectivity (objective/subjective) between 0.0 and 1.0.
    """
    return sentiment(s, **kwargs)[1]


def positive(s, threshold=0.1, **kwargs):
    """ Returns True if the given sentence has a positive sentiment.
    """
    return polarity(s, **kwargs) >= threshold

split = tree # Backwards compatibility.

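# Example usage (illustrative; output depends on the xx-*.txt and xx-sentiment.xml data files):
# print(parse("..."))     # tagged string with token/part-of-speech/chunk annotations.
# print(tag("..."))       # [(token, tag), ...]
# print(parsetree("...")) # Text object containing Sentence, Chunk and Word objects.
# print(polarity("..."))  # float between -1.0 and +1.0.
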
#---------------------------------------------------------------------------------------------------
# python -m pattern.xx xml -s "..." -OTCL

if __name__ == "__main__":
    commandline(parse)