|
|
#### PATTERN | FR ##################################################################################
|
|
|
# -*- coding: utf-8 -*-
|
|
|
# Copyright (c) 2013 University of Antwerp, Belgium
|
|
|
# Copyright (c) 2013 St. Lucas University College of Art & Design, Antwerp.
|
|
|
# Author: Tom De Smedt <tom@organisms.be>
|
|
|
# License: BSD (see LICENSE.txt for details).
|
|
|
# http://www.clips.ua.ac.be/pages/pattern
|
|
|
|
|
|
####################################################################################################
|
|
|
# French linguistical tools using fast regular expressions.
|
|
|
|
|
|
from __future__ import unicode_literals
|
|
|
from __future__ import division
|
|
|
|
|
|
from builtins import str, bytes, dict, int
|
|
|
from builtins import map, zip, filter
|
|
|
from builtins import object, range
|
|
|
|
|
|
import os
|
|
|
import sys
|
|
|
|
|
|
try:
|
|
|
MODULE = os.path.dirname(os.path.realpath(__file__))
|
|
|
except:
|
|
|
MODULE = ""
|
|
|
|
|
|
sys.path.insert(0, os.path.join(MODULE, "..", "..", "..", ".."))
|
|
|
|
|
|
# Import parser base classes.
|
|
|
from pattern.text import (
|
|
|
Lexicon, Model, Morphology, Context, Parser as _Parser, ngrams, pprint, commandline,
|
|
|
PUNCTUATION
|
|
|
)
|
|
|
# Import parser universal tagset.
|
|
|
from pattern.text import (
|
|
|
penntreebank2universal as _penntreebank2universal,
|
|
|
PTB, PENN, UNIVERSAL,
|
|
|
NOUN, VERB, ADJ, ADV, PRON, DET, PREP, ADP, NUM, CONJ, INTJ, PRT, PUNC, X
|
|
|
)
|
|
|
# Import parse tree base classes.
|
|
|
from pattern.text.tree import (
|
|
|
Tree, Text, Sentence, Slice, Chunk, PNPChunk, Chink, Word, table,
|
|
|
SLASH, WORD, POS, CHUNK, PNP, REL, ANCHOR, LEMMA, AND, OR
|
|
|
)
|
|
|
# Import sentiment analysis base classes.
|
|
|
from pattern.text import (
|
|
|
Sentiment as _Sentiment,
|
|
|
NOUN, VERB, ADJECTIVE, ADVERB,
|
|
|
MOOD, IRONY
|
|
|
)
|
|
|
# Import spelling base class.
|
|
|
from pattern.text import (
|
|
|
Spelling
|
|
|
)
|
|
|
# Import verb tenses.
|
|
|
from pattern.text import (
|
|
|
INFINITIVE, PRESENT, PAST, FUTURE,
|
|
|
FIRST, SECOND, THIRD,
|
|
|
SINGULAR, PLURAL, SG, PL,
|
|
|
INDICATIVE, IMPERATIVE, SUBJUNCTIVE, CONDITIONAL,
|
|
|
IMPERFECTIVE, PERFECTIVE, PROGRESSIVE,
|
|
|
IMPERFECT, PRETERITE,
|
|
|
PARTICIPLE, GERUND
|
|
|
)
|
|
|
# Import inflection functions.
|
|
|
from pattern.text.fr.inflect import (
|
|
|
pluralize, singularize, NOUN, VERB, ADJECTIVE,
|
|
|
verbs, conjugate, lemma, lexeme, tenses,
|
|
|
predicative, attributive
|
|
|
)
|
|
|
# Import all submodules.
|
|
|
from pattern.text.fr import inflect
|
|
|
|
|
|
sys.path.pop(0)
|
|
|
|
|
|
#--- FRENCH PARSER ---------------------------------------------------------------------------------
|
|
|
# The French parser is based on Lefff (Lexique des Formes Fléchies du Français).
|
|
|
# Benoît Sagot, Lionel Clément, Érice Villemonte de la Clergerie, Pierre Boullier.
|
|
|
# The Lefff 2 syntactic lexicon for French: architecture, acquisition.
|
|
|
# http://alpage.inria.fr/~sagot/lefff-en.html
|
|
|
|
|
|
# For words in Lefff that can have different part-of-speech tags,
|
|
|
# we used Lexique to find the most frequent POS-tag:
|
|
|
# http://www.lexique.org/
|
|
|
|
|
|
_subordinating_conjunctions = set((
|
|
|
"afin", "comme", "lorsque", "parce", "puisque", "quand", "que", "quoique", "si"
|
|
|
))
|
|
|
|
|
|
|
|
|
def penntreebank2universal(token, tag):
|
|
|
""" Converts a Penn Treebank II tag to a universal tag.
|
|
|
For example: comme/IN => comme/CONJ
|
|
|
"""
|
|
|
if tag == "IN" and token.lower() in _subordinating_conjunctions:
|
|
|
return CONJ
|
|
|
return _penntreebank2universal(token, tag)
|
|
|
|
|
|
ABBREVIATIONS = set((
|
|
|
"av.", "boul.", "C.-B.", "c.-à-d.", "ex.", "éd.", "fig.", "I.-P.-E.", "J.-C.",
|
|
|
"Ltee.", "Ltée.", "M.", "Me.", "Mlle.", "Mlles.", "MM.", "N.-B.", "N.-É.", "p.",
|
|
|
"S.B.E.", "Ste.", "T.-N.", "t.a.b."
|
|
|
))
|
|
|
|
|
|
# While contractions in English are optional,
|
|
|
# they are required in French:
|
|
|
replacements = {
|
|
|
"l'": "l' ", # le/la
|
|
|
"c'": "c' ", # ce
|
|
|
"d'": "d' ", # de
|
|
|
"j'": "j' ", # je
|
|
|
"m'": "m' ", # me
|
|
|
"n'": "n' ", # ne
|
|
|
"qu'": "qu' ", # que
|
|
|
"s'": "s' ", # se
|
|
|
"t'": "t' ", # te
|
|
|
"jusqu'": "jusqu' ",
|
|
|
"lorsqu'": "lorsqu' ",
|
|
|
"puisqu'": "puisqu' ",
|
|
|
# Same rule for Unicode apostrophe, see also Parser.find_tokens():
|
|
|
r"(l|c|d|j|m|n|qu|s|t|jusqu|lorsqu|puisqu)’": "\\1’ "
|
|
|
}
|
|
|
replacements.update(((k.upper(), v.upper()) for k, v in list(replacements.items())))
|
|
|
|
|
|
|
|
|
def find_lemmata(tokens):
|
|
|
""" Annotates the tokens with lemmata for plural nouns and conjugated verbs,
|
|
|
where each token is a [word, part-of-speech] list.
|
|
|
"""
|
|
|
for token in tokens:
|
|
|
word, pos, lemma = token[0], token[1], token[0]
|
|
|
if pos.startswith(("DT", "PR", "WP")):
|
|
|
lemma = singularize(word, pos=pos)
|
|
|
if pos.startswith(("RB", "IN")) and (word.endswith(("'", "’")) or word == "du"):
|
|
|
lemma = singularize(word, pos=pos)
|
|
|
if pos.startswith(("JJ",)):
|
|
|
lemma = predicative(word)
|
|
|
if pos == "NNS":
|
|
|
lemma = singularize(word)
|
|
|
if pos.startswith(("VB", "MD")):
|
|
|
lemma = conjugate(word, INFINITIVE) or word
|
|
|
token.append(lemma.lower())
|
|
|
return tokens
|
|
|
|
|
|
|
|
|
class Parser(_Parser):
|
|
|
|
|
|
def find_tokens(self, tokens, **kwargs):
|
|
|
kwargs.setdefault("abbreviations", ABBREVIATIONS)
|
|
|
kwargs.setdefault("replace", replacements)
|
|
|
s = _Parser.find_tokens(self, tokens, **kwargs)
|
|
|
s = [s.replace("&rsquo ;", "’") if isinstance(s, str) else s for s in s]
|
|
|
return s
|
|
|
|
|
|
def find_lemmata(self, tokens, **kwargs):
|
|
|
return find_lemmata(tokens)
|
|
|
|
|
|
def find_tags(self, tokens, **kwargs):
|
|
|
if kwargs.get("tagset") in (PENN, None):
|
|
|
kwargs.setdefault("map", lambda token, tag: (token, tag))
|
|
|
if kwargs.get("tagset") == UNIVERSAL:
|
|
|
kwargs.setdefault("map", lambda token, tag: penntreebank2universal(token, tag))
|
|
|
return _Parser.find_tags(self, tokens, **kwargs)
|
|
|
|
|
|
|
|
|
class Sentiment(_Sentiment):
|
|
|
|
|
|
def load(self, path=None):
|
|
|
_Sentiment.load(self, path)
|
|
|
# Map "précaire" to "precaire" (without diacritics, +1% accuracy).
|
|
|
if not path:
|
|
|
for w, pos in list(dict.items(self)):
|
|
|
w0 = w
|
|
|
if not w.endswith(("à", "è", "é", "ê", "ï")):
|
|
|
w = w.replace("à", "a")
|
|
|
w = w.replace("é", "e")
|
|
|
w = w.replace("è", "e")
|
|
|
w = w.replace("ê", "e")
|
|
|
w = w.replace("ï", "i")
|
|
|
if w != w0:
|
|
|
for pos, (p, s, i) in pos.items():
|
|
|
self.annotate(w, pos, p, s, i)
|
|
|
|
|
|
parser = Parser(
|
|
|
lexicon = os.path.join(MODULE, "fr-lexicon.txt"),
|
|
|
frequency = os.path.join(MODULE, "fr-frequency.txt"),
|
|
|
morphology = os.path.join(MODULE, "fr-morphology.txt"),
|
|
|
context = os.path.join(MODULE, "fr-context.txt"),
|
|
|
default = ("NN", "NNP", "CD"),
|
|
|
language = "fr"
|
|
|
)
|
|
|
|
|
|
lexicon = parser.lexicon # Expose lexicon.
|
|
|
|
|
|
sentiment = Sentiment(
|
|
|
path = os.path.join(MODULE, "fr-sentiment.xml"),
|
|
|
synset = None,
|
|
|
negations = ("n'", "ne", "ni", "non", "pas", "rien", "sans", "aucun", "jamais"),
|
|
|
modifiers = ("RB",),
|
|
|
modifier = lambda w: w.endswith("ment"),
|
|
|
tokenizer = parser.find_tokens,
|
|
|
language = "fr"
|
|
|
)
|
|
|
|
|
|
spelling = Spelling(
|
|
|
path = os.path.join(MODULE, "fr-spelling.txt")
|
|
|
)
|
|
|
|
|
|
|
|
|
def tokenize(s, *args, **kwargs):
|
|
|
""" Returns a list of sentences, where punctuation marks have been split from words.
|
|
|
"""
|
|
|
return parser.find_tokens(s, *args, **kwargs)
|
|
|
|
|
|
|
|
|
def parse(s, *args, **kwargs):
|
|
|
""" Returns a tagged Unicode string.
|
|
|
"""
|
|
|
return parser.parse(s, *args, **kwargs)
|
|
|
|
|
|
|
|
|
def parsetree(s, *args, **kwargs):
|
|
|
""" Returns a parsed Text from the given string.
|
|
|
"""
|
|
|
return Text(parse(s, *args, **kwargs))
|
|
|
|
|
|
|
|
|
def tree(s, token=[WORD, POS, CHUNK, PNP, REL, LEMMA]):
|
|
|
""" Returns a parsed Text from the given parsed string.
|
|
|
"""
|
|
|
return Text(s, token)
|
|
|
|
|
|
|
|
|
def tag(s, tokenize=True, encoding="utf-8", **kwargs):
|
|
|
""" Returns a list of (token, tag)-tuples from the given string.
|
|
|
"""
|
|
|
tags = []
|
|
|
for sentence in parse(s, tokenize, True, False, False, False, encoding, **kwargs).split():
|
|
|
for token in sentence:
|
|
|
tags.append((token[0], token[1]))
|
|
|
return tags
|
|
|
|
|
|
|
|
|
def keywords(s, top=10, **kwargs):
|
|
|
""" Returns a sorted list of keywords in the given string.
|
|
|
"""
|
|
|
return parser.find_keywords(s, **dict({
|
|
|
"frequency": parser.frequency,
|
|
|
"top": top,
|
|
|
"pos": ("NN",),
|
|
|
"ignore": ("rt",)}, **kwargs))
|
|
|
|
|
|
|
|
|
def suggest(w):
|
|
|
""" Returns a list of (word, confidence)-tuples of spelling corrections.
|
|
|
"""
|
|
|
return spelling.suggest(w)
|
|
|
|
|
|
|
|
|
def polarity(s, **kwargs):
|
|
|
""" Returns the sentence polarity (positive/negative) between -1.0 and 1.0.
|
|
|
"""
|
|
|
return sentiment(s, **kwargs)[0]
|
|
|
|
|
|
|
|
|
def subjectivity(s, **kwargs):
|
|
|
""" Returns the sentence subjectivity (objective/subjective) between 0.0 and 1.0.
|
|
|
"""
|
|
|
return sentiment(s, **kwargs)[1]
|
|
|
|
|
|
|
|
|
def positive(s, threshold=0.1, **kwargs):
|
|
|
""" Returns True if the given sentence has a positive sentiment (polarity >= threshold).
|
|
|
"""
|
|
|
return polarity(s, **kwargs) >= threshold
|
|
|
|
|
|
split = tree # Backwards compatibility.
|
|
|
|
|
|
#---------------------------------------------------------------------------------------------------
|
|
|
# python -m pattern.fr xml -s "C'est l'exception qui confirme la règle." -OTCL
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
commandline(parse)
|