# -*- coding: utf-8 -*-
# Natural Language Toolkit: Language ID module using TextCat algorithm
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Avital Pekker <avital.pekker@utoronto.ca>
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

"""
A module for language identification using the TextCat algorithm.
An implementation of the text categorization algorithm
presented in Cavnar, W. B. and J. M. Trenkle,
"N-Gram-Based Text Categorization".

The algorithm takes advantage of Zipf's law and uses
n-gram frequencies to profile both the known languages and
the text yet to be identified, then compares the profiles
using a distance measure.

Language n-grams are provided by the "An Crubadan"
project. A corpus reader was created separately to read
those files.

For details regarding the algorithm, see:
http://www.let.rug.nl/~vannoord/TextCat/textcat.pdf

For details about An Crubadan, see:
http://borel.slu.edu/crubadan/index.html
"""

# Ensure that literal strings default to unicode rather than str.
from __future__ import print_function, unicode_literals

import sys

from nltk.compat import PY3
from nltk.util import trigrams

if PY3:
    from sys import maxsize
else:
    from sys import maxint

# Note: this is NOT "re" you're likely used to. The regex module
# is an alternative to the standard re module that supports
# Unicode codepoint properties with the \p{} syntax.
# You may have to "pip install regex"
try:
    import regex as re
except ImportError:
    re = None

######################################################################
## Language identification using TextCat
######################################################################


class TextCat(object):
    """Language identifier built on the TextCat "out-of-place" measure.

    Per-language trigram frequencies are read from the ``crubadan`` corpus
    reader. :meth:`guess_language` builds the same kind of trigram profile
    for an input text and returns the language whose summed distance to
    that profile is smallest.
    """

    # Corpus reader supplying per-language trigram frequencies; bound in
    # __init__.
    _corpus = None
    # Retained for backward compatibility; no method of this class uses it.
    fingerprints = {}
    # Markers added before and after each token prior to trigram extraction.
    _START_CHAR = "<"
    _END_CHAR = ">"

    # Class-level fallback only: __init__ gives every instance its own dict
    # and guess_language() rebinds it to the distances of the latest call.
    last_distances = {}

    def __init__(self):
        """Bind the crubadan corpus reader and request every language profile.

        :raises EnvironmentError: if the ``regex`` module could not be
            imported when this module was loaded.
        """
        if re is None:
            raise EnvironmentError(
                "classify.textcat requires the regex module that "
                "supports unicode. Try '$ pip install regex' and "
                "see https://pypi.python.org/pypi/regex for "
                "further details."
            )

        from nltk.corpus import crubadan

        self._corpus = crubadan
        # Avoid sharing the mutable class-level dict between instances.
        self.last_distances = {}
        # Load all language ngrams into cache
        for lang in self._corpus.langs():
            self._corpus.lang_freq(lang)

    def remove_punctuation(self, text):
        """Get rid of punctuation except apostrophes.

        :param text: the string to clean
        :return: ``text`` with every run of characters that have the
            Unicode punctuation property, other than ``'``, removed
        """
        return re.sub(r"[^\P{P}\']+", "", text)

    def profile(self, text):
        """Create a FreqDist of the trigrams within ``text``.

        Punctuation is stripped with :meth:`remove_punctuation`, the result
        is split with ``word_tokenize`` and each token is wrapped in the
        start/end markers before its character trigrams are counted.

        :param text: the text to profile
        :return: ``FreqDist`` mapping each trigram string to its count
        """
        from nltk import word_tokenize, FreqDist

        clean_text = self.remove_punctuation(text)
        tokens = word_tokenize(clean_text)

        fingerprint = FreqDist()
        for t in tokens:
            padded_token = self._START_CHAR + t + self._END_CHAR
            for tri in trigrams(padded_token):
                # A trigram that has not been counted yet reads as 0, so a
                # plain increment covers both the first and later sightings.
                fingerprint[''.join(tri)] += 1

        return fingerprint

    def calc_dist(self, lang, trigram, text_profile):
        """Calculate the "out-of-place" measure between the text and
        language profile for a single trigram.

        :param lang: language code passed to the corpus reader's
            ``lang_freq``
        :param trigram: the trigram to score
        :param text_profile: mapping of trigram to count for the text, as
            returned by :meth:`profile`
        :return: absolute difference between the trigram's position among
            the keys of the language profile and among the keys of
            ``text_profile``, or ``sys.maxsize`` when the language profile
            does not contain the trigram
        :raises ValueError: if the trigram is in the language profile but
            not in ``text_profile``
        """
        lang_fd = self._corpus.lang_freq(lang)

        if trigram not in lang_fd:
            # Arbitrary but should be larger than any possible trigram
            # file length in terms of total lines. sys.maxsize is available
            # on Python 2.6+ and 3, so no version switch is needed.
            return sys.maxsize

        # NOTE(review): positions are taken from .keys() order, exactly as
        # before. That equals a frequency rank only if both mappings list
        # their keys most-frequent-first; profile() inserts trigrams in order
        # of first occurrence -- confirm this is the intended ranking. Do not
        # swap .keys() for plain iteration without checking, since a FreqDist
        # may iterate in a different order than its keys().
        idx_lang_profile = list(lang_fd.keys()).index(trigram)
        idx_text = list(text_profile.keys()).index(trigram)

        return abs(idx_lang_profile - idx_text)

    def lang_dists(self, text):
        """Calculate the "out-of-place" measure between the text and all
        languages.

        :param text: the text to identify
        :return: dict mapping every language key held in the corpus
            reader's ``_all_lang_freq`` to the sum of :meth:`calc_dist`
            over all trigrams of the text profile
        """
        distances = {}
        profile = self.profile(text)

        # For all the languages
        for lang in self._corpus._all_lang_freq.keys():
            # Calculate distance metric for every trigram in
            # input text to be identified
            distances[lang] = sum(
                self.calc_dist(lang, trigram, profile) for trigram in profile
            )

        return distances

    def guess_language(self, text):
        """Find the language with the min distance to the text and
        return its ISO 639-3 code.

        The full distance table of this call is kept in
        ``self.last_distances``.

        :param text: the text to identify
        :return: the language key with the smallest summed distance
        """
        self.last_distances = self.lang_dists(text)

        return min(self.last_distances, key=self.last_distances.get)


def demo():
    """Guess the language of several UDHR samples and print the results.

    For each listed UDHR file a sample text is assembled from its
    sentences, the first 140 characters are printed, and the language
    guessed by :class:`TextCat` is printed together with a readable name.
    """
    from nltk.corpus import udhr

    langs = [
        'Kurdish-UTF8',
        'Abkhaz-UTF8',
        'Farsi_Persian-UTF8',
        'Hindi-UTF8',
        'Hawaiian-UTF8',
        'Russian-UTF8',
        'Vietnamese-UTF8',
        'Serbian_Srpski-UTF8',
        'Esperanto-UTF8',
    ]

    friendly = {
        'kmr': 'Northern Kurdish',
        'abk': 'Abkhazian',
        'pes': 'Iranian Persian',
        'hin': 'Hindi',
        'haw': 'Hawaiian',
        'rus': 'Russian',
        'vie': 'Vietnamese',
        'srp': 'Serbian',
        'epo': 'Esperanto',
    }

    tc = TextCat()

    for cur_lang in langs:
        # Get raw data from UDHR corpus
        raw_sentences = udhr.sents(cur_lang)

        # Generate a sample text of the language: every word is preceded by
        # one space. The last sentence is left out, matching the previous
        # loop bounds (NOTE(review): confirm the omission is intentional).
        sample = ''.join(
            ' ' + word
            for i in range(len(raw_sentences) - 1)
            for word in raw_sentences[i]
        )

        # Try to detect what it is
        print('Language snippet: ' + sample[0:140] + '...')
        guess = tc.guess_language(sample)
        # The classifier scores every language of the corpus, so the guess
        # may be a code missing from the table above; do not crash on it.
        print(
            'Language detection: %s (%s)'
            % (guess, friendly.get(guess, 'unknown language'))
        )
        print('#' * 140)


# Run the demonstration only when the module is executed as a script.
if __name__ == '__main__':
    demo()