|
|
# -*- coding: utf-8 -*-
|
|
|
# Natural Language Toolkit: Tokenizer Utilities
|
|
|
#
|
|
|
# Copyright (C) 2001-2020 NLTK Project
|
|
|
# Author: Steven Bird <stevenbird1@gmail.com>
|
|
|
# URL: <http://nltk.sourceforge.net>
|
|
|
# For license information, see LICENSE.TXT
|
|
|
|
|
|
from re import finditer
|
|
|
from xml.sax.saxutils import escape, unescape
|
|
|
|
|
|
|
|
|
def string_span_tokenize(s, sep):
    r"""
    Yield ``(start, end)`` offsets of the tokens obtained by splitting *s*
    at every occurrence of the literal separator *sep*.

    Consecutive separators produce an empty span (``start == end``), as
    ``(37, 37)`` shows below.  The one exception is a separator located at
    offset 0 of *s*: no span is produced for it.  A trailing empty span is
    never produced.

        >>> from nltk.tokenize.util import string_span_tokenize
        >>> s = '''Good muffins cost $3.88\nin New York.  Please buy me
        ... two of them.\n\nThanks.'''
        >>> list(string_span_tokenize(s, " "))
        [(0, 4), (5, 12), (13, 17), (18, 26), (27, 30), (31, 36), (37, 37),
        (38, 44), (45, 48), (49, 55), (56, 58), (59, 73)]

    :param s: the string to be tokenized
    :type s: str
    :param sep: the token separator
    :type sep: str
    :raises ValueError: if *sep* is empty (raised when iteration starts,
        because this is a generator)
    :rtype: iter(tuple(int, int))
    """
    sep_width = len(sep)
    if sep_width == 0:
        raise ValueError("Token delimiter must not be empty")

    start = 0
    while True:
        try:
            stop = s.index(sep, start)
        except ValueError:
            # No separator left: emit the remainder unless it is empty.
            if start != len(s):
                yield start, len(s)
            return

        # Only a separator sitting at the very beginning of the string is
        # skipped; any other empty token is reported as an empty span.
        if stop != 0:
            yield start, stop
        start = stop + sep_width
|
|
|
|
|
|
|
|
|
def regexp_span_tokenize(s, regexp):
    r"""
    Yield ``(start, end)`` offsets of the tokens obtained by splitting *s*
    at each successive match of *regexp*.

    Empty tokens between two adjacent separator matches (or before a match
    at the current position) are skipped.  The final span, running from the
    end of the last match to the end of *s*, is always yielded, even when it
    is empty (e.g. when *s* ends with a separator or is the empty string).

        >>> from nltk.tokenize.util import regexp_span_tokenize
        >>> s = '''Good muffins cost $3.88\nin New York.  Please buy me
        ... two of them.\n\nThanks.'''
        >>> list(regexp_span_tokenize(s, r'\s'))
        [(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36),
        (38, 44), (45, 48), (49, 51), (52, 55), (56, 58), (59, 64), (66, 73)]

    :param s: the string to be tokenized
    :type s: str
    :param regexp: regular expression that matches token separators (must not be empty)
    :type regexp: str
    :rtype: iter(tuple(int, int))
    """
    token_start = 0
    for match in finditer(regexp, s):
        # Named sep_start/sep_end rather than ``right, next`` so that the
        # builtin ``next`` is not shadowed inside this generator.
        sep_start, sep_end = match.span()
        if sep_start != token_start:
            yield token_start, sep_start
        token_start = sep_end
    # The tail is emitted unconditionally; callers that do not want an empty
    # final span have to filter it out themselves.
    yield token_start, len(s)
|
|
|
|
|
|
|
|
|
def spans_to_relative(spans):
    r"""
    Convert absolute ``(start, end)`` spans into relative ``(gap, length)``
    pairs.

    For each span, ``gap`` is the distance from the end of the previous span
    (0 for the first span) to this span's start, and ``length`` is
    ``end - start``.

        >>> from nltk.tokenize import WhitespaceTokenizer
        >>> from nltk.tokenize.util import spans_to_relative
        >>> s = '''Good muffins cost $3.88\nin New York.  Please buy me
        ... two of them.\n\nThanks.'''
        >>> list(spans_to_relative(WhitespaceTokenizer().span_tokenize(s)))
        [(0, 4), (1, 7), (1, 4), (1, 5), (1, 2), (1, 3), (1, 5), (2, 6),
        (1, 3), (1, 2), (1, 3), (1, 2), (1, 5), (2, 7)]

    :param spans: a sequence of (start, end) offsets of the tokens
    :type spans: iter(tuple(int, int))
    :rtype: iter(tuple(int, int))
    """
    previous_end = 0
    for start, end in spans:
        gap = start - previous_end
        length = end - start
        yield gap, length
        previous_end = end
|
|
|
|
|
|
|
|
|
class CJKChars(object):
    """
    An object that enumerates the code points of the CJK characters as listed on
    http://en.wikipedia.org/wiki/Basic_Multilingual_Plane#Basic_Multilingual_Plane

    This is a Python port of the CJK code point enumerations of Moses tokenizer:
    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/detokenizer.perl#L309

    Every attribute is an inclusive ``(first, last)`` pair of integer code
    points; ``ranges`` collects all of them in ascending order.
    """

    # Hangul Jamo (1100–11FF)
    Hangul_Jamo = (0x1100, 0x11FF)  # == (4352, 4607)

    # One contiguous range covering all of the following blocks:
    #   CJK Radicals Supplement (2E80–2EFF)
    #   Kangxi Radicals (2F00–2FDF)
    #   Ideographic Description Characters (2FF0–2FFF)
    #   CJK Symbols and Punctuation (3000–303F)
    #   Hiragana (3040–309F)
    #   Katakana (30A0–30FF)
    #   Bopomofo (3100–312F)
    #   Hangul Compatibility Jamo (3130–318F)
    #   Kanbun (3190–319F)
    #   Bopomofo Extended (31A0–31BF)
    #   CJK Strokes (31C0–31EF)
    #   Katakana Phonetic Extensions (31F0–31FF)
    #   Enclosed CJK Letters and Months (3200–32FF)
    #   CJK Compatibility (3300–33FF)
    #   CJK Unified Ideographs Extension A (3400–4DBF)
    #   Yijing Hexagram Symbols (4DC0–4DFF)
    #   CJK Unified Ideographs (4E00–9FFF)
    #   Yi Syllables (A000–A48F)
    #   Yi Radicals (A490–A4CF)
    CJK_Radicals = (0x2E80, 0xA4CF)  # == (11904, 42191)

    # Phags-pa (A840–A87F)
    Phags_Pa = (0xA840, 0xA87F)  # == (43072, 43135)

    # Hangul Syllables (AC00–D7AF)
    Hangul_Syllables = (0xAC00, 0xD7AF)  # == (44032, 55215)

    # CJK Compatibility Ideographs (F900–FAFF)
    CJK_Compatibility_Ideographs = (0xF900, 0xFAFF)  # == (63744, 64255)

    # CJK Compatibility Forms (FE30–FE4F)
    CJK_Compatibility_Forms = (0xFE30, 0xFE4F)  # == (65072, 65103)

    # Range U+FF65–FFDC encodes halfwidth forms of Katakana and Hangul characters
    Katakana_Hangul_Halfwidth = (0xFF65, 0xFFDC)  # == (65381, 65500)

    # Supplementary Ideographic Plane (20000–2FFFF)
    Supplementary_Ideographic_Plane = (0x20000, 0x2FFFF)  # == (131072, 196607)

    ranges = [
        Hangul_Jamo,
        CJK_Radicals,
        Phags_Pa,
        Hangul_Syllables,
        CJK_Compatibility_Ideographs,
        CJK_Compatibility_Forms,
        Katakana_Hangul_Halfwidth,
        Supplementary_Ideographic_Plane,
    ]
|
|
|
|
|
|
|
|
|
def is_cjk(character):
    """
    Python port of Moses' code to check for CJK character.

    The code point of *character* is tested against a fixed table of
    inclusive ranges; the table holds the same values as the ``ranges``
    list shown in the first example below.

        >>> CJKChars().ranges
        [(4352, 4607), (11904, 42191), (43072, 43135), (44032, 55215), (63744, 64255), (65072, 65103), (65381, 65500), (131072, 196607)]
        >>> is_cjk(u'\u33fe')
        True
        >>> is_cjk(u'\uFE5F')
        False

    :param character: The character that needs to be checked.
    :type character: char
    :raises TypeError: if *character* is not a single character (raised by
        ``ord``)
    :return: bool
    """
    # Resolve the code point once instead of once per range.
    code_point = ord(character)
    # A generator (not a list) lets any() stop at the first matching range.
    return any(
        first <= code_point <= last
        for first, last in (
            (4352, 4607),  # U+1100 - U+11FF
            (11904, 42191),  # U+2E80 - U+A4CF
            (43072, 43135),  # U+A840 - U+A87F
            (44032, 55215),  # U+AC00 - U+D7AF
            (63744, 64255),  # U+F900 - U+FAFF
            (65072, 65103),  # U+FE30 - U+FE4F
            (65381, 65500),  # U+FF65 - U+FFDC
            (131072, 196607),  # U+20000 - U+2FFFF
        )
    )
|
|
|
|
|
|
|
|
|
def xml_escape(text):
    """
    This function transforms the input text into an "escaped" version suitable
    for well-formed XML formatting.

    Note that the default xml.sax.saxutils.escape() function does not escape
    some characters that Moses does, so we have to manually add them to the
    entities dictionary.  ``escape()`` itself handles ``&``, ``<`` and ``>``;
    the extra entries below cover ``'``, ``"``, ``|``, ``[`` and ``]``.

        >>> input_str = ''')| & < > ' " ] ['''
        >>> expected_output = ''')| &amp; &lt; &gt; ' " ] ['''
        >>> escape(input_str) == expected_output
        True
        >>> xml_escape(input_str)
        ')&#124; &amp; &lt; &gt; &apos; &quot; &#93; &#91;'

    :param text: The text that needs to be escaped.
    :type text: str
    :rtype: str
    """
    # Each value must be the XML entity / character reference for its key;
    # mapping a character to itself would leave the text unescaped.
    return escape(
        text,
        entities={
            r"'": r"&apos;",
            r'"': r"&quot;",
            r"|": r"&#124;",
            r"[": r"&#91;",
            r"]": r"&#93;",
        },
    )
|
|
|
|
|
|
|
|
|
def xml_unescape(text):
    """
    This function transforms the "escaped" version suitable
    for well-formed XML formatting into a human-readable string.

    Note that the default xml.sax.saxutils.unescape() function does not
    unescape some characters that Moses does, so we have to manually add them
    to the entities dictionary.  ``unescape()`` itself handles ``&lt;``,
    ``&gt;`` and ``&amp;``; the extra entries below cover the references for
    ``'``, ``"``, ``|``, ``[`` and ``]``.

        >>> from xml.sax.saxutils import unescape
        >>> s = ')&#124; &amp; &lt; &gt; &apos; &quot; &#93; &#91;'
        >>> expected = ''')| & < > ' " ] ['''
        >>> xml_unescape(s) == expected
        True

    :param text: The text that needs to be unescaped.
    :type text: str
    :rtype: str
    """
    # Keys are the XML entity / character references, values the literal
    # characters they stand for (the inverse of the xml_escape() table).
    return unescape(
        text,
        entities={
            r"&apos;": r"'",
            r"&quot;": r'"',
            r"&#124;": r"|",
            r"&#91;": r"[",
            r"&#93;": r"]",
        },
    )
|
|
|
|
|
|
|
|
|
def align_tokens(tokens, sentence):
    """
    This function attempts to find the offsets of the tokens in *sentence*,
    as a list of ``(start, end)`` tuples, given the tokens and also the
    source string.

    Tokens are located left to right: the search for each token starts at
    the end of the previously matched token.

        >>> from nltk.tokenize import TreebankWordTokenizer
        >>> from nltk.tokenize.util import align_tokens
        >>> s = str("The plane, bound for St Petersburg, crashed in Egypt's "
        ... "Sinai desert just 23 minutes after take-off from Sharm el-Sheikh "
        ... "on Saturday.")
        >>> tokens = TreebankWordTokenizer().tokenize(s)
        >>> expected = [(0, 3), (4, 9), (9, 10), (11, 16), (17, 20), (21, 23),
        ... (24, 34), (34, 35), (36, 43), (44, 46), (47, 52), (52, 54),
        ... (55, 60), (61, 67), (68, 72), (73, 75), (76, 83), (84, 89),
        ... (90, 98), (99, 103), (104, 109), (110, 119), (120, 122),
        ... (123, 131), (131, 132)]
        >>> output = list(align_tokens(tokens, s))
        >>> len(tokens) == len(expected) == len(output)  # Check that length of tokens and tuples are the same.
        True
        >>> expected == list(align_tokens(tokens, s))  # Check that the output is as expected.
        True
        >>> tokens == [s[start:end] for start, end in output]  # Check that the slices of the string corresponds to the tokens.
        True

    :param tokens: The list of strings that are the result of tokenization
    :type tokens: list(str)
    :param sentence: The original string
    :type sentence: str
    :raises ValueError: if a token cannot be found in *sentence* at or after
        the end of the previous token
    :rtype: list(tuple(int,int))
    """
    spans = []
    cursor = 0  # end offset of the most recently aligned token
    for tok in tokens:
        try:
            begin = sentence.index(tok, cursor)
        except ValueError:
            raise ValueError('substring "{}" not found in "{}"'.format(tok, sentence))
        cursor = begin + len(tok)
        spans.append((begin, cursor))
    return spans
|