You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

78 lines
2.6 KiB
Python

# -*- coding: utf-8 -*-
"""
UDHR corpus reader. It mostly deals with encodings.
"""
from __future__ import absolute_import, unicode_literals
from nltk.corpus.reader.util import find_corpus_fileids
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
class UdhrCorpusReader(PlaintextCorpusReader):
    """
    Plaintext reader for the UDHR corpus with per-file encodings.

    The corpus files use many different encodings, so this reader hands
    the base class a table of ``(file id regexp, encoding)`` pairs
    (``ENCODINGS``) and leaves out the file ids listed in ``SKIP``.
    """

    # (file id regexp, codec name) pairs, passed unchanged as the
    # ``encoding`` argument of the base reader.
    # NOTE(review): the order is kept exactly as it was; the base reader
    # presumably uses the first pattern that matches a file id, so the
    # literal 'Czech_Cesky-UTF8' entry has to stay ahead of '.*-UTF8$'
    # -- confirm against the NLTK CorpusReader documentation.
    ENCODINGS = [
        ('.*-Latin1$', 'latin-1'),
        ('.*-Hebrew$', 'hebrew'),
        ('.*-Arabic$', 'cp1256'),
        # Named "UTF8", yet mapped to cp1250 here.
        ('Czech_Cesky-UTF8', 'cp1250'),
        ('.*-Cyrillic$', 'cyrillic'),
        ('.*-SJIS$', 'SJIS'),
        ('.*-GB2312$', 'GB2312'),
        ('.*-Latin2$', 'ISO-8859-2'),
        ('.*-Greek$', 'greek'),
        ('.*-UTF8$', 'utf-8'),
        ('Hungarian_Magyar-Unicode', 'utf-16-le'),
        ('Amahuaca', 'latin1'),
        ('Turkish_Turkce-Turkish', 'latin5'),
        ('Lithuanian_Lietuviskai-Baltic', 'latin4'),
        ('Japanese_Nihongo-EUC', 'EUC-JP'),
        ('Japanese_Nihongo-JIS', 'iso2022_jp'),
        ('Chinese_Mandarin-HZ', 'hz'),
        # Raw string: '\-' and '\+' are regex escapes, not valid string
        # escapes, and a plain literal triggers an invalid-escape warning
        # on Python 3.  The resulting pattern text is unchanged.
        (r'Abkhaz\-Cyrillic\+Abkh', 'cp1251'),
    ]

    # File ids that are never exposed by this reader.
    SKIP = {
        # The following files are not fully decodable because they
        # were truncated at wrong bytes:
        'Burmese_Myanmar-UTF8',
        'Japanese_Nihongo-JIS',
        'Chinese_Mandarin-HZ',
        'Chinese_Mandarin-UTF8',
        'Gujarati-UTF8',
        'Hungarian_Magyar-Unicode',
        'Lao-UTF8',
        'Magahi-UTF8',
        'Marathi-UTF8',
        'Tamil-UTF8',
        # Unfortunately, encodings required for reading
        # the following files are not supported by Python:
        'Vietnamese-VPS',
        'Vietnamese-VIQR',
        'Vietnamese-TCVN',
        'Magahi-Agra',
        'Bhojpuri-Agra',
        'Esperanto-T61',  # latin3 raises an exception
        # The following files are encoded for specific fonts:
        'Burmese_Myanmar-WinResearcher',
        'Armenian-DallakHelv',
        'Tigrinya_Tigrigna-VG2Main',
        'Amharic-Afenegus6..60375',  # font-specific? (unverified)
        'Navaho_Dine-Navajo-Navaho-font',
        # Files whose encoding has not been identified:
        'Azeri_Azerbaijani_Cyrillic-Az.Times.Cyr.Normal0117',
        'Azeri_Azerbaijani_Latin-Az.Times.Lat0117',
        # The following files are unintended:
        'Czech-Latin2-err',
        'Russian_Russky-UTF8~',
    }

    def __init__(self, root='udhr'):
        """
        Build the reader over the usable files found under ``root``.

        :param root: corpus root handed to ``find_corpus_fileids`` and to
            the base reader; defaults to ``'udhr'``.
        """
        # The negative lookahead rejects ids that start with "README"
        # or with a dot.
        candidates = find_corpus_fileids(root, r'(?!README|\.).*')
        usable = [
            fileid for fileid in candidates if fileid not in self.SKIP
        ]
        # Two-argument super() keeps the module importable on Python 2,
        # which its __future__ imports indicate is still targeted.
        super(UdhrCorpusReader, self).__init__(
            root,
            usable,
            encoding=self.ENCODINGS,
        )