bo-graduation/venv/lib/python3.7/site-packages/nltk/corpus/reader/udhr.py

# -*- coding: utf-8 -*-
"""
UDHR corpus reader. It mostly deals with encodings.
"""
from __future__ import absolute_import, unicode_literals

from nltk.corpus.reader.util import find_corpus_fileids
from nltk.corpus.reader.plaintext import PlaintextCorpusReader


class UdhrCorpusReader(PlaintextCorpusReader):
    """Plaintext corpus reader for the UDHR corpus.

    The reader itself adds no new access methods; it only configures the
    parent ``PlaintextCorpusReader`` with a per-file encoding table
    (``ENCODINGS``) and leaves out the files listed in ``SKIP``.
    """

    # (file id regexp, codec name) pairs, handed to the parent reader as its
    # ``encoding`` argument.  Per NLTK's CorpusReader documentation the first
    # regexp that matches a file id decides its encoding, so the order of the
    # entries matters (e.g. the Czech entry must stay ahead of the generic
    # UTF8 one).  Patterns are raw strings so that regex escapes such as
    # ``\-`` and ``\+`` are not treated as (invalid) string escapes.
    ENCODINGS = [
        (r'.*-Latin1$', 'latin-1'),
        (r'.*-Hebrew$', 'hebrew'),
        (r'.*-Arabic$', 'cp1256'),
        (r'Czech_Cesky-UTF8', 'cp1250'),  # yeah: named UTF8, read as cp1250
        (r'.*-Cyrillic$', 'cyrillic'),
        (r'.*-SJIS$', 'SJIS'),
        (r'.*-GB2312$', 'GB2312'),
        (r'.*-Latin2$', 'ISO-8859-2'),
        (r'.*-Greek$', 'greek'),
        (r'.*-UTF8$', 'utf-8'),
        (r'Hungarian_Magyar-Unicode', 'utf-16-le'),
        (r'Amahuaca', 'latin1'),
        (r'Turkish_Turkce-Turkish', 'latin5'),
        (r'Lithuanian_Lietuviskai-Baltic', 'latin4'),
        (r'Japanese_Nihongo-EUC', 'EUC-JP'),
        (r'Japanese_Nihongo-JIS', 'iso2022_jp'),
        (r'Chinese_Mandarin-HZ', 'hz'),
        (r'Abkhaz\-Cyrillic\+Abkh', 'cp1251'),
    ]

    # Exact file ids (plain strings, not regexps) that are left out of the
    # corpus when the reader is constructed.
    SKIP = {
        # The following files are not fully decodable because they
        # were truncated at wrong bytes:
        'Burmese_Myanmar-UTF8',
        'Japanese_Nihongo-JIS',
        'Chinese_Mandarin-HZ',
        'Chinese_Mandarin-UTF8',
        'Gujarati-UTF8',
        'Hungarian_Magyar-Unicode',
        'Lao-UTF8',
        'Magahi-UTF8',
        'Marathi-UTF8',
        'Tamil-UTF8',
        # Unfortunately, encodings required for reading
        # the following files are not supported by Python:
        'Vietnamese-VPS',
        'Vietnamese-VIQR',
        'Vietnamese-TCVN',
        'Magahi-Agra',
        'Bhojpuri-Agra',
        'Esperanto-T61',  # latin3 raises an exception
        # The following files are encoded for specific fonts:
        'Burmese_Myanmar-WinResearcher',
        'Armenian-DallakHelv',
        'Tigrinya_Tigrigna-VG2Main',
        'Amharic-Afenegus6..60375',  # ?
        'Navaho_Dine-Navajo-Navaho-font',
        # What are these?
        'Azeri_Azerbaijani_Cyrillic-Az.Times.Cyr.Normal0117',
        'Azeri_Azerbaijani_Latin-Az.Times.Lat0117',
        # The following files are unintended:
        'Czech-Latin2-err',
        'Russian_Russky-UTF8~',
    }

    def __init__(self, root='udhr'):
        """Build the reader over the corpus found at ``root``.

        :param root: Location of the corpus, forwarded unchanged to
            ``find_corpus_fileids`` and to the parent constructor.
            Defaults to ``'udhr'``.
        """
        # The pattern is meant to leave out README files and dot-files
        # (negative lookahead on the start of the file id).
        candidates = find_corpus_fileids(root, r'(?!README|\.).*')
        # Drop the files known to be unreadable or unintended.
        usable = [fileid for fileid in candidates if fileid not in self.SKIP]
        super(UdhrCorpusReader, self).__init__(
            root,
            usable,
            encoding=self.ENCODINGS,
        )