# Mirrored from gbrault/jupytersketcher (https://github.com/gbrault/jupytersketcher.git)
							# -*- coding: utf-8 -*-
"""
UDHR corpus reader. It mostly deals with encodings.
"""

from nltk.corpus.reader.util import find_corpus_fileids
from nltk.corpus.reader.plaintext import PlaintextCorpusReader


class UdhrCorpusReader(PlaintextCorpusReader):
    """Plaintext corpus reader for the UDHR corpus.

    The corpus files use many different encodings. This reader selects an
    encoding per file from ``ENCODINGS`` (matched against the file id) and
    leaves out every file id listed in ``SKIP``.
    """

    # (file id regexp, encoding name) pairs, passed unchanged to the parent
    # reader as its ``encoding`` argument.
    # NOTE(review): the specific entries placed before a generic suffix rule
    # (e.g. "Czech_Cesky-UTF8" ahead of ".*-UTF8$") suggest the first matching
    # pattern wins -- confirm against the NLTK CorpusReader documentation
    # before reordering.
    # All patterns are raw strings so that regex escapes such as ``\-`` and
    # ``\+`` are not parsed as (invalid) Python string escapes.
    ENCODINGS = [
        (r".*-Latin1$", "latin-1"),
        (r".*-Hebrew$", "hebrew"),
        (r".*-Arabic$", "cp1256"),
        (r"Czech_Cesky-UTF8", "cp1250"),  # named UTF8, but mapped to cp1250
        (r".*-Cyrillic$", "cyrillic"),
        (r".*-SJIS$", "SJIS"),
        (r".*-GB2312$", "GB2312"),
        (r".*-Latin2$", "ISO-8859-2"),
        (r".*-Greek$", "greek"),
        (r".*-UTF8$", "utf-8"),
        (r"Hungarian_Magyar-Unicode", "utf-16-le"),
        (r"Amahuaca", "latin1"),
        (r"Turkish_Turkce-Turkish", "latin5"),
        (r"Lithuanian_Lietuviskai-Baltic", "latin4"),
        (r"Japanese_Nihongo-EUC", "EUC-JP"),
        (r"Japanese_Nihongo-JIS", "iso2022_jp"),
        (r"Chinese_Mandarin-HZ", "hz"),
        (r"Abkhaz\-Cyrillic\+Abkh", "cp1251"),
    ]

    # File ids excluded from the reader, grouped by the reason for exclusion.
    SKIP = {
        # The following files are not fully decodable because they
        # were truncated at wrong bytes:
        "Burmese_Myanmar-UTF8",
        "Japanese_Nihongo-JIS",
        "Chinese_Mandarin-HZ",
        "Chinese_Mandarin-UTF8",
        "Gujarati-UTF8",
        "Hungarian_Magyar-Unicode",
        "Lao-UTF8",
        "Magahi-UTF8",
        "Marathi-UTF8",
        "Tamil-UTF8",
        # Unfortunately, encodings required for reading
        # the following files are not supported by Python:
        "Vietnamese-VPS",
        "Vietnamese-VIQR",
        "Vietnamese-TCVN",
        "Magahi-Agra",
        "Bhojpuri-Agra",
        "Esperanto-T61",  # latin3 raises an exception
        # The following files are encoded for specific fonts:
        "Burmese_Myanmar-WinResearcher",
        "Armenian-DallakHelv",
        "Tigrinya_Tigrigna-VG2Main",
        "Amharic-Afenegus6..60375",  # ?
        "Navaho_Dine-Navajo-Navaho-font",
        # What are these?
        "Azeri_Azerbaijani_Cyrillic-Az.Times.Cyr.Normal0117",
        "Azeri_Azerbaijani_Latin-Az.Times.Lat0117",
        # The following files are unintended:
        "Czech-Latin2-err",
        "Russian_Russky-UTF8~",
    }

    def __init__(self, root="udhr"):
        """Build the reader over the usable files found under *root*.

        :param root: corpus root handed to ``find_corpus_fileids`` and to the
            parent reader; defaults to ``"udhr"``.
        """
        # The negative lookahead rejects ids that start with "README" or
        # with a dot.
        fileids = find_corpus_fileids(root, r"(?!README|\.).*")
        super(UdhrCorpusReader, self).__init__(
            root,
            [fileid for fileid in fileids if fileid not in self.SKIP],
            encoding=self.ENCODINGS,
        )