| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576 |
- # -*- coding: utf-8 -*-
- """
- UDHR corpus reader. It mostly deals with encodings.
- """
- from nltk.corpus.reader.util import find_corpus_fileids
- from nltk.corpus.reader.plaintext import PlaintextCorpusReader
class UdhrCorpusReader(PlaintextCorpusReader):
    """
    Plaintext reader for the UDHR corpus.

    The class carries two tables: ``ENCODINGS``, which pairs fileid
    patterns with the codec used to decode matching files, and ``SKIP``,
    the fileids that are left out of the corpus altogether.
    """

    # (fileid regexp, codec name) pairs, handed to the base reader through
    # its ``encoding`` argument.
    # NOTE(review): the ordering looks deliberate -- e.g. "Czech_Cesky-UTF8"
    # sits before the generic ".*-UTF8$" entry -- which presumably relies on
    # the base reader using the first matching pattern; confirm against
    # NLTK's CorpusReader before reordering.
    ENCODINGS = [
        (".*-Latin1$", "latin-1"),
        (".*-Hebrew$", "hebrew"),
        (".*-Arabic$", "cp1256"),
        ("Czech_Cesky-UTF8", "cp1250"),  # yeah
        (".*-Cyrillic$", "cyrillic"),
        (".*-SJIS$", "SJIS"),
        (".*-GB2312$", "GB2312"),
        (".*-Latin2$", "ISO-8859-2"),
        (".*-Greek$", "greek"),
        (".*-UTF8$", "utf-8"),
        ("Hungarian_Magyar-Unicode", "utf-16-le"),
        ("Amahuaca", "latin1"),
        ("Turkish_Turkce-Turkish", "latin5"),
        ("Lithuanian_Lietuviskai-Baltic", "latin4"),
        ("Japanese_Nihongo-EUC", "EUC-JP"),
        ("Japanese_Nihongo-JIS", "iso2022_jp"),
        ("Chinese_Mandarin-HZ", "hz"),
        # Raw string: "\-" and "\+" are regex escapes, not valid Python
        # string escapes, so a plain literal triggers an invalid-escape
        # warning on modern interpreters.
        (r"Abkhaz\-Cyrillic\+Abkh", "cp1251"),
    ]

    # Fileids excluded from the corpus; compared by exact string equality
    # in __init__ (these are literal names, not patterns).
    SKIP = {
        # The following files are not fully decodable because they
        # were truncated at wrong bytes:
        "Burmese_Myanmar-UTF8",
        "Japanese_Nihongo-JIS",
        "Chinese_Mandarin-HZ",
        "Chinese_Mandarin-UTF8",
        "Gujarati-UTF8",
        "Hungarian_Magyar-Unicode",
        "Lao-UTF8",
        "Magahi-UTF8",
        "Marathi-UTF8",
        "Tamil-UTF8",
        # Unfortunately, encodings required for reading
        # the following files are not supported by Python:
        "Vietnamese-VPS",
        "Vietnamese-VIQR",
        "Vietnamese-TCVN",
        "Magahi-Agra",
        "Bhojpuri-Agra",
        "Esperanto-T61",  # latin3 raises an exception
        # The following files are encoded for specific fonts:
        "Burmese_Myanmar-WinResearcher",
        "Armenian-DallakHelv",
        "Tigrinya_Tigrigna-VG2Main",
        "Amharic-Afenegus6..60375",  # ?
        "Navaho_Dine-Navajo-Navaho-font",
        # What are these?
        "Azeri_Azerbaijani_Cyrillic-Az.Times.Cyr.Normal0117",
        "Azeri_Azerbaijani_Latin-Az.Times.Lat0117",
        # The following files are unintended:
        "Czech-Latin2-err",
        "Russian_Russky-UTF8~",
    }

    def __init__(self, root="udhr"):
        """
        Build the reader over every usable file found under ``root``.

        :param root: corpus root passed to ``find_corpus_fileids`` and to
            the parent constructor; defaults to ``"udhr"``.
        """
        # The negative lookahead rejects names beginning with "README" or
        # with a dot.
        candidates = find_corpus_fileids(root, r"(?!README|\.).*")
        # Drop the files known to be undecodable or unintended.
        usable = [fileid for fileid in candidates if fileid not in self.SKIP]
        super(UdhrCorpusReader, self).__init__(
            root,
            usable,
            encoding=self.ENCODINGS,
        )
|