udhr.py 2.6 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576
  1. # -*- coding: utf-8 -*-
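"""
UDHR corpus reader.

Most of this module is bookkeeping about character encodings: which codec
each corpus file is read with, and which files are left out of the reader.
"""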
  5. from nltk.corpus.reader.util import find_corpus_fileids
  6. from nltk.corpus.reader.plaintext import PlaintextCorpusReader
  7. class UdhrCorpusReader(PlaintextCorpusReader):
  8. ENCODINGS = [
  9. (".*-Latin1$", "latin-1"),
  10. (".*-Hebrew$", "hebrew"),
  11. (".*-Arabic$", "cp1256"),
  12. ("Czech_Cesky-UTF8", "cp1250"), # yeah
  13. (".*-Cyrillic$", "cyrillic"),
  14. (".*-SJIS$", "SJIS"),
  15. (".*-GB2312$", "GB2312"),
  16. (".*-Latin2$", "ISO-8859-2"),
  17. (".*-Greek$", "greek"),
  18. (".*-UTF8$", "utf-8"),
  19. ("Hungarian_Magyar-Unicode", "utf-16-le"),
  20. ("Amahuaca", "latin1"),
  21. ("Turkish_Turkce-Turkish", "latin5"),
  22. ("Lithuanian_Lietuviskai-Baltic", "latin4"),
  23. ("Japanese_Nihongo-EUC", "EUC-JP"),
  24. ("Japanese_Nihongo-JIS", "iso2022_jp"),
  25. ("Chinese_Mandarin-HZ", "hz"),
  26. ("Abkhaz\-Cyrillic\+Abkh", "cp1251"),
  27. ]
  28. SKIP = set(
  29. [
  30. # The following files are not fully decodable because they
  31. # were truncated at wrong bytes:
  32. "Burmese_Myanmar-UTF8",
  33. "Japanese_Nihongo-JIS",
  34. "Chinese_Mandarin-HZ",
  35. "Chinese_Mandarin-UTF8",
  36. "Gujarati-UTF8",
  37. "Hungarian_Magyar-Unicode",
  38. "Lao-UTF8",
  39. "Magahi-UTF8",
  40. "Marathi-UTF8",
  41. "Tamil-UTF8",
  42. # Unfortunately, encodings required for reading
  43. # the following files are not supported by Python:
  44. "Vietnamese-VPS",
  45. "Vietnamese-VIQR",
  46. "Vietnamese-TCVN",
  47. "Magahi-Agra",
  48. "Bhojpuri-Agra",
  49. "Esperanto-T61", # latin3 raises an exception
  50. # The following files are encoded for specific fonts:
  51. "Burmese_Myanmar-WinResearcher",
  52. "Armenian-DallakHelv",
  53. "Tigrinya_Tigrigna-VG2Main",
  54. "Amharic-Afenegus6..60375", # ?
  55. "Navaho_Dine-Navajo-Navaho-font",
  56. # What are these?
  57. "Azeri_Azerbaijani_Cyrillic-Az.Times.Cyr.Normal0117",
  58. "Azeri_Azerbaijani_Latin-Az.Times.Lat0117",
  59. # The following files are unintended:
  60. "Czech-Latin2-err",
  61. "Russian_Russky-UTF8~",
  62. ]
  63. )
  64. def __init__(self, root="udhr"):
  65. fileids = find_corpus_fileids(root, r"(?!README|\.).*")
  66. super(UdhrCorpusReader, self).__init__(
  67. root,
  68. [fileid for fileid in fileids if fileid not in self.SKIP],
  69. encoding=self.ENCODINGS,
  70. )