crubadan.py 3.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107
  1. # -*- coding: utf-8 -*-
  2. # Natural Language Toolkit: An Crubadan N-grams Reader
  3. #
  4. # Copyright (C) 2001-2020 NLTK Project
  5. # Author: Avital Pekker <avital.pekker@utoronto.ca>
  6. #
  7. # URL: <http://nltk.org/>
  8. # For license information, see LICENSE.TXT
  9. """
  10. An NLTK interface for the n-gram statistics gathered from
  11. the corpora for each language using An Crubadan.
  12. There are multiple potential applications for the data but
  13. this reader was created with the goal of using it in the
  14. context of language identification.
  15. For details about An Crubadan, this data, and its potential uses, see:
  16. http://borel.slu.edu/crubadan/index.html
  17. """
  18. import re
  19. from os import path
  20. from nltk.corpus.reader import CorpusReader
  21. from nltk.probability import FreqDist
  22. from nltk.data import ZipFilePathPointer
  23. class CrubadanCorpusReader(CorpusReader):
  24. """
  25. A corpus reader used to access language An Crubadan n-gram files.
  26. """
  27. _LANG_MAPPER_FILE = "table.txt"
  28. _all_lang_freq = {}
  29. def __init__(self, root, fileids, encoding="utf8", tagset=None):
  30. super(CrubadanCorpusReader, self).__init__(root, fileids, encoding="utf8")
  31. self._lang_mapping_data = []
  32. self._load_lang_mapping_data()
  33. def lang_freq(self, lang):
  34. """ Return n-gram FreqDist for a specific language
  35. given ISO 639-3 language code """
  36. if lang not in self._all_lang_freq:
  37. self._all_lang_freq[lang] = self._load_lang_ngrams(lang)
  38. return self._all_lang_freq[lang]
  39. def langs(self):
  40. """ Return a list of supported languages as ISO 639-3 codes """
  41. return [row[1] for row in self._lang_mapping_data]
  42. def iso_to_crubadan(self, lang):
  43. """ Return internal Crubadan code based on ISO 639-3 code """
  44. for i in self._lang_mapping_data:
  45. if i[1].lower() == lang.lower():
  46. return i[0]
  47. def crubadan_to_iso(self, lang):
  48. """ Return ISO 639-3 code given internal Crubadan code """
  49. for i in self._lang_mapping_data:
  50. if i[0].lower() == lang.lower():
  51. return i[1]
  52. def _load_lang_mapping_data(self):
  53. """ Load language mappings between codes and description from table.txt """
  54. if isinstance(self.root, ZipFilePathPointer):
  55. raise RuntimeError(
  56. "Please install the 'crubadan' corpus first, use nltk.download()"
  57. )
  58. mapper_file = path.join(self.root, self._LANG_MAPPER_FILE)
  59. if self._LANG_MAPPER_FILE not in self.fileids():
  60. raise RuntimeError("Could not find language mapper file: " + mapper_file)
  61. raw = open(mapper_file, "r", encoding="utf-8").read().strip()
  62. self._lang_mapping_data = [row.split("\t") for row in raw.split("\n")]
  63. def _load_lang_ngrams(self, lang):
  64. """ Load single n-gram language file given the ISO 639-3 language code
  65. and return its FreqDist """
  66. if lang not in self.langs():
  67. raise RuntimeError("Unsupported language.")
  68. crubadan_code = self.iso_to_crubadan(lang)
  69. ngram_file = path.join(self.root, crubadan_code + "-3grams.txt")
  70. if not path.isfile(ngram_file):
  71. raise RuntimeError("No N-gram file found for requested language.")
  72. counts = FreqDist()
  73. f = open(ngram_file, "r", encoding="utf-8")
  74. for line in f:
  75. data = line.split(" ")
  76. ngram = data[1].strip("\n")
  77. freq = int(data[0])
  78. counts[ngram] = freq
  79. return counts