# -*- coding: utf-8 -*-
# Natural Language Toolkit: Language ID module using TextCat algorithm
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Avital Pekker <avital.pekker@utoronto.ca>
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

  9. """
  10. A module for language identification using the TextCat algorithm.
  11. An implementation of the text categorization algorithm
  12. presented in Cavnar, W. B. and J. M. Trenkle,
  13. "N-Gram-Based Text Categorization".
  14. The algorithm takes advantage of Zipf's law and uses
  15. n-gram frequencies to profile languages and text-yet to
  16. be identified-then compares using a distance measure.
  17. Language n-grams are provided by the "An Crubadan"
  18. project. A corpus reader was created separately to read
  19. those files.
  20. For details regarding the algorithm, see:
  21. http://www.let.rug.nl/~vannoord/TextCat/textcat.pdf
  22. For details about An Crubadan, see:
  23. http://borel.slu.edu/crubadan/index.html
  24. """
from sys import maxsize

from nltk.util import trigrams

# Note: this is NOT the "re" module you're likely used to. The regex
# module is an alternative to the standard re module that supports
# Unicode codepoint properties with the \p{} syntax.
# You may have to "pip install regex".
try:
    import regex as re
except ImportError:
    re = None

######################################################################
## Language identification using TextCat
######################################################################

class TextCat(object):

    _corpus = None
    fingerprints = {}
    _START_CHAR = "<"
    _END_CHAR = ">"

    last_distances = {}

    def __init__(self):
        if not re:
            raise EnvironmentError(
                "classify.textcat requires the regex module that "
                "supports unicode. Try '$ pip install regex' and "
                "see https://pypi.python.org/pypi/regex for "
                "further details."
            )

        from nltk.corpus import crubadan

        self._corpus = crubadan
        # Load all language ngrams into cache
        for lang in self._corpus.langs():
            self._corpus.lang_freq(lang)

    def remove_punctuation(self, text):
        """ Get rid of punctuation except apostrophes """
        return re.sub(r"[^\P{P}\']+", "", text)

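    # For example (assuming the regex module's Unicode \p{} syntax):
    #   remove_punctuation("Hello, don't stop!")  ->  "Hello don't stop"
    # The class [^\P{P}\'] matches any punctuation character other than
    # the apostrophe, so contractions survive tokenization intact.
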
    def profile(self, text):
        """ Create FreqDist of trigrams within text """
        from nltk import word_tokenize, FreqDist

        clean_text = self.remove_punctuation(text)
        tokens = word_tokenize(clean_text)

        fingerprint = FreqDist()
        for t in tokens:
            token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
            token_trigrams = ["".join(tri) for tri in token_trigram_tuples]

            for cur_trigram in token_trigrams:
                if cur_trigram in fingerprint:
                    fingerprint[cur_trigram] += 1
                else:
                    fingerprint[cur_trigram] = 1

        return fingerprint

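    # For example, the token "cat" is padded to "<cat>" and yields the
    # trigrams "<ca", "cat", "at>". The "<" and ">" padding marks word
    # boundaries, so word-initial and word-final character sequences
    # get their own trigram counts.
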
    def calc_dist(self, lang, trigram, text_profile):
        """ Calculate the "out-of-place" measure between the
            text and language profile for a single trigram """
        lang_fd = self._corpus.lang_freq(lang)
        dist = 0

        if trigram in lang_fd:
            idx_lang_profile = list(lang_fd.keys()).index(trigram)
            idx_text = list(text_profile.keys()).index(trigram)

            # print(idx_lang_profile, ", ", idx_text)
            dist = abs(idx_lang_profile - idx_text)
        else:
            # Arbitrary but should be larger than
            # any possible trigram file length
            # in terms of total lines
            dist = maxsize

        return dist

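    # Worked example of the out-of-place measure (after Cavnar & Trenkle):
    # if a trigram sits at position 3 in the language profile and at
    # position 7 in the text profile, its distance is |3 - 7| = 4.
    # Trigrams absent from the language profile get the maximum penalty
    # (sys.maxsize) instead.
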
    def lang_dists(self, text):
        """ Calculate the "out-of-place" measure between
            the text and all languages """
        distances = {}
        profile = self.profile(text)

        # For all the languages
        for lang in self._corpus._all_lang_freq.keys():
            # Calculate distance metric for every trigram in
            # input text to be identified
            lang_dist = 0
            for trigram in profile:
                lang_dist += self.calc_dist(lang, trigram, profile)

            distances[lang] = lang_dist

        return distances

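    # The result is a dict keyed by language code, with illustrative
    # (not actual) values, e.g. {"eng": 102534, "fra": 241080, ...};
    # a lower total means the language profile is a closer match.
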
    def guess_language(self, text):
        """ Find the language with the min distance
            to the text and return its ISO 639-3 code """
        self.last_distances = self.lang_dists(text)

        return min(self.last_distances, key=self.last_distances.get)

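    # The raw scores remain available after a call, e.g. (a sketch):
    #   tc.guess_language(some_text)
    #   ranked = sorted(tc.last_distances.items(), key=lambda kv: kv[1])
    #   ranked[:3]  # the three closest language candidates
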
#################################################


def demo():
    from nltk.corpus import udhr

    langs = [
        "Kurdish-UTF8",
        "Abkhaz-UTF8",
        "Farsi_Persian-UTF8",
        "Hindi-UTF8",
        "Hawaiian-UTF8",
        "Russian-UTF8",
        "Vietnamese-UTF8",
        "Serbian_Srpski-UTF8",
        "Esperanto-UTF8",
    ]

    friendly = {
        "kmr": "Northern Kurdish",
        "abk": "Abkhazian",
        "pes": "Iranian Persian",
        "hin": "Hindi",
        "haw": "Hawaiian",
        "rus": "Russian",
        "vie": "Vietnamese",
        "srp": "Serbian",
        "epo": "Esperanto",
    }

    tc = TextCat()

    for cur_lang in langs:
        # Get raw data from UDHR corpus
        raw_sentences = udhr.sents(cur_lang)
        rows = len(raw_sentences) - 1
        cols = list(map(len, raw_sentences))

        sample = ""

        # Generate a sample text of the language
        for i in range(0, rows):
            cur_sent = ""
            for j in range(0, cols[i]):
                cur_sent += " " + raw_sentences[i][j]

            sample += cur_sent

        # Try to detect what it is
        print("Language snippet: " + sample[0:140] + "...")
        guess = tc.guess_language(sample)
        print("Language detection: %s (%s)" % (guess, friendly[guess]))
        print("#" * 140)


if __name__ == "__main__":
    demo()