panlex_lite.py 5.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174
# Natural Language Toolkit: PanLex Corpus Reader
#
# Copyright (C) 2001-2020 NLTK Project
# Author: David Kamholz <kamholz@panlex.org>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
CorpusReader for PanLex Lite, a stripped-down version of PanLex distributed
as an SQLite database. See the README.txt in the panlex_lite corpus directory
for more information on PanLex Lite.
"""
  12. import os
  13. import sqlite3
  14. from nltk.corpus.reader.api import CorpusReader
  15. class PanLexLiteCorpusReader(CorpusReader):
  16. MEANING_Q = """
  17. SELECT dnx2.mn, dnx2.uq, dnx2.ap, dnx2.ui, ex2.tt, ex2.lv
  18. FROM dnx
  19. JOIN ex ON (ex.ex = dnx.ex)
  20. JOIN dnx dnx2 ON (dnx2.mn = dnx.mn)
  21. JOIN ex ex2 ON (ex2.ex = dnx2.ex)
  22. WHERE dnx.ex != dnx2.ex AND ex.tt = ? AND ex.lv = ?
  23. ORDER BY dnx2.uq DESC
  24. """
  25. TRANSLATION_Q = """
  26. SELECT s.tt, sum(s.uq) AS trq FROM (
  27. SELECT ex2.tt, max(dnx.uq) AS uq
  28. FROM dnx
  29. JOIN ex ON (ex.ex = dnx.ex)
  30. JOIN dnx dnx2 ON (dnx2.mn = dnx.mn)
  31. JOIN ex ex2 ON (ex2.ex = dnx2.ex)
  32. WHERE dnx.ex != dnx2.ex AND ex.lv = ? AND ex.tt = ? AND ex2.lv = ?
  33. GROUP BY ex2.tt, dnx.ui
  34. ) s
  35. GROUP BY s.tt
  36. ORDER BY trq DESC, s.tt
  37. """
  38. def __init__(self, root):
  39. self._c = sqlite3.connect(os.path.join(root, "db.sqlite")).cursor()
  40. self._uid_lv = {}
  41. self._lv_uid = {}
  42. for row in self._c.execute("SELECT uid, lv FROM lv"):
  43. self._uid_lv[row[0]] = row[1]
  44. self._lv_uid[row[1]] = row[0]
  45. def language_varieties(self, lc=None):
  46. """
  47. Return a list of PanLex language varieties.
  48. :param lc: ISO 639 alpha-3 code. If specified, filters returned varieties
  49. by this code. If unspecified, all varieties are returned.
  50. :return: the specified language varieties as a list of tuples. The first
  51. element is the language variety's seven-character uniform identifier,
  52. and the second element is its default name.
  53. :rtype: list(tuple)
  54. """
  55. if lc is None:
  56. return self._c.execute("SELECT uid, tt FROM lv ORDER BY uid").fetchall()
  57. else:
  58. return self._c.execute(
  59. "SELECT uid, tt FROM lv WHERE lc = ? ORDER BY uid", (lc,)
  60. ).fetchall()
  61. def meanings(self, expr_uid, expr_tt):
  62. """
  63. Return a list of meanings for an expression.
  64. :param expr_uid: the expression's language variety, as a seven-character
  65. uniform identifier.
  66. :param expr_tt: the expression's text.
  67. :return: a list of Meaning objects.
  68. :rtype: list(Meaning)
  69. """
  70. expr_lv = self._uid_lv[expr_uid]
  71. mn_info = {}
  72. for i in self._c.execute(self.MEANING_Q, (expr_tt, expr_lv)):
  73. mn = i[0]
  74. uid = self._lv_uid[i[5]]
  75. if not mn in mn_info:
  76. mn_info[mn] = {
  77. "uq": i[1],
  78. "ap": i[2],
  79. "ui": i[3],
  80. "ex": {expr_uid: [expr_tt]},
  81. }
  82. if not uid in mn_info[mn]["ex"]:
  83. mn_info[mn]["ex"][uid] = []
  84. mn_info[mn]["ex"][uid].append(i[4])
  85. return [Meaning(mn, mn_info[mn]) for mn in mn_info]
  86. def translations(self, from_uid, from_tt, to_uid):
  87. """
  88. Return a list of translations for an expression into a single language
  89. variety.
  90. :param from_uid: the source expression's language variety, as a
  91. seven-character uniform identifier.
  92. :param from_tt: the source expression's text.
  93. :param to_uid: the target language variety, as a seven-character
  94. uniform identifier.
  95. :return a list of translation tuples. The first element is the expression
  96. text and the second element is the translation quality.
  97. :rtype: list(tuple)
  98. """
  99. from_lv = self._uid_lv[from_uid]
  100. to_lv = self._uid_lv[to_uid]
  101. return self._c.execute(self.TRANSLATION_Q, (from_lv, from_tt, to_lv)).fetchall()
  102. class Meaning(dict):
  103. """
  104. Represents a single PanLex meaning. A meaning is a translation set derived
  105. from a single source.
  106. """
  107. def __init__(self, mn, attr):
  108. super(Meaning, self).__init__(**attr)
  109. self["mn"] = mn
  110. def id(self):
  111. """
  112. :return: the meaning's id.
  113. :rtype: int
  114. """
  115. return self["mn"]
  116. def quality(self):
  117. """
  118. :return: the meaning's source's quality (0=worst, 9=best).
  119. :rtype: int
  120. """
  121. return self["uq"]
  122. def source(self):
  123. """
  124. :return: the meaning's source id.
  125. :rtype: int
  126. """
  127. return self["ap"]
  128. def source_group(self):
  129. """
  130. :return: the meaning's source group id.
  131. :rtype: int
  132. """
  133. return self["ui"]
  134. def expressions(self):
  135. """
  136. :return: the meaning's expressions as a dictionary whose keys are language
  137. variety uniform identifiers and whose values are lists of expression
  138. texts.
  139. :rtype: dict
  140. """
  141. return self["ex"]