gale_church.py 8.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266
  1. # -*- coding: utf-8 -*-
  2. # Natural Language Toolkit: Gale-Church Aligner
  3. #
  4. # Copyright (C) 2001-2020 NLTK Project
  5. # Author: Torsten Marek <marek@ifi.uzh.ch>
  6. # Contributor: Cassidy Laidlaw, Liling Tan
  7. # URL: <http://nltk.org/>
  8. # For license information, see LICENSE.TXT
  9. """
  10. A port of the Gale-Church Aligner.
  11. Gale & Church (1993), A Program for Aligning Sentences in Bilingual Corpora.
  12. http://aclweb.org/anthology/J93-1004.pdf
  13. """
  14. import math
  15. try:
  16. from scipy.stats import norm
  17. from norm import logsf as norm_logsf
  18. except ImportError:
  19. def erfcc(x):
  20. """Complementary error function."""
  21. z = abs(x)
  22. t = 1 / (1 + 0.5 * z)
  23. r = t * math.exp(
  24. -z * z
  25. - 1.26551223
  26. + t
  27. * (
  28. 1.00002368
  29. + t
  30. * (
  31. 0.37409196
  32. + t
  33. * (
  34. 0.09678418
  35. + t
  36. * (
  37. -0.18628806
  38. + t
  39. * (
  40. 0.27886807
  41. + t
  42. * (
  43. -1.13520398
  44. + t
  45. * (1.48851587 + t * (-0.82215223 + t * 0.17087277))
  46. )
  47. )
  48. )
  49. )
  50. )
  51. )
  52. )
  53. if x >= 0.0:
  54. return r
  55. else:
  56. return 2.0 - r
  57. def norm_cdf(x):
  58. """Return the area under the normal distribution from M{-∞..x}."""
  59. return 1 - 0.5 * erfcc(x / math.sqrt(2))
  60. def norm_logsf(x):
  61. try:
  62. return math.log(1 - norm_cdf(x))
  63. except ValueError:
  64. return float("-inf")
  65. LOG2 = math.log(2)
class LanguageIndependent(object):
    # These are the language-independent probabilities and parameters
    # given in Gale & Church
    # for the computation, l_1 is always the language with less characters

    # Prior probability of each alignment type.  Keys are tuples
    # (number of source sentences, number of target sentences), e.g.
    # (1, 1) is a 1:1 alignment, (2, 1) merges two source sentences
    # into one target sentence, (1, 0) is a deletion.
    PRIORS = {
        (1, 0): 0.0099,
        (0, 1): 0.0099,
        (1, 1): 0.89,
        (2, 1): 0.089,
        (1, 2): 0.089,
        (2, 2): 0.011,
    }

    # Expected number of target characters per source character (c in the paper).
    AVERAGE_CHARACTERS = 1
    # Variance of target characters per source character (s^2 in the paper).
    VARIANCE_CHARACTERS = 6.8
  80. def trace(backlinks, source_sents_lens, target_sents_lens):
  81. """
  82. Traverse the alignment cost from the tracebacks and retrieves
  83. appropriate sentence pairs.
  84. :param backlinks: A dictionary where the key is the alignment points and value is the cost (referencing the LanguageIndependent.PRIORS)
  85. :type backlinks: dict
  86. :param source_sents_lens: A list of target sentences' lengths
  87. :type source_sents_lens: list(int)
  88. :param target_sents_lens: A list of target sentences' lengths
  89. :type target_sents_lens: list(int)
  90. """
  91. links = []
  92. position = (len(source_sents_lens), len(target_sents_lens))
  93. while position != (0, 0) and all(p >= 0 for p in position):
  94. try:
  95. s, t = backlinks[position]
  96. except TypeError:
  97. position = (position[0] - 1, position[1] - 1)
  98. continue
  99. for i in range(s):
  100. for j in range(t):
  101. links.append((position[0] - i - 1, position[1] - j - 1))
  102. position = (position[0] - s, position[1] - t)
  103. return links[::-1]
  104. def align_log_prob(i, j, source_sents, target_sents, alignment, params):
  105. """Returns the log probability of the two sentences C{source_sents[i]}, C{target_sents[j]}
  106. being aligned with a specific C{alignment}.
  107. @param i: The offset of the source sentence.
  108. @param j: The offset of the target sentence.
  109. @param source_sents: The list of source sentence lengths.
  110. @param target_sents: The list of target sentence lengths.
  111. @param alignment: The alignment type, a tuple of two integers.
  112. @param params: The sentence alignment parameters.
  113. @returns: The log probability of a specific alignment between the two sentences, given the parameters.
  114. """
  115. l_s = sum(source_sents[i - offset - 1] for offset in range(alignment[0]))
  116. l_t = sum(target_sents[j - offset - 1] for offset in range(alignment[1]))
  117. try:
  118. # actually, the paper says l_s * params.VARIANCE_CHARACTERS, this is based on the C
  119. # reference implementation. With l_s in the denominator, insertions are impossible.
  120. m = (l_s + l_t / params.AVERAGE_CHARACTERS) / 2
  121. delta = (l_s * params.AVERAGE_CHARACTERS - l_t) / math.sqrt(
  122. m * params.VARIANCE_CHARACTERS
  123. )
  124. except ZeroDivisionError:
  125. return float("-inf")
  126. return -(LOG2 + norm_logsf(abs(delta)) + math.log(params.PRIORS[alignment]))
def align_blocks(source_sents_lens, target_sents_lens, params=LanguageIndependent):
    """Return the sentence alignment of two text blocks (usually paragraphs).

    >>> align_blocks([5,5,5], [7,7,7])
    [(0, 0), (1, 1), (2, 2)]
    >>> align_blocks([10,5,5], [12,20])
    [(0, 0), (1, 1), (2, 1)]
    >>> align_blocks([12,20], [10,5,5])
    [(0, 0), (1, 1), (1, 2)]
    >>> align_blocks([10,2,10,10,2,10], [12,3,20,3,12])
    [(0, 0), (1, 1), (2, 2), (3, 2), (4, 3), (5, 4)]

    @param source_sents_lens: The list of source sentence lengths.
    @param target_sents_lens: The list of target sentence lengths.
    @param params: the sentence alignment parameters.
    @return: The sentence alignments, a list of index pairs.
    """
    alignment_types = list(params.PRIORS.keys())

    # Dynamic programming over the (i, j) table of alignment costs, where
    # align_log_prob supplies a *negated* log probability, so lower is better.
    # D keeps only a rolling window of the most recent rows (the alignment
    # types look back at most 2 source sentences), indexed from the end:
    # there are always three rows in the history (with the last of them being filled)
    D = [[]]
    backlinks = {}

    for i in range(len(source_sents_lens) + 1):
        for j in range(len(target_sents_lens) + 1):
            min_dist = float("inf")
            min_align = None
            for a in alignment_types:
                # Predecessor cell, with the row index relative to the end of D
                # (-1 is the row currently being filled).
                prev_i = -1 - a[0]
                prev_j = j - a[1]
                if prev_i < -len(D) or prev_j < 0:
                    # Predecessor lies outside the table; alignment type not applicable.
                    continue
                p = D[prev_i][prev_j] + align_log_prob(
                    i, j, source_sents_lens, target_sents_lens, a, params
                )
                if p < min_dist:
                    min_dist = p
                    min_align = a

            # Unreachable cells (notably the origin (0, 0)) get cost 0 and no backlink.
            if min_dist == float("inf"):
                min_dist = 0

            backlinks[(i, j)] = min_align
            D[-1].append(min_dist)

        # Slide the row window: drop the oldest row, open a fresh one.
        if len(D) > 2:
            D.pop(0)

        D.append([])

    return trace(backlinks, source_sents_lens, target_sents_lens)
  169. def align_texts(source_blocks, target_blocks, params=LanguageIndependent):
  170. """Creates the sentence alignment of two texts.
  171. Texts can consist of several blocks. Block boundaries cannot be crossed by sentence
  172. alignment links.
  173. Each block consists of a list that contains the lengths (in characters) of the sentences
  174. in this block.
  175. @param source_blocks: The list of blocks in the source text.
  176. @param target_blocks: The list of blocks in the target text.
  177. @param params: the sentence alignment parameters.
  178. @returns: A list of sentence alignment lists
  179. """
  180. if len(source_blocks) != len(target_blocks):
  181. raise ValueError(
  182. "Source and target texts do not have the same number of blocks."
  183. )
  184. return [
  185. align_blocks(source_block, target_block, params)
  186. for source_block, target_block in zip(source_blocks, target_blocks)
  187. ]
  188. # File I/O functions; may belong in a corpus reader
  189. def split_at(it, split_value):
  190. """Splits an iterator C{it} at values of C{split_value}.
  191. Each instance of C{split_value} is swallowed. The iterator produces
  192. subiterators which need to be consumed fully before the next subiterator
  193. can be used.
  194. """
  195. def _chunk_iterator(first):
  196. v = first
  197. while v != split_value:
  198. yield v
  199. v = it.next()
  200. while True:
  201. yield _chunk_iterator(it.next())
  202. def parse_token_stream(stream, soft_delimiter, hard_delimiter):
  203. """Parses a stream of tokens and splits it into sentences (using C{soft_delimiter} tokens)
  204. and blocks (using C{hard_delimiter} tokens) for use with the L{align_texts} function.
  205. """
  206. return [
  207. [
  208. sum(len(token) for token in sentence_it)
  209. for sentence_it in split_at(block_it, soft_delimiter)
  210. ]
  211. for block_it in split_at(stream, hard_delimiter)
  212. ]