# -*- coding: utf-8 -*-
# Natural Language Toolkit: IBM Model 1
#
# Copyright (C) 2001-2013 NLTK Project
# Author: Chin Yee Lee <c.lee32@student.unimelb.edu.au>
#         Hengfeng Li <hengfeng12345@gmail.com>
#         Ruxin Hou <r.hou@student.unimelb.edu.au>
#         Calvin Tanujaya Lim <c.tanujayalim@gmail.com>
# Based on earlier version by:
#         Will Zhang <wilzzha@gmail.com>
#         Guan Gui <ggui@student.unimelb.edu.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
Lexical translation model that ignores word order.

In IBM Model 1, word order is ignored for simplicity. As long as the
word alignments are equivalent, it doesn't matter where the word occurs
in the source or target sentence. Thus, the following three alignments
are equally likely.

Source: je mange du jambon
Target: i eat some ham
Alignment: (0,0) (1,1) (2,2) (3,3)

Source: je mange du jambon
Target: some ham eat i
Alignment: (0,2) (1,3) (2,1) (3,0)

Source: du jambon je mange
Target: eat i some ham
Alignment: (0,3) (1,2) (2,0) (3,1)

Note that an alignment is represented here as
(word_index_in_target, word_index_in_source).

The EM algorithm used in Model 1 is:

E step - In the training data, count how many times a source language
         word is translated into a target language word, weighted by
         the prior probability of the translation.

M step - Estimate the new probability of translation based on the
         counts from the Expectation step.

(The corresponding update rule is sketched in the comment block that
follows this docstring.)

Notations:

i: Position in the source sentence
    Valid values are 0 (for NULL), 1, 2, ..., length of source sentence
j: Position in the target sentence
    Valid values are 1, 2, ..., length of target sentence
s: A word in the source language
t: A word in the target language

References:

Philipp Koehn. 2010. Statistical Machine Translation.
Cambridge University Press, New York.

Peter E. Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
Robert L. Mercer. 1993. The Mathematics of Statistical Machine
Translation: Parameter Estimation. Computational Linguistics, 19 (2),
263-311.
"""
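# The EM update implemented in train() below can be summarised as follows.
# This is a restatement in informal notation, not taken verbatim from the
# original module; NULL counts as an extra source word:
#
#   E step: for each sentence pair and each target word t,
#       c(t|s) += P(t|s) / sum over s' in src_sentence of P(t|s')
#   M step: for each source word s,
#       P(t|s) = c(t|s) / sum over t' of c(t'|s)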
import warnings
from collections import defaultdict

from nltk.translate import AlignedSent
from nltk.translate import Alignment
from nltk.translate import IBMModel
from nltk.translate.ibm_model import Counts

class IBMModel1(IBMModel):
    """
    Lexical translation model that ignores word order

    >>> bitext = []
    >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']))
    >>> bitext.append(AlignedSent(['das', 'haus', 'ist', 'ja', 'groß'], ['the', 'house', 'is', 'big']))
    >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']))
    >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house']))
    >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book']))
    >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book']))

    >>> ibm1 = IBMModel1(bitext, 5)

    >>> print(ibm1.translation_table['buch']['book'])
    0.889...
    >>> print(ibm1.translation_table['das']['book'])
    0.061...
    >>> print(ibm1.translation_table['buch'][None])
    0.113...
    >>> print(ibm1.translation_table['ja'][None])
    0.072...

    >>> test_sentence = bitext[2]
    >>> test_sentence.words
    ['das', 'buch', 'ist', 'ja', 'klein']
    >>> test_sentence.mots
    ['the', 'book', 'is', 'small']
    >>> test_sentence.alignment
    Alignment([(0, 0), (1, 1), (2, 2), (3, 2), (4, 3)])
    """
    def __init__(self, sentence_aligned_corpus, iterations, probability_tables=None):
        """
        Train on ``sentence_aligned_corpus`` and create a lexical
        translation model.

        Translation direction is from ``AlignedSent.mots`` to
        ``AlignedSent.words``.

        :param sentence_aligned_corpus: Sentence-aligned parallel corpus
        :type sentence_aligned_corpus: list(AlignedSent)

        :param iterations: Number of iterations to run training algorithm
        :type iterations: int

        :param probability_tables: Optional. Use this to pass in custom
            probability values. If not specified, probabilities will be
            set to a uniform distribution, or some other sensible value.
            If specified, the following entry must be present:
            ``translation_table``.
            See ``IBMModel`` for the type and purpose of this table.
        :type probability_tables: dict[str]: object
        """
        super(IBMModel1, self).__init__(sentence_aligned_corpus)

        if probability_tables is None:
            self.set_uniform_probabilities(sentence_aligned_corpus)
        else:
            # Set user-defined probabilities
            self.translation_table = probability_tables["translation_table"]

        for n in range(0, iterations):
            self.train(sentence_aligned_corpus)
        self.align_all(sentence_aligned_corpus)
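    # Illustrative sketch (not in the original source): the ``probability_tables``
    # parameter documented above can be used to warm-start training from an
    # existing model's lexical table, e.g.
    #
    #     tables = {"translation_table": previously_trained_ibm1.translation_table}
    #     ibm1 = IBMModel1(bitext, 2, probability_tables=tables)
    #
    # ``previously_trained_ibm1`` and ``bitext`` are hypothetical names.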
    def set_uniform_probabilities(self, sentence_aligned_corpus):
        initial_prob = 1 / len(self.trg_vocab)
        if initial_prob < IBMModel.MIN_PROB:
            warnings.warn(
                "Target language vocabulary is too large ("
                + str(len(self.trg_vocab))
                + " words). "
                "Results may be less accurate."
            )

        for t in self.trg_vocab:
            self.translation_table[t] = defaultdict(lambda: initial_prob)
    def train(self, parallel_corpus):
        counts = Counts()
        for aligned_sentence in parallel_corpus:
            trg_sentence = aligned_sentence.words
            src_sentence = [None] + aligned_sentence.mots

            # E step (a): Compute normalization factors to weigh counts
            total_count = self.prob_all_alignments(src_sentence, trg_sentence)

            # E step (b): Collect counts
            for t in trg_sentence:
                for s in src_sentence:
                    count = self.prob_alignment_point(s, t)
                    normalized_count = count / total_count[t]
                    counts.t_given_s[t][s] += normalized_count
                    counts.any_t_given_s[s] += normalized_count

        # M step: Update probabilities with maximum likelihood estimate
        self.maximize_lexical_translation_probabilities(counts)
    def prob_all_alignments(self, src_sentence, trg_sentence):
        """
        Computes the probability of all possible word alignments,
        expressed as a marginal distribution over target words t

        Each entry in the return value represents the contribution to
        the total alignment probability by the target word t.

        To obtain probability(alignment | src_sentence, trg_sentence),
        simply sum the entries in the return value.

        :return: Probability of t for all s in ``src_sentence``
        :rtype: dict(str): float
        """
        alignment_prob_for_t = defaultdict(lambda: 0.0)
        for t in trg_sentence:
            for s in src_sentence:
                alignment_prob_for_t[t] += self.prob_alignment_point(s, t)
        return alignment_prob_for_t
    def prob_alignment_point(self, s, t):
        """
        Probability that word ``t`` in the target sentence is aligned to
        word ``s`` in the source sentence
        """
        return self.translation_table[t][s]
    def prob_t_a_given_s(self, alignment_info):
        """
        Probability of target sentence and an alignment given the
        source sentence
        """
        prob = 1.0

        for j, i in enumerate(alignment_info.alignment):
            if j == 0:
                continue  # skip the dummy zeroeth element
            trg_word = alignment_info.trg_sentence[j]
            src_word = alignment_info.src_sentence[i]
            prob *= self.translation_table[trg_word][src_word]

        return max(prob, IBMModel.MIN_PROB)
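    # Note (an added gloss, not from the original source): in Model 1 every
    # alignment of a given sentence pair is equally likely a priori, so
    # prob_t_a_given_s() only multiplies the lexical probabilities
    # t(t_j | s_{a_j}); the constant factor epsilon / (l + 1)^m from
    # Brown et al. (1993) is omitted, which does not affect comparisons
    # between alignments of the same sentence pair.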
    def align_all(self, parallel_corpus):
        for sentence_pair in parallel_corpus:
            self.align(sentence_pair)
    def align(self, sentence_pair):
        """
        Determines the best word alignment for one sentence pair from
        the corpus that the model was trained on.

        The best alignment will be set in ``sentence_pair`` when the
        method returns. In contrast with the internal implementation of
        IBM models, the word indices in the ``Alignment`` are zero-
        indexed, not one-indexed.

        :param sentence_pair: A sentence in the source language and its
            counterpart sentence in the target language
        :type sentence_pair: AlignedSent
        """
        best_alignment = []

        for j, trg_word in enumerate(sentence_pair.words):
            # Initialize trg_word to align with the NULL token
            best_prob = max(self.translation_table[trg_word][None], IBMModel.MIN_PROB)
            best_alignment_point = None
            for i, src_word in enumerate(sentence_pair.mots):
                align_prob = self.translation_table[trg_word][src_word]
                if align_prob >= best_prob:  # prefer newer word in case of tie
                    best_prob = align_prob
                    best_alignment_point = i

            best_alignment.append((j, best_alignment_point))

        sentence_pair.alignment = Alignment(best_alignment)
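

# Illustrative usage sketch (not part of the original module): trains on the
# same toy bitext used in the class doctest above and prints a couple of the
# trained translation probabilities plus the alignment chosen for one
# sentence pair.
if __name__ == "__main__":
    bitext = [
        AlignedSent(["klein", "ist", "das", "haus"], ["the", "house", "is", "small"]),
        AlignedSent(["das", "haus", "ist", "ja", "groß"], ["the", "house", "is", "big"]),
        AlignedSent(["das", "buch", "ist", "ja", "klein"], ["the", "book", "is", "small"]),
        AlignedSent(["das", "haus"], ["the", "house"]),
        AlignedSent(["das", "buch"], ["the", "book"]),
        AlignedSent(["ein", "buch"], ["a", "book"]),
    ]
    ibm1 = IBMModel1(bitext, 5)

    # Values should roughly match the doctest above (e.g. ~0.889 for the first).
    print(ibm1.translation_table["buch"]["book"])
    print(ibm1.translation_table["das"]["book"])

    # align_all() runs at the end of training, so each AlignedSent now carries
    # the model's best word alignment.
    print(bitext[2].alignment)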