test_ibm_model.py 9.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279
  1. # -*- coding: utf-8 -*-
  2. """
  3. Tests for common methods of IBM translation models
  4. """
  5. import unittest
  6. from collections import defaultdict
  7. from nltk.translate import AlignedSent
  8. from nltk.translate import IBMModel
  9. from nltk.translate.ibm_model import AlignmentInfo
  class TestIBMModel(unittest.TestCase):
      """Tests for the alignment helpers exposed by ``IBMModel``.

      Covers vocabulary initialisation, ``best_model2_alignment``,
      ``neighboring``, ``hillclimb`` and ``sample``.  Sentence pairs are
      built as ``AlignedSent(target_words, source_words)``.
      """

      __TEST_SRC_SENTENCE = ["j'", 'aime', 'bien', 'jambon']
      __TEST_TRG_SENTENCE = ['i', 'love', 'ham']

      # ------------------------------------------------------------------
      # Fixture helpers (names do not start with ``test`` so unittest does
      # not collect them).
      # ------------------------------------------------------------------
      @staticmethod
      def _base_translation_table():
          """Return a fresh translation table for 'i', 'love' and 'ham'.

          Outer keys are target words; inner keys are the source words of
          the test sentence plus ``None`` (presumably the NULL source
          token).  A new dict is built on every call so tests can extend it
          without affecting each other.
          """
          return {
              'i': {"j'": 0.9, 'aime': 0.05, 'bien': 0.02, 'jambon': 0.03, None: 0},
              'love': {"j'": 0.05, 'aime': 0.9, 'bien': 0.01, 'jambon': 0.01, None: 0.03},
              'ham': {"j'": 0, 'aime': 0.01, 'bien': 0, 'jambon': 0.99, None: 0},
          }

      @staticmethod
      def _model_with_tables(translation_table):
          """Return an ``IBMModel`` over an empty corpus with tables installed.

          :param translation_table: table assigned to
              ``ibm_model.translation_table``.
          :return: the model, whose ``alignment_table`` yields 0.2 for any
              four-level lookup, so alignment probabilities are uniform.
          """
          ibm_model = IBMModel([])
          ibm_model.translation_table = translation_table
          ibm_model.alignment_table = defaultdict(
              lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.2)))
          )
          return ibm_model

      @staticmethod
      def _green_eggs_alignment_info():
          """Return the ``AlignmentInfo`` fixture shared by the neighboring tests.

          Alignment ``(0, 3, 2)``: 'green' is aligned with 'verts' and 'eggs'
          with 'œufs'; index 0 of the alignment and of the target tuple is
          unused.
          """
          return AlignmentInfo(
              (0, 3, 2),
              (None, 'des', 'œufs', 'verts'),
              ('UNUSED', 'green', 'eggs'),
              [[], [], [2], [1]],
          )

      # ------------------------------------------------------------------
      # Vocabulary initialisation
      # ------------------------------------------------------------------
      def test_vocabularies_are_initialized(self):
          """Vocabularies are collected from every sentence pair in the corpus."""
          parallel_corpora = [
              AlignedSent(['one', 'two', 'three', 'four'], ['un', 'deux', 'trois']),
              AlignedSent(['five', 'one', 'six'], ['quatre', 'cinq', 'six']),
              AlignedSent([], ['sept']),
          ]

          ibm_model = IBMModel(parallel_corpora)

          # 7 distinct source words plus the NULL token
          self.assertEqual(len(ibm_model.src_vocab), 8)
          self.assertEqual(len(ibm_model.trg_vocab), 6)

      def test_vocabularies_are_initialized_even_with_empty_corpora(self):
          """An empty corpus still yields initialised vocabularies."""
          parallel_corpora = []

          ibm_model = IBMModel(parallel_corpora)

          self.assertEqual(len(ibm_model.src_vocab), 1)  # addition of NULL token
          self.assertEqual(len(ibm_model.trg_vocab), 0)

      # ------------------------------------------------------------------
      # best_model2_alignment
      # ------------------------------------------------------------------
      def test_best_model2_alignment(self):
          """With uniform alignment probabilities, each target word is aligned
          to its highest-probability source word."""
          # arrange
          sentence_pair = AlignedSent(
              TestIBMModel.__TEST_TRG_SENTENCE, TestIBMModel.__TEST_SRC_SENTENCE
          )
          # None and 'bien' have zero fertility
          ibm_model = self._model_with_tables(self._base_translation_table())

          # act
          a_info = ibm_model.best_model2_alignment(sentence_pair)

          # assert
          self.assertEqual(a_info.alignment[1:], (1, 2, 4))  # 0th element unused
          self.assertEqual(a_info.cepts, [[], [1], [2], [], [3]])

      def test_best_model2_alignment_does_not_change_pegged_alignment(self):
          """A pegged alignment point is kept even when it is not the best one."""
          # arrange
          sentence_pair = AlignedSent(
              TestIBMModel.__TEST_TRG_SENTENCE, TestIBMModel.__TEST_SRC_SENTENCE
          )
          ibm_model = self._model_with_tables(self._base_translation_table())

          # act: force 'love' to be pegged to 'jambon'
          a_info = ibm_model.best_model2_alignment(sentence_pair, 2, 4)

          # assert
          self.assertEqual(a_info.alignment[1:], (1, 4, 4))
          self.assertEqual(a_info.cepts, [[], [1], [], [], [2, 3]])

      def test_best_model2_alignment_handles_fertile_words(self):
          """One source word may be aligned with several target words."""
          # arrange
          sentence_pair = AlignedSent(
              ['i', 'really', ',', 'really', 'love', 'ham'],
              TestIBMModel.__TEST_SRC_SENTENCE,
          )
          # 'bien' produces 2 target words: 'really' and another 'really'
          translation_table = self._base_translation_table()
          translation_table['really'] = {
              "j'": 0, 'aime': 0, 'bien': 0.9, 'jambon': 0.01, None: 0.09
          }
          translation_table[','] = {
              "j'": 0, 'aime': 0, 'bien': 0.3, 'jambon': 0, None: 0.7
          }
          ibm_model = self._model_with_tables(translation_table)

          # act
          a_info = ibm_model.best_model2_alignment(sentence_pair)

          # assert
          self.assertEqual(a_info.alignment[1:], (1, 3, 0, 3, 2, 4))
          self.assertEqual(a_info.cepts, [[3], [1], [5], [2, 4], [6]])

      def test_best_model2_alignment_handles_empty_src_sentence(self):
          """With no source words, every target word is aligned to index 0."""
          # arrange
          sentence_pair = AlignedSent(TestIBMModel.__TEST_TRG_SENTENCE, [])
          ibm_model = IBMModel([])

          # act
          a_info = ibm_model.best_model2_alignment(sentence_pair)

          # assert
          self.assertEqual(a_info.alignment[1:], (0, 0, 0))
          self.assertEqual(a_info.cepts, [[1, 2, 3]])

      def test_best_model2_alignment_handles_empty_trg_sentence(self):
          """With no target words, the alignment is empty and so is every cept."""
          # arrange
          sentence_pair = AlignedSent([], TestIBMModel.__TEST_SRC_SENTENCE)
          ibm_model = IBMModel([])

          # act
          a_info = ibm_model.best_model2_alignment(sentence_pair)

          # assert
          self.assertEqual(a_info.alignment[1:], ())
          self.assertEqual(a_info.cepts, [[], [], [], [], []])

      # ------------------------------------------------------------------
      # neighboring
      # ------------------------------------------------------------------
      def test_neighboring_finds_neighbor_alignments(self):
          """Neighbors are all single moves, all swaps and the original."""
          # arrange
          a_info = self._green_eggs_alignment_info()
          ibm_model = IBMModel([])

          # act
          neighbors = ibm_model.neighboring(a_info)

          # assert
          neighbor_alignments = {neighbor.alignment for neighbor in neighbors}
          expected_alignments = {
              # moves
              (0, 0, 2),
              (0, 1, 2),
              (0, 2, 2),
              (0, 3, 0),
              (0, 3, 1),
              (0, 3, 3),
              # swaps
              (0, 2, 3),
              # original alignment
              (0, 3, 2),
          }
          self.assertEqual(neighbor_alignments, expected_alignments)

      def test_neighboring_sets_neighbor_alignment_info(self):
          """Each neighbor carries cepts that match its own alignment."""
          # arrange
          a_info = self._green_eggs_alignment_info()
          ibm_model = IBMModel([])

          # act
          neighbors = ibm_model.neighboring(a_info)

          # assert: select a few particular alignments
          # Start from None so that a missing neighbor is reported as a test
          # failure below rather than as a NameError.
          moved_alignment = None
          original_alignment = None
          for neighbor in neighbors:
              if neighbor.alignment == (0, 2, 2):
                  moved_alignment = neighbor
              elif neighbor.alignment == (0, 3, 2):
                  # same alignment tuple as the input a_info
                  original_alignment = neighbor

          self.assertIsNotNone(moved_alignment, 'neighbor (0, 2, 2) not found')
          self.assertIsNotNone(original_alignment, 'neighbor (0, 3, 2) not found')
          self.assertEqual(moved_alignment.cepts, [[], [], [1, 2], []])
          self.assertEqual(original_alignment.cepts, [[], [], [2], [1]])

      def test_neighboring_returns_neighbors_with_pegged_alignment(self):
          """Neighbors never alter the alignment of the pegged target position."""
          # arrange
          a_info = self._green_eggs_alignment_info()
          ibm_model = IBMModel([])

          # act: peg 'eggs' to align with 'œufs'
          neighbors = ibm_model.neighboring(a_info, 2)

          # assert
          neighbor_alignments = {neighbor.alignment for neighbor in neighbors}
          expected_alignments = {
              # moves
              (0, 0, 2),
              (0, 1, 2),
              (0, 2, 2),
              # no swaps
              # original alignment
              (0, 3, 2),
          }
          self.assertEqual(neighbor_alignments, expected_alignments)

      # ------------------------------------------------------------------
      # hillclimb / sample
      # ------------------------------------------------------------------
      def test_hillclimb(self):
          """Hill climbing follows the most probable neighbor until none is better.

          ``neighboring`` and ``prob_t_a_given_s`` are replaced with stubs so
          the search path is fully determined by the tables below.
          """
          # arrange
          initial_alignment = AlignmentInfo((0, 3, 2), None, None, None)

          # neighbors offered for each alignment the climb is expected to visit
          neighbors_by_alignment = {
              (0, 3, 2): [(0, 2, 2), (0, 1, 1)],
              (0, 2, 2): [(0, 3, 3), (0, 4, 4)],
          }
          prob_values = {
              (0, 3, 2): 0.5,
              (0, 2, 2): 0.6,
              (0, 1, 1): 0.4,
              (0, 3, 3): 0.6,
              (0, 4, 4): 0.7,
          }

          def neighboring_mock(a, j):
              # j (the pegged position) is accepted but ignored by the stub
              return {
                  AlignmentInfo(alignment, None, None, None)
                  for alignment in neighbors_by_alignment.get(a.alignment, ())
              }

          def prob_t_a_given_s_mock(a):
              return prob_values.get(a.alignment, 0.01)

          ibm_model = IBMModel([])
          ibm_model.neighboring = neighboring_mock
          ibm_model.prob_t_a_given_s = prob_t_a_given_s_mock

          # act
          best_alignment = ibm_model.hillclimb(initial_alignment)

          # assert: hill climbing goes from (0, 3, 2) -> (0, 2, 2) -> (0, 4, 4)
          self.assertEqual(best_alignment.alignment, (0, 4, 4))

      def test_sample(self):
          """Sampling with a constant alignment probability yields 61 samples."""
          # arrange
          sentence_pair = AlignedSent(
              TestIBMModel.__TEST_TRG_SENTENCE, TestIBMModel.__TEST_SRC_SENTENCE
          )
          ibm_model = IBMModel([])
          ibm_model.prob_t_a_given_s = lambda x: 0.001

          # act: the second return value (best alignment) is not checked here
          samples, _ = ibm_model.sample(sentence_pair)

          # assert
          self.assertEqual(len(samples), 61)