phrase_based.py 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196
  1. # -*- coding: utf-8 -*-
  2. # Natural Language Toolkit: Phrase Extraction Algorithm
  3. #
  4. # Copyright (C) 2001-2020 NLTK Project
  5. # Authors: Liling Tan, Fredrik Hedman, Petra Barancikova
  6. # URL: <http://nltk.org/>
  7. # For license information, see LICENSE.TXT
  8. def extract(
  9. f_start,
  10. f_end,
  11. e_start,
  12. e_end,
  13. alignment,
  14. f_aligned,
  15. srctext,
  16. trgtext,
  17. srclen,
  18. trglen,
  19. max_phrase_length,
  20. ):
  21. """
  22. This function checks for alignment point consistency and extracts
  23. phrases using the chunk of consistent phrases.
  24. A phrase pair (e, f ) is consistent with an alignment A if and only if:
  25. (i) No English words in the phrase pair are aligned to words outside it.
  26. ∀e i ∈ e, (e i , f j ) ∈ A ⇒ f j ∈ f
  27. (ii) No Foreign words in the phrase pair are aligned to words outside it.
  28. ∀f j ∈ f , (e i , f j ) ∈ A ⇒ e i ∈ e
  29. (iii) The phrase pair contains at least one alignment point.
  30. ∃e i ∈ e ̄ , f j ∈ f ̄ s.t. (e i , f j ) ∈ A
  31. :type f_start: int
  32. :param f_start: Starting index of the possible foreign language phrases
  33. :type f_end: int
  34. :param f_end: End index of the possible foreign language phrases
  35. :type e_start: int
  36. :param e_start: Starting index of the possible source language phrases
  37. :type e_end: int
  38. :param e_end: End index of the possible source language phrases
  39. :type srctext: list
  40. :param srctext: The source language tokens, a list of string.
  41. :type trgtext: list
  42. :param trgtext: The target language tokens, a list of string.
  43. :type srclen: int
  44. :param srclen: The number of tokens in the source language tokens.
  45. :type trglen: int
  46. :param trglen: The number of tokens in the target language tokens.
  47. """
  48. if f_end < 0: # 0-based indexing.
  49. return {}
  50. # Check if alignment points are consistent.
  51. for e, f in alignment:
  52. if (f_start <= f <= f_end) and (e < e_start or e > e_end):
  53. return {}
  54. # Add phrase pairs (incl. additional unaligned f)
  55. phrases = set()
  56. fs = f_start
  57. while True:
  58. fe = min(f_end, f_start + max_phrase_length - 1)
  59. while True:
  60. # add phrase pair ([e_start, e_end], [fs, fe]) to set E
  61. # Need to +1 in range to include the end-point.
  62. src_phrase = " ".join(srctext[e_start : e_end + 1])
  63. trg_phrase = " ".join(trgtext[fs : fe + 1])
  64. # Include more data for later ordering.
  65. phrases.add(
  66. ((e_start, e_end + 1), (fs, fe + 1), src_phrase, trg_phrase)
  67. )
  68. fe += 1
  69. if fe in f_aligned or fe >= trglen:
  70. break
  71. fs -= 1
  72. if fs in f_aligned or fs < 0:
  73. break
  74. return phrases
  75. def phrase_extraction(srctext, trgtext, alignment, max_phrase_length=0):
  76. """
  77. Phrase extraction algorithm extracts all consistent phrase pairs from
  78. a word-aligned sentence pair.
  79. The idea is to loop over all possible source language (e) phrases and find
  80. the minimal foreign phrase (f) that matches each of them. Matching is done
  81. by identifying all alignment points for the source phrase and finding the
  82. shortest foreign phrase that includes all the foreign counterparts for the
  83. source words.
  84. In short, a phrase alignment has to
  85. (a) contain all alignment points for all covered words
  86. (b) contain at least one alignment point
  87. >>> srctext = "michael assumes that he will stay in the house"
  88. >>> trgtext = "michael geht davon aus , dass er im haus bleibt"
  89. >>> alignment = [(0,0), (1,1), (1,2), (1,3), (2,5), (3,6), (4,9),
  90. ... (5,9), (6,7), (7,7), (8,8)]
  91. >>> phrases = phrase_extraction(srctext, trgtext, alignment)
  92. >>> for i in sorted(phrases):
  93. ... print(i)
  94. ...
  95. ((0, 1), (0, 1), 'michael', 'michael')
  96. ((0, 2), (0, 4), 'michael assumes', 'michael geht davon aus')
  97. ((0, 2), (0, 5), 'michael assumes', 'michael geht davon aus ,')
  98. ((0, 3), (0, 6), 'michael assumes that', 'michael geht davon aus , dass')
  99. ((0, 4), (0, 7), 'michael assumes that he', 'michael geht davon aus , dass er')
  100. ((0, 9), (0, 10), 'michael assumes that he will stay in the house', 'michael geht davon aus , dass er im haus bleibt')
  101. ((1, 2), (1, 4), 'assumes', 'geht davon aus')
  102. ((1, 2), (1, 5), 'assumes', 'geht davon aus ,')
  103. ((1, 3), (1, 6), 'assumes that', 'geht davon aus , dass')
  104. ((1, 4), (1, 7), 'assumes that he', 'geht davon aus , dass er')
  105. ((1, 9), (1, 10), 'assumes that he will stay in the house', 'geht davon aus , dass er im haus bleibt')
  106. ((2, 3), (4, 6), 'that', ', dass')
  107. ((2, 3), (5, 6), 'that', 'dass')
  108. ((2, 4), (4, 7), 'that he', ', dass er')
  109. ((2, 4), (5, 7), 'that he', 'dass er')
  110. ((2, 9), (4, 10), 'that he will stay in the house', ', dass er im haus bleibt')
  111. ((2, 9), (5, 10), 'that he will stay in the house', 'dass er im haus bleibt')
  112. ((3, 4), (6, 7), 'he', 'er')
  113. ((3, 9), (6, 10), 'he will stay in the house', 'er im haus bleibt')
  114. ((4, 6), (9, 10), 'will stay', 'bleibt')
  115. ((4, 9), (7, 10), 'will stay in the house', 'im haus bleibt')
  116. ((6, 8), (7, 8), 'in the', 'im')
  117. ((6, 9), (7, 9), 'in the house', 'im haus')
  118. ((8, 9), (8, 9), 'house', 'haus')
  119. :type srctext: str
  120. :param srctext: The sentence string from the source language.
  121. :type trgtext: str
  122. :param trgtext: The sentence string from the target language.
  123. :type alignment: list(tuple)
  124. :param alignment: The word alignment outputs as list of tuples, where
  125. the first elements of tuples are the source words' indices and
  126. second elements are the target words' indices. This is also the output
  127. format of nltk.translate.ibm1
  128. :rtype: list(tuple)
  129. :return: A list of tuples, each element in a list is a phrase and each
  130. phrase is a tuple made up of (i) its source location, (ii) its target
  131. location, (iii) the source phrase and (iii) the target phrase. The phrase
  132. list of tuples represents all the possible phrases extracted from the
  133. word alignments.
  134. :type max_phrase_length: int
  135. :param max_phrase_length: maximal phrase length, if 0 or not specified
  136. it is set to a length of the longer sentence (srctext or trgtext).
  137. """
  138. srctext = srctext.split() # e
  139. trgtext = trgtext.split() # f
  140. srclen = len(srctext) # len(e)
  141. trglen = len(trgtext) # len(f)
  142. # Keeps an index of which source/target words that are aligned.
  143. f_aligned = [j for _, j in alignment]
  144. max_phrase_length = max_phrase_length or max(srclen, trglen)
  145. # set of phrase pairs BP
  146. bp = set()
  147. for e_start in range(srclen):
  148. max_idx = min(srclen, e_start + max_phrase_length)
  149. for e_end in range(e_start, max_idx):
  150. # // find the minimally matching foreign phrase
  151. # (f start , f end ) = ( length(f), 0 )
  152. # f_start ∈ [0, len(f) - 1]; f_end ∈ [0, len(f) - 1]
  153. f_start, f_end = trglen - 1, -1 # 0-based indexing
  154. for e, f in alignment:
  155. if e_start <= e <= e_end:
  156. f_start = min(f, f_start)
  157. f_end = max(f, f_end)
  158. # add extract (f start , f end , e start , e end ) to set BP
  159. phrases = extract(
  160. f_start,
  161. f_end,
  162. e_start,
  163. e_end,
  164. alignment,
  165. f_aligned,
  166. srctext,
  167. trgtext,
  168. srclen,
  169. trglen,
  170. max_phrase_length,
  171. )
  172. if phrases:
  173. bp.update(phrases)
  174. return bp