# -*- coding: utf-8 -*-
# Natural Language Toolkit: IBM Model Core
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Tah Wei Hoon <hoon.tw@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
- """
- Common methods and classes for all IBM models. See ``IBMModel1``,
- ``IBMModel2``, ``IBMModel3``, ``IBMModel4``, and ``IBMModel5``
- for specific implementations.
- The IBM models are a series of generative models that learn lexical
- translation probabilities, p(target language word|source language word),
- given a sentence-aligned parallel corpus.
- The models increase in sophistication from model 1 to 5. Typically, the
- output of lower models is used to seed the higher models. All models
- use the Expectation-Maximization (EM) algorithm to learn various
- probability tables.
- Words in a sentence are one-indexed. The first word of a sentence has
- position 1, not 0. Index 0 is reserved in the source sentence for the
- NULL token. The concept of position does not apply to NULL, but it is
- indexed at 0 by convention.
- Each target word is aligned to exactly one source word or the NULL
- token.
- References:
- Philipp Koehn. 2010. Statistical Machine Translation.
- Cambridge University Press, New York.
- Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
- Robert L. Mercer. 1993. The Mathematics of Statistical Machine
- Translation: Parameter Estimation. Computational Linguistics, 19 (2),
- 263-311.
- """

from bisect import insort_left
from collections import defaultdict
from copy import deepcopy
from math import ceil
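
# Illustrative sketch of the indexing convention (not part of the original
# module; ``AlignmentInfo`` is defined below). Source index 0 holds the
# NULL token and target index 0 holds a dummy, so real words start at 1:
#
#     >>> info = AlignmentInfo(
#     ...     (0, 1, 2),                    # alignment[j] = i
#     ...     (None, "das", "haus"),        # source sentence, NULL at index 0
#     ...     ("UNUSED", "the", "house"),   # target sentence, dummy at index 0
#     ...     [[], [1], [2]],               # cepts[i] = target positions aligned to i
#     ... )
#     >>> info.zero_indexed_alignment()
#     [(0, 0), (1, 1)]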


def longest_target_sentence_length(sentence_aligned_corpus):
    """
    :param sentence_aligned_corpus: Parallel corpus under consideration
    :type sentence_aligned_corpus: list(AlignedSent)
    :return: Number of words in the longest target language sentence
        of ``sentence_aligned_corpus``
    """
    max_m = 0
    for aligned_sentence in sentence_aligned_corpus:
        m = len(aligned_sentence.words)
        max_m = max(m, max_m)
    return max_m
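
# Illustrative usage sketch (not part of the original module). In
# ``AlignedSent``, ``words`` holds the target sentence and ``mots`` the
# source sentence:
#
#     >>> from nltk.translate import AlignedSent
#     >>> corpus = [
#     ...     AlignedSent(["klein", "ist", "das", "haus"],
#     ...                 ["the", "house", "is", "small"]),
#     ...     AlignedSent(["das", "haus"], ["the", "house"]),
#     ... ]
#     >>> longest_target_sentence_length(corpus)
#     4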


class IBMModel(object):
    """
    Abstract base class for all IBM models
    """

    # Avoid division by zero and precision errors by imposing a minimum
    # value for probabilities. Note that this approach is theoretically
    # incorrect, since it may create probabilities that sum to more
    # than 1. In practice, the contribution of probabilities with MIN_PROB
    # is tiny enough that the value of MIN_PROB can be treated as zero.
    MIN_PROB = 1.0e-12  # GIZA++ is more liberal and uses 1.0e-7

    def __init__(self, sentence_aligned_corpus):
        self.init_vocab(sentence_aligned_corpus)
        self.reset_probabilities()

    def reset_probabilities(self):
        self.translation_table = defaultdict(
            lambda: defaultdict(lambda: IBMModel.MIN_PROB)
        )
        """
        dict[str][str]: float. Probability(target word | source word).
        Values accessed as ``translation_table[target_word][source_word]``.
        """

        self.alignment_table = defaultdict(
            lambda: defaultdict(
                lambda: defaultdict(lambda: defaultdict(lambda: IBMModel.MIN_PROB))
            )
        )
        """
        dict[int][int][int][int]: float. Probability(i | j,l,m).
        Values accessed as ``alignment_table[i][j][l][m]``.
        Used in model 2 and hill climbing in models 3 and above.
        """

        self.fertility_table = defaultdict(lambda: defaultdict(lambda: self.MIN_PROB))
        """
        dict[int][str]: float. Probability(fertility | source word).
        Values accessed as ``fertility_table[fertility][source_word]``.
        Used in model 3 and higher.
        """

        self.p1 = 0.5
        """
        Probability that a generated word requires another target word
        that is aligned to NULL.
        Used in model 3 and higher.
        """

    def set_uniform_probabilities(self, sentence_aligned_corpus):
        """
        Initialize probability tables to a uniform distribution

        Derived classes should implement this accordingly.
        """
        pass

    def init_vocab(self, sentence_aligned_corpus):
        src_vocab = set()
        trg_vocab = set()
        for aligned_sentence in sentence_aligned_corpus:
            trg_vocab.update(aligned_sentence.words)
            src_vocab.update(aligned_sentence.mots)
        # Add the NULL token
        src_vocab.add(None)

        self.src_vocab = src_vocab
        """
        set(str): All source language words used in training
        """

        self.trg_vocab = trg_vocab
        """
        set(str): All target language words used in training
        """

    def sample(self, sentence_pair):
        """
        Sample the most probable alignments from the entire alignment
        space

        First, determine the best alignment according to IBM Model 2.
        With this initial alignment, use hill climbing to determine the
        best alignment according to a higher IBM Model. Add this
        alignment and its neighbors to the sample set. Repeat this
        process with other initial alignments obtained by pegging an
        alignment point.

        Hill climbing may get stuck in a local maximum, hence the
        pegging and trying out of different alignments.

        :param sentence_pair: Source and target language sentence pair
            to generate a sample of alignments from
        :type sentence_pair: AlignedSent

        :return: A set of best alignments represented by their ``AlignmentInfo``
            and the best alignment of the set for convenience
        :rtype: set(AlignmentInfo), AlignmentInfo
        """
        sampled_alignments = set()
        l = len(sentence_pair.mots)
        m = len(sentence_pair.words)

        # Start from the best model 2 alignment
        initial_alignment = self.best_model2_alignment(sentence_pair)
        potential_alignment = self.hillclimb(initial_alignment)
        sampled_alignments.update(self.neighboring(potential_alignment))
        best_alignment = potential_alignment

        # Start from other model 2 alignments,
        # with the constraint that j is aligned (pegged) to i
        for j in range(1, m + 1):
            for i in range(0, l + 1):
                initial_alignment = self.best_model2_alignment(sentence_pair, j, i)
                potential_alignment = self.hillclimb(initial_alignment, j)
                neighbors = self.neighboring(potential_alignment, j)
                sampled_alignments.update(neighbors)
                if potential_alignment.score > best_alignment.score:
                    best_alignment = potential_alignment

        return sampled_alignments, best_alignment
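
    # Illustrative sketch (not part of the original module). ``sample`` is
    # meant to be called on a concrete subclass such as ``IBMModel3``,
    # since it relies on an overridden ``prob_t_a_given_s``:
    #
    #     >>> sampled, best = model3.sample(corpus[0])  # model3: hypothetical IBMModel3
    #     >>> best in sampled  # the climbed alignment is its own zero-change neighbor
    #     True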

    def best_model2_alignment(self, sentence_pair, j_pegged=None, i_pegged=0):
        """
        Finds the best alignment according to IBM Model 2

        Used as a starting point for hill climbing in Models 3 and
        above, because it is easier to compute than the best alignments
        in higher models

        :param sentence_pair: Source and target language sentence pair
            to be word-aligned
        :type sentence_pair: AlignedSent
        :param j_pegged: If specified, the alignment point of j_pegged
            will be fixed to i_pegged
        :type j_pegged: int
        :param i_pegged: Alignment point to j_pegged
        :type i_pegged: int
        """
        src_sentence = [None] + sentence_pair.mots
        trg_sentence = ["UNUSED"] + sentence_pair.words  # 1-indexed

        l = len(src_sentence) - 1  # exclude NULL
        m = len(trg_sentence) - 1

        alignment = [0] * (m + 1)  # init all alignments to NULL
        cepts = [[] for i in range(l + 1)]  # init all cepts to empty list

        for j in range(1, m + 1):
            if j == j_pegged:
                # use the pegged alignment instead of searching for best one
                best_i = i_pegged
            else:
                best_i = 0
                max_alignment_prob = IBMModel.MIN_PROB
                t = trg_sentence[j]

                for i in range(0, l + 1):
                    s = src_sentence[i]
                    alignment_prob = (
                        self.translation_table[t][s] * self.alignment_table[i][j][l][m]
                    )

                    if alignment_prob >= max_alignment_prob:
                        max_alignment_prob = alignment_prob
                        best_i = i

            alignment[j] = best_i
            cepts[best_i].append(j)

        return AlignmentInfo(
            tuple(alignment), tuple(src_sentence), tuple(trg_sentence), cepts
        )
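
    # Pegging sketch (not part of the original module): fixing target
    # position 1 to source position 2 while the remaining points are
    # chosen greedily. ``model2`` and ``pair`` are hypothetical:
    #
    #     >>> info = model2.best_model2_alignment(pair, j_pegged=1, i_pegged=2)
    #     >>> info.alignment[1]
    #     2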

    def hillclimb(self, alignment_info, j_pegged=None):
        """
        Starting from the alignment in ``alignment_info``, look at
        neighboring alignments iteratively for the best one

        There is no guarantee that the best alignment in the alignment
        space will be found, because the algorithm might be stuck in a
        local maximum.

        :param j_pegged: If specified, the search will be constrained to
            alignments where ``j_pegged`` remains unchanged
        :type j_pegged: int

        :return: The best alignment found from hill climbing
        :rtype: AlignmentInfo
        """
        alignment = alignment_info  # alias with shorter name
        max_probability = self.prob_t_a_given_s(alignment)

        while True:
            old_alignment = alignment
            for neighbor_alignment in self.neighboring(alignment, j_pegged):
                neighbor_probability = self.prob_t_a_given_s(neighbor_alignment)
                if neighbor_probability > max_probability:
                    alignment = neighbor_alignment
                    max_probability = neighbor_probability

            if alignment == old_alignment:
                # Until there are no better alignments
                break

        alignment.score = max_probability
        return alignment
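
    # Climb invariant sketch (not part of the original module): each pass
    # keeps the best strict improvement among all neighbors and stops on
    # the first pass with no change, so the score never decreases.
    # ``model3`` is hypothetical:
    #
    #     >>> start = model3.best_model2_alignment(corpus[0])
    #     >>> climbed = model3.hillclimb(start)
    #     >>> climbed.score >= model3.prob_t_a_given_s(start)
    #     True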

    def neighboring(self, alignment_info, j_pegged=None):
        """
        Determine the neighbors of ``alignment_info``, obtained by
        moving or swapping one alignment point

        :param j_pegged: If specified, neighbors that have a different
            alignment point from j_pegged will not be considered
        :type j_pegged: int

        :return: A set of neighboring alignments represented by their
            ``AlignmentInfo``
        :rtype: set(AlignmentInfo)
        """
        neighbors = set()

        l = len(alignment_info.src_sentence) - 1  # exclude NULL
        m = len(alignment_info.trg_sentence) - 1
        original_alignment = alignment_info.alignment
        original_cepts = alignment_info.cepts

        for j in range(1, m + 1):
            if j != j_pegged:
                # Add alignments that differ by one alignment point
                for i in range(0, l + 1):
                    new_alignment = list(original_alignment)
                    new_cepts = deepcopy(original_cepts)
                    old_i = original_alignment[j]

                    # update alignment
                    new_alignment[j] = i

                    # update cepts
                    insort_left(new_cepts[i], j)
                    new_cepts[old_i].remove(j)

                    new_alignment_info = AlignmentInfo(
                        tuple(new_alignment),
                        alignment_info.src_sentence,
                        alignment_info.trg_sentence,
                        new_cepts,
                    )
                    neighbors.add(new_alignment_info)

        for j in range(1, m + 1):
            if j != j_pegged:
                # Add alignments that have two alignment points swapped
                for other_j in range(1, m + 1):
                    if other_j != j_pegged and other_j != j:
                        new_alignment = list(original_alignment)
                        new_cepts = deepcopy(original_cepts)
                        other_i = original_alignment[other_j]
                        i = original_alignment[j]

                        # update alignments
                        new_alignment[j] = other_i
                        new_alignment[other_j] = i

                        # update cepts
                        new_cepts[other_i].remove(other_j)
                        insort_left(new_cepts[other_i], j)

                        new_cepts[i].remove(j)
                        insort_left(new_cepts[i], other_j)

                        new_alignment_info = AlignmentInfo(
                            tuple(new_alignment),
                            alignment_info.src_sentence,
                            alignment_info.trg_sentence,
                            new_cepts,
                        )
                        neighbors.add(new_alignment_info)

        return neighbors
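
    # Neighborhood size sketch (not part of the original module): with no
    # pegging, the move loop generates m * (l + 1) candidates (including
    # the unchanged alignment, when i equals the current point) and the
    # swap loop m * (m - 1); duplicates collapse because ``neighbors`` is
    # a set keyed on the alignment tuple.
    #
    #     >>> info in model3.neighboring(info)  # zero-change move is included
    #     True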

    def maximize_lexical_translation_probabilities(self, counts):
        for t, src_words in counts.t_given_s.items():
            for s in src_words:
                estimate = counts.t_given_s[t][s] / counts.any_t_given_s[s]
                self.translation_table[t][s] = max(estimate, IBMModel.MIN_PROB)

    def maximize_fertility_probabilities(self, counts):
        for phi, src_words in counts.fertility.items():
            for s in src_words:
                estimate = counts.fertility[phi][s] / counts.fertility_for_any_phi[s]
                self.fertility_table[phi][s] = max(estimate, IBMModel.MIN_PROB)

    def maximize_null_generation_probabilities(self, counts):
        p1_estimate = counts.p1 / (counts.p1 + counts.p0)
        p1_estimate = max(p1_estimate, IBMModel.MIN_PROB)
        # Clip p1 if it is too large, because p0 = 1 - p1 should not be
        # smaller than MIN_PROB
        self.p1 = min(p1_estimate, 1 - IBMModel.MIN_PROB)
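
    # Worked M-step example (not part of the original module): with
    # expected counts p1 = 3.0 and p0 = 7.0, the estimate is
    # 3.0 / (3.0 + 7.0) = 0.3, clipped into [MIN_PROB, 1 - MIN_PROB]:
    #
    #     >>> counts = Counts()
    #     >>> counts.p1, counts.p0 = 3.0, 7.0
    #     >>> model.maximize_null_generation_probabilities(counts)
    #     >>> round(model.p1, 3)
    #     0.3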

    def prob_of_alignments(self, alignments):
        probability = 0
        for alignment_info in alignments:
            probability += self.prob_t_a_given_s(alignment_info)
        return probability

    def prob_t_a_given_s(self, alignment_info):
        """
        Probability of target sentence and an alignment given the
        source sentence

        All required information is assumed to be in ``alignment_info``
        and self.

        Derived classes should override this method
        """
        return 0.0


class AlignmentInfo(object):
    """
    Helper data object for training IBM Models 3 and up

    Read-only. For a source sentence and its counterpart in the target
    language, this class holds information about the sentence pair's
    alignment, cepts, and fertility.

    Warning: Alignments are one-indexed here, in contrast to
    nltk.translate.Alignment and AlignedSent, which are zero-indexed.
    This class is not meant to be used outside of IBM models.
    """

    def __init__(self, alignment, src_sentence, trg_sentence, cepts):
        if not isinstance(alignment, tuple):
            raise TypeError(
                "The alignment must be a tuple because it is used "
                "to uniquely identify AlignmentInfo objects."
            )

        self.alignment = alignment
        """
        tuple(int): Alignment function. ``alignment[j]`` is the position
        in the source sentence that is aligned to the position j in the
        target sentence.
        """

        self.src_sentence = src_sentence
        """
        tuple(str): Source sentence referred to by this object.
        Should include NULL token (None) in index 0.
        """

        self.trg_sentence = trg_sentence
        """
        tuple(str): Target sentence referred to by this object.
        Should have a dummy element in index 0 so that the first word
        starts from index 1.
        """

        self.cepts = cepts
        """
        list(list(int)): The positions of the target words, in
        ascending order, aligned to a source word position. For example,
        cepts[4] = (2, 3, 7) means that words in positions 2, 3 and 7
        of the target sentence are aligned to the word in position 4 of
        the source sentence
        """

        self.score = None
        """
        float: Optional. Probability of alignment, as defined by the
        IBM model that assesses this alignment
        """

    def fertility_of_i(self, i):
        """
        Fertility of word in position ``i`` of the source sentence
        """
        return len(self.cepts[i])

    def is_head_word(self, j):
        """
        :return: Whether the word in position ``j`` of the target
            sentence is a head word
        """
        i = self.alignment[j]
        return self.cepts[i][0] == j

    def center_of_cept(self, i):
        """
        :return: The ceiling of the average positions of the words in
            the tablet of cept ``i``, or 0 if ``i`` is None
        """
        if i is None:
            return 0
        average_position = sum(self.cepts[i]) / len(self.cepts[i])
        return int(ceil(average_position))
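
    # Worked example (not part of the original module), reusing the
    # ``cepts`` example from the class docstring: if cepts[4] = [2, 3, 7],
    # then center_of_cept(4) = ceil((2 + 3 + 7) / 3) = ceil(4.0) = 4.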

    def previous_cept(self, j):
        """
        :return: The previous cept of ``j``, or None if ``j`` belongs to
            the first cept
        """
        i = self.alignment[j]
        if i == 0:
            raise ValueError(
                "Words aligned to NULL cannot have a previous "
                "cept because NULL has no position"
            )
        previous_cept = i - 1
        while previous_cept > 0 and self.fertility_of_i(previous_cept) == 0:
            previous_cept -= 1

        if previous_cept <= 0:
            previous_cept = None
        return previous_cept

    def previous_in_tablet(self, j):
        """
        :return: The position of the previous word that is in the same
            tablet as ``j``, or None if ``j`` is the first word of the
            tablet
        """
        i = self.alignment[j]
        tablet_position = self.cepts[i].index(j)
        if tablet_position == 0:
            return None
        return self.cepts[i][tablet_position - 1]

    def zero_indexed_alignment(self):
        """
        :return: Zero-indexed alignment, suitable for use in external
            ``nltk.translate`` modules like ``nltk.translate.Alignment``
        :rtype: list(tuple)
        """
        zero_indexed_alignment = []
        for j in range(1, len(self.trg_sentence)):
            i = self.alignment[j] - 1
            if i < 0:
                i = None  # alignment to NULL token
            zero_indexed_alignment.append((j - 1, i))
        return zero_indexed_alignment
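
    # Worked example (not part of the original module): for a two-word
    # target sentence with one-indexed alignment (0, 2, 0), i.e. target
    # word 1 aligned to source word 2 and target word 2 aligned to NULL,
    # ``zero_indexed_alignment()`` returns [(0, 1), (1, None)].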

    def __eq__(self, other):
        return self.alignment == other.alignment

    def __ne__(self, other):
        return not self == other

    def __hash__(self):
        return hash(self.alignment)


class Counts(object):
    """
    Data object to store counts of various parameters during training
    """

    def __init__(self):
        self.t_given_s = defaultdict(lambda: defaultdict(lambda: 0.0))
        self.any_t_given_s = defaultdict(lambda: 0.0)
        self.p0 = 0.0
        self.p1 = 0.0
        self.fertility = defaultdict(lambda: defaultdict(lambda: 0.0))
        self.fertility_for_any_phi = defaultdict(lambda: 0.0)

    def update_lexical_translation(self, count, alignment_info, j):
        i = alignment_info.alignment[j]
        t = alignment_info.trg_sentence[j]
        s = alignment_info.src_sentence[i]
        self.t_given_s[t][s] += count
        self.any_t_given_s[s] += count

    def update_null_generation(self, count, alignment_info):
        m = len(alignment_info.trg_sentence) - 1
        fertility_of_null = alignment_info.fertility_of_i(0)
        self.p1 += fertility_of_null * count
        self.p0 += (m - 2 * fertility_of_null) * count
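
    # Count sketch (not part of the original module): in Model 3's
    # generative story, phi0 of the (m - phi0) real-word slots insert a
    # NULL-aligned word (probability p1 each), so phi0 trials count
    # towards p1 and the remaining (m - phi0) - phi0 = m - 2 * phi0
    # towards p0, each weighted by the alignment's posterior ``count``.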

    def update_fertility(self, count, alignment_info):
        for i in range(0, len(alignment_info.src_sentence)):
            s = alignment_info.src_sentence[i]
            phi = alignment_info.fertility_of_i(i)
            self.fertility[phi][s] += count
            self.fertility_for_any_phi[s] += count
|