# -*- coding: utf-8 -*-
# Natural Language Toolkit: Gale-Church Aligner
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Torsten Marek <marek@ifi.uzh.ch>
# Contributor: Cassidy Laidlaw, Liling Tan
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

"""
A port of the Gale-Church Aligner.

Gale & Church (1993), A Program for Aligning Sentences in Bilingual Corpora.
http://aclweb.org/anthology/J93-1004.pdf
"""
import math

try:
    # Prefer SciPy's survival function when available; it is more accurate
    # and does not underflow for large x.
    #
    # BUG FIX: the historical ``from norm import logsf as norm_logsf`` always
    # raised ImportError (there is no top-level ``norm`` module), so the SciPy
    # code path was dead and the pure-Python fallback was always used.  Bind
    # ``scipy.stats.norm.logsf`` directly instead.
    from scipy.stats import norm

    norm_logsf = norm.logsf
except ImportError:
    # Pure-Python fallback based on a rational approximation of the
    # complementary error function (Numerical Recipes style; absolute
    # error on the order of 1e-7).

    def erfcc(x):
        """Complementary error function."""
        z = abs(x)
        t = 1 / (1 + 0.5 * z)
        r = t * math.exp(
            -z * z
            - 1.26551223
            + t
            * (
                1.00002368
                + t
                * (
                    0.37409196
                    + t
                    * (
                        0.09678418
                        + t
                        * (
                            -0.18628806
                            + t
                            * (
                                0.27886807
                                + t
                                * (
                                    -1.13520398
                                    + t
                                    * (1.48851587 + t * (-0.82215223 + t * 0.17087277))
                                )
                            )
                        )
                    )
                )
            )
        )
        if x >= 0.0:
            return r
        else:
            # erfc(-x) = 2 - erfc(x)
            return 2.0 - r

    def norm_cdf(x):
        """Return the area under the normal distribution from M{-∞..x}."""
        return 1 - 0.5 * erfcc(x / math.sqrt(2))

    def norm_logsf(x):
        """Return log(1 - cdf(x)), the log of the normal survival function."""
        try:
            return math.log(1 - norm_cdf(x))
        except ValueError:
            # 1 - cdf(x) underflowed to zero; the log survival is -infinity.
            return float("-inf")


LOG2 = math.log(2)
class LanguageIndependent(object):
    # These are the language-independent probabilities and parameters
    # given in Gale & Church (1993), Table 5.
    # For the computation, l_1 is always the language with less characters.

    # Prior probability of each alignment type (source sentences, target
    # sentences): 1:1 dominates, 1:0 / 0:1 (deletion/insertion) are rare,
    # 2:1 / 1:2 (merge/split) uncommon, 2:2 rarest.
    PRIORS = {
        (1, 0): 0.0099,
        (0, 1): 0.0099,
        (1, 1): 0.89,
        (2, 1): 0.089,
        (1, 2): 0.089,
        (2, 2): 0.011,
    }

    # Expected number of target characters per source character (c in the paper).
    AVERAGE_CHARACTERS = 1
    # Variance of target characters per source character (s^2 in the paper).
    VARIANCE_CHARACTERS = 6.8
def trace(backlinks, source_sents_lens, target_sents_lens):
    """
    Traverse the alignment cost from the tracebacks and retrieves
    appropriate sentence pairs.

    :param backlinks: A dictionary where the key is an alignment point and
        the value is the alignment type used to reach it (a key of
        ``LanguageIndependent.PRIORS``), or ``None`` when no alignment was
        recorded for that point.
    :type backlinks: dict
    :param source_sents_lens: A list of source sentences' lengths
    :type source_sents_lens: list(int)
    :param target_sents_lens: A list of target sentences' lengths
    :type target_sents_lens: list(int)
    :return: The aligned (source index, target index) pairs, in document order.
    :rtype: list(tuple(int, int))
    """
    links = []
    # Start at the bottom-right corner of the DP table and walk back to (0, 0).
    position = (len(source_sents_lens), len(target_sents_lens))
    while position != (0, 0) and all(p >= 0 for p in position):
        link = backlinks[position]
        if link is None:
            # No alignment recorded here (e.g. a 0:0 start cell); step
            # diagonally and keep tracing.
            position = (position[0] - 1, position[1] - 1)
            continue
        s, t = link
        # Emit every (source, target) sentence pair covered by this s:t link.
        for i in range(s):
            for j in range(t):
                links.append((position[0] - i - 1, position[1] - j - 1))
        position = (position[0] - s, position[1] - t)
    # Pairs were collected from the end backwards; restore document order.
    return links[::-1]
def align_log_prob(i, j, source_sents, target_sents, alignment, params):
    """Returns the log probability of the two sentences C{source_sents[i]}, C{target_sents[j]}
    being aligned with a specific C{alignment}.

    @param i: The offset of the source sentence.
    @param j: The offset of the target sentence.
    @param source_sents: The list of source sentence lengths.
    @param target_sents: The list of target sentence lengths.
    @param alignment: The alignment type, a tuple of two integers.
    @param params: The sentence alignment parameters.
    @returns: The log probability of a specific alignment between the two sentences, given the parameters.
    """
    # Total character length of the group of source sentences ending at i-1
    # that this alignment type covers, and likewise for the target side.
    l_s = 0
    for k in range(alignment[0]):
        l_s += source_sents[i - k - 1]
    l_t = 0
    for k in range(alignment[1]):
        l_t += target_sents[j - k - 1]
    try:
        # actually, the paper says l_s * params.VARIANCE_CHARACTERS, this is based on the C
        # reference implementation. With l_s in the denominator, insertions are impossible.
        mean = (l_s + l_t / params.AVERAGE_CHARACTERS) / 2
        delta = (l_s * params.AVERAGE_CHARACTERS - l_t) / math.sqrt(
            mean * params.VARIANCE_CHARACTERS
        )
    except ZeroDivisionError:
        # Both groups are empty: the normalizing mean is zero.
        return float("-inf")
    # Negated log-likelihood: smaller values mean a more probable alignment,
    # so the caller minimizes this as a cost.
    return -(LOG2 + norm_logsf(abs(delta)) + math.log(params.PRIORS[alignment]))
def align_blocks(source_sents_lens, target_sents_lens, params=LanguageIndependent):
    """Return the sentence alignment of two text blocks (usually paragraphs).

    >>> align_blocks([5,5,5], [7,7,7])
    [(0, 0), (1, 1), (2, 2)]
    >>> align_blocks([10,5,5], [12,20])
    [(0, 0), (1, 1), (2, 1)]
    >>> align_blocks([12,20], [10,5,5])
    [(0, 0), (1, 1), (1, 2)]
    >>> align_blocks([10,2,10,10,2,10], [12,3,20,3,12])
    [(0, 0), (1, 1), (2, 2), (3, 2), (4, 3), (5, 4)]

    @param source_sents_lens: The list of source sentence lengths.
    @param target_sents_lens: The list of target sentence lengths.
    @param params: the sentence alignment parameters.
    @return: The sentence alignments, a list of index pairs.
    """
    alignment_types = list(params.PRIORS.keys())

    # Dynamic programming over the (source, target) prefix table.  D is a
    # rolling window of table rows; there are always three rows in the
    # history (with the last of them being filled), which is enough because
    # no alignment type looks back more than two source sentences.
    D = [[]]

    # Maps each table cell (i, j) to the alignment type that produced its
    # minimum cost, or None when no predecessor was reachable.
    backlinks = {}

    for i in range(len(source_sents_lens) + 1):
        for j in range(len(target_sents_lens) + 1):
            min_dist = float("inf")
            min_align = None
            for a in alignment_types:
                # Index the predecessor row from the end of the rolling
                # window: -1 is the current row, so -1 - a[0] is a[0] rows up.
                prev_i = -1 - a[0]
                prev_j = j - a[1]
                if prev_i < -len(D) or prev_j < 0:
                    # Predecessor cell lies outside the table (or has been
                    # rolled out of the window because it is unreachable).
                    continue
                # Cost so far plus the (negated log-probability) cost of
                # aligning this group of sentences with type `a`.
                p = D[prev_i][prev_j] + align_log_prob(
                    i, j, source_sents_lens, target_sents_lens, a, params
                )
                if p < min_dist:
                    min_dist = p
                    min_align = a
            if min_dist == float("inf"):
                # Start cell (0, 0): no predecessor, zero cost.
                min_dist = 0
            backlinks[(i, j)] = min_align
            D[-1].append(min_dist)
        # Slide the window: drop the oldest row once three are filled, then
        # open an empty row for the next value of i.
        if len(D) > 2:
            D.pop(0)
        D.append([])
    return trace(backlinks, source_sents_lens, target_sents_lens)
def align_texts(source_blocks, target_blocks, params=LanguageIndependent):
    """Creates the sentence alignment of two texts.

    Texts can consist of several blocks. Block boundaries cannot be crossed by
    sentence alignment links.

    Each block consists of a list that contains the lengths (in characters) of
    the sentences in this block.

    @param source_blocks: The list of blocks in the source text.
    @param target_blocks: The list of blocks in the target text.
    @param params: the sentence alignment parameters.
    @returns: A list of sentence alignment lists
    @raises ValueError: if the two texts have different numbers of blocks.
    """
    if len(source_blocks) != len(target_blocks):
        raise ValueError(
            "Source and target texts do not have the same number of blocks."
        )
    # Align each pair of corresponding blocks independently.
    alignments = []
    for source_block, target_block in zip(source_blocks, target_blocks):
        alignments.append(align_blocks(source_block, target_block, params))
    return alignments
# File I/O functions; may belong in a corpus reader
def split_at(it, split_value):
    """Splits an iterator C{it} at values of C{split_value}.

    Each instance of C{split_value} is swallowed. The iterator produces
    subiterators which need to be consumed fully before the next subiterator
    can be used.
    """
    # Accept any iterable, not only objects that are already iterators.
    it = iter(it)

    def _chunk_iterator(first):
        # Yield values starting at `first` until the delimiter (swallowed)
        # or the end of the input.
        v = first
        while v != split_value:
            yield v
            try:
                # BUG FIX: was ``it.next()``, the Python 2 spelling, which
                # raises AttributeError on Python 3.
                v = next(it)
            except StopIteration:
                # Input exhausted without a trailing delimiter; end the chunk.
                # (PEP 479: letting StopIteration escape a generator would
                # raise RuntimeError on Python 3.7+.)
                return

    while True:
        try:
            head = next(it)
        except StopIteration:
            # No more chunks; terminate the outer generator explicitly.
            return
        yield _chunk_iterator(head)
def parse_token_stream(stream, soft_delimiter, hard_delimiter):
    """Parses a stream of tokens and splits it into sentences (using C{soft_delimiter} tokens)
    and blocks (using C{hard_delimiter} tokens) for use with the L{align_texts} function.
    """
    blocks = []
    # Outer split: hard delimiters separate blocks (paragraphs).
    for block_it in split_at(stream, hard_delimiter):
        sentence_lengths = []
        # Inner split: soft delimiters separate sentences within a block.
        for sentence_it in split_at(block_it, soft_delimiter):
            # A sentence's length is the total character count of its tokens.
            total = 0
            for token in sentence_it:
                total += len(token)
            sentence_lengths.append(total)
        blocks.append(sentence_lengths)
    return blocks
|