Spaces:
Sleeping
Sleeping
# Natural Language Toolkit: IBM Model 3 | |
# | |
# Copyright (C) 2001-2013 NLTK Project | |
# Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim | |
# URL: <https://www.nltk.org/> | |
# For license information, see LICENSE.TXT | |
""" | |
Translation model that considers how a word can be aligned to | |
multiple words in another language. | |
IBM Model 3 improves on Model 2 by directly modeling the phenomenon | |
where a word in one language may be translated into zero or more words | |
in another. This is expressed by the fertility probability, | |
n(phi | source word). | |
If a source word translates into more than one word, it is possible to | |
generate sentences that have the same alignment in multiple ways. This | |
is modeled by a distortion step. The distortion probability, d(j|i,l,m), | |
predicts a target word position, given its aligned source word's | |
position. The distortion probability replaces the alignment probability | |
of Model 2. | |
The fertility probability is not applicable for NULL. Target words that | |
align to NULL are assumed to be distributed uniformly in the target | |
sentence. The existence of these words is modeled by p1, the probability | |
that a target word produced by a real source word requires another | |
target word that is produced by NULL. | |
The EM algorithm used in Model 3 is: | |
:E step: In the training data, collect counts, weighted by prior | |
probabilities. | |
- (a) count how many times a source language word is translated | |
into a target language word | |
- (b) count how many times a particular position in the target | |
sentence is aligned to a particular position in the source | |
sentence | |
- (c) count how many times a source word is aligned to phi number | |
of target words | |
- (d) count how many times NULL is aligned to a target word | |
:M step: Estimate new probabilities based on the counts from the E step | |
Because there are too many possible alignments, only the most probable | |
ones are considered. First, the best alignment is determined using prior | |
probabilities. Then, a hill climbing approach is used to find other good | |
candidates. | |
Notations | |
--------- | |
:i: Position in the source sentence | |
Valid values are 0 (for NULL), 1, 2, ..., length of source sentence | |
:j: Position in the target sentence | |
Valid values are 1, 2, ..., length of target sentence | |
:l: Number of words in the source sentence, excluding NULL | |
:m: Number of words in the target sentence | |
:s: A word in the source language | |
:t: A word in the target language | |
:phi: Fertility, the number of target words produced by a source word | |
:p1: Probability that a target word produced by a source word is | |
accompanied by another target word that is aligned to NULL | |
:p0: 1 - p1 | |
References | |
---------- | |
Philipp Koehn. 2010. Statistical Machine Translation. | |
Cambridge University Press, New York. | |
Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and | |
Robert L. Mercer. 1993. The Mathematics of Statistical Machine | |
Translation: Parameter Estimation. Computational Linguistics, 19 (2), | |
263-311. | |
""" | |
import warnings | |
from collections import defaultdict | |
from math import factorial | |
from nltk.translate import AlignedSent, Alignment, IBMModel, IBMModel2 | |
from nltk.translate.ibm_model import Counts | |
class IBMModel3(IBMModel): | |
""" | |
Translation model that considers how a word can be aligned to | |
multiple words in another language | |
>>> bitext = [] | |
>>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small'])) | |
>>> bitext.append(AlignedSent(['das', 'haus', 'war', 'ja', 'groß'], ['the', 'house', 'was', 'big'])) | |
>>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small'])) | |
>>> bitext.append(AlignedSent(['ein', 'haus', 'ist', 'klein'], ['a', 'house', 'is', 'small'])) | |
>>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house'])) | |
>>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book'])) | |
>>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book'])) | |
>>> bitext.append(AlignedSent(['ich', 'fasse', 'das', 'buch', 'zusammen'], ['i', 'summarize', 'the', 'book'])) | |
>>> bitext.append(AlignedSent(['fasse', 'zusammen'], ['summarize'])) | |
>>> ibm3 = IBMModel3(bitext, 5) | |
>>> print(round(ibm3.translation_table['buch']['book'], 3)) | |
1.0 | |
>>> print(round(ibm3.translation_table['das']['book'], 3)) | |
0.0 | |
>>> print(round(ibm3.translation_table['ja'][None], 3)) | |
1.0 | |
>>> print(round(ibm3.distortion_table[1][1][2][2], 3)) | |
1.0 | |
>>> print(round(ibm3.distortion_table[1][2][2][2], 3)) | |
0.0 | |
>>> print(round(ibm3.distortion_table[2][2][4][5], 3)) | |
0.75 | |
>>> print(round(ibm3.fertility_table[2]['summarize'], 3)) | |
1.0 | |
>>> print(round(ibm3.fertility_table[1]['book'], 3)) | |
1.0 | |
>>> print(round(ibm3.p1, 3)) | |
0.054 | |
>>> test_sentence = bitext[2] | |
>>> test_sentence.words | |
['das', 'buch', 'ist', 'ja', 'klein'] | |
>>> test_sentence.mots | |
['the', 'book', 'is', 'small'] | |
>>> test_sentence.alignment | |
Alignment([(0, 0), (1, 1), (2, 2), (3, None), (4, 3)]) | |
""" | |
def __init__(self, sentence_aligned_corpus, iterations, probability_tables=None): | |
""" | |
Train on ``sentence_aligned_corpus`` and create a lexical | |
translation model, a distortion model, a fertility model, and a | |
model for generating NULL-aligned words. | |
Translation direction is from ``AlignedSent.mots`` to | |
``AlignedSent.words``. | |
:param sentence_aligned_corpus: Sentence-aligned parallel corpus | |
:type sentence_aligned_corpus: list(AlignedSent) | |
:param iterations: Number of iterations to run training algorithm | |
:type iterations: int | |
:param probability_tables: Optional. Use this to pass in custom | |
probability values. If not specified, probabilities will be | |
set to a uniform distribution, or some other sensible value. | |
If specified, all the following entries must be present: | |
``translation_table``, ``alignment_table``, | |
``fertility_table``, ``p1``, ``distortion_table``. | |
See ``IBMModel`` for the type and purpose of these tables. | |
:type probability_tables: dict[str]: object | |
""" | |
super().__init__(sentence_aligned_corpus) | |
self.reset_probabilities() | |
if probability_tables is None: | |
# Get translation and alignment probabilities from IBM Model 2 | |
ibm2 = IBMModel2(sentence_aligned_corpus, iterations) | |
self.translation_table = ibm2.translation_table | |
self.alignment_table = ibm2.alignment_table | |
self.set_uniform_probabilities(sentence_aligned_corpus) | |
else: | |
# Set user-defined probabilities | |
self.translation_table = probability_tables["translation_table"] | |
self.alignment_table = probability_tables["alignment_table"] | |
self.fertility_table = probability_tables["fertility_table"] | |
self.p1 = probability_tables["p1"] | |
self.distortion_table = probability_tables["distortion_table"] | |
for n in range(0, iterations): | |
self.train(sentence_aligned_corpus) | |
def reset_probabilities(self): | |
super().reset_probabilities() | |
self.distortion_table = defaultdict( | |
lambda: defaultdict( | |
lambda: defaultdict(lambda: defaultdict(lambda: self.MIN_PROB)) | |
) | |
) | |
""" | |
dict[int][int][int][int]: float. Probability(j | i,l,m). | |
Values accessed as ``distortion_table[j][i][l][m]``. | |
""" | |
def set_uniform_probabilities(self, sentence_aligned_corpus): | |
# d(j | i,l,m) = 1 / m for all i, j, l, m | |
l_m_combinations = set() | |
for aligned_sentence in sentence_aligned_corpus: | |
l = len(aligned_sentence.mots) | |
m = len(aligned_sentence.words) | |
if (l, m) not in l_m_combinations: | |
l_m_combinations.add((l, m)) | |
initial_prob = 1 / m | |
if initial_prob < IBMModel.MIN_PROB: | |
warnings.warn( | |
"A target sentence is too long (" | |
+ str(m) | |
+ " words). Results may be less accurate." | |
) | |
for j in range(1, m + 1): | |
for i in range(0, l + 1): | |
self.distortion_table[j][i][l][m] = initial_prob | |
# simple initialization, taken from GIZA++ | |
self.fertility_table[0] = defaultdict(lambda: 0.2) | |
self.fertility_table[1] = defaultdict(lambda: 0.65) | |
self.fertility_table[2] = defaultdict(lambda: 0.1) | |
self.fertility_table[3] = defaultdict(lambda: 0.04) | |
MAX_FERTILITY = 10 | |
initial_fert_prob = 0.01 / (MAX_FERTILITY - 4) | |
for phi in range(4, MAX_FERTILITY): | |
self.fertility_table[phi] = defaultdict(lambda: initial_fert_prob) | |
self.p1 = 0.5 | |
def train(self, parallel_corpus): | |
counts = Model3Counts() | |
for aligned_sentence in parallel_corpus: | |
l = len(aligned_sentence.mots) | |
m = len(aligned_sentence.words) | |
# Sample the alignment space | |
sampled_alignments, best_alignment = self.sample(aligned_sentence) | |
# Record the most probable alignment | |
aligned_sentence.alignment = Alignment( | |
best_alignment.zero_indexed_alignment() | |
) | |
# E step (a): Compute normalization factors to weigh counts | |
total_count = self.prob_of_alignments(sampled_alignments) | |
# E step (b): Collect counts | |
for alignment_info in sampled_alignments: | |
count = self.prob_t_a_given_s(alignment_info) | |
normalized_count = count / total_count | |
for j in range(1, m + 1): | |
counts.update_lexical_translation( | |
normalized_count, alignment_info, j | |
) | |
counts.update_distortion(normalized_count, alignment_info, j, l, m) | |
counts.update_null_generation(normalized_count, alignment_info) | |
counts.update_fertility(normalized_count, alignment_info) | |
# M step: Update probabilities with maximum likelihood estimates | |
# If any probability is less than MIN_PROB, clamp it to MIN_PROB | |
existing_alignment_table = self.alignment_table | |
self.reset_probabilities() | |
self.alignment_table = existing_alignment_table # don't retrain | |
self.maximize_lexical_translation_probabilities(counts) | |
self.maximize_distortion_probabilities(counts) | |
self.maximize_fertility_probabilities(counts) | |
self.maximize_null_generation_probabilities(counts) | |
def maximize_distortion_probabilities(self, counts): | |
MIN_PROB = IBMModel.MIN_PROB | |
for j, i_s in counts.distortion.items(): | |
for i, src_sentence_lengths in i_s.items(): | |
for l, trg_sentence_lengths in src_sentence_lengths.items(): | |
for m in trg_sentence_lengths: | |
estimate = ( | |
counts.distortion[j][i][l][m] | |
/ counts.distortion_for_any_j[i][l][m] | |
) | |
self.distortion_table[j][i][l][m] = max(estimate, MIN_PROB) | |
def prob_t_a_given_s(self, alignment_info): | |
""" | |
Probability of target sentence and an alignment given the | |
source sentence | |
""" | |
src_sentence = alignment_info.src_sentence | |
trg_sentence = alignment_info.trg_sentence | |
l = len(src_sentence) - 1 # exclude NULL | |
m = len(trg_sentence) - 1 | |
p1 = self.p1 | |
p0 = 1 - p1 | |
probability = 1.0 | |
MIN_PROB = IBMModel.MIN_PROB | |
# Combine NULL insertion probability | |
null_fertility = alignment_info.fertility_of_i(0) | |
probability *= pow(p1, null_fertility) * pow(p0, m - 2 * null_fertility) | |
if probability < MIN_PROB: | |
return MIN_PROB | |
# Compute combination (m - null_fertility) choose null_fertility | |
for i in range(1, null_fertility + 1): | |
probability *= (m - null_fertility - i + 1) / i | |
if probability < MIN_PROB: | |
return MIN_PROB | |
# Combine fertility probabilities | |
for i in range(1, l + 1): | |
fertility = alignment_info.fertility_of_i(i) | |
probability *= ( | |
factorial(fertility) * self.fertility_table[fertility][src_sentence[i]] | |
) | |
if probability < MIN_PROB: | |
return MIN_PROB | |
# Combine lexical and distortion probabilities | |
for j in range(1, m + 1): | |
t = trg_sentence[j] | |
i = alignment_info.alignment[j] | |
s = src_sentence[i] | |
probability *= ( | |
self.translation_table[t][s] * self.distortion_table[j][i][l][m] | |
) | |
if probability < MIN_PROB: | |
return MIN_PROB | |
return probability | |
class Model3Counts(Counts): | |
""" | |
Data object to store counts of various parameters during training. | |
Includes counts for distortion. | |
""" | |
def __init__(self): | |
super().__init__() | |
self.distortion = defaultdict( | |
lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.0))) | |
) | |
self.distortion_for_any_j = defaultdict( | |
lambda: defaultdict(lambda: defaultdict(lambda: 0.0)) | |
) | |
def update_distortion(self, count, alignment_info, j, l, m): | |
i = alignment_info.alignment[j] | |
self.distortion[j][i][l][m] += count | |
self.distortion_for_any_j[i][l][m] += count | |