You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

322 lines
12 KiB
Python

5 years ago
# -*- coding: utf-8 -*-
# Natural Language Toolkit: IBM Model 2
#
# Copyright (C) 2001-2013 NLTK Project
# Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
Lexical translation model that considers word order.
IBM Model 2 improves on Model 1 by accounting for word order.
An alignment probability is introduced, a(i | j,l,m), which predicts
a source word position, given its aligned target word's position.
The EM algorithm used in Model 2 is:
E step - In the training data, collect counts, weighted by prior
probabilities.
(a) count how many times a source language word is translated
into a target language word
(b) count how many times a particular position in the source
sentence is aligned to a particular position in the target
sentence
M step - Estimate new probabilities based on the counts from the E step
Notations:
i: Position in the source sentence
Valid values are 0 (for NULL), 1, 2, ..., length of source sentence
j: Position in the target sentence
Valid values are 1, 2, ..., length of target sentence
l: Number of words in the source sentence, excluding NULL
m: Number of words in the target sentence
s: A word in the source language
t: A word in the target language
References:
Philipp Koehn. 2010. Statistical Machine Translation.
Cambridge University Press, New York.
Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
Robert L. Mercer. 1993. The Mathematics of Statistical Machine
Translation: Parameter Estimation. Computational Linguistics, 19 (2),
263-311.
"""
from __future__ import division
import warnings
from collections import defaultdict
from nltk.translate import AlignedSent
from nltk.translate import Alignment
from nltk.translate import IBMModel
from nltk.translate import IBMModel1
from nltk.translate.ibm_model import Counts
class IBMModel2(IBMModel):
"""
Lexical translation model that considers word order
>>> bitext = []
>>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']))
>>> bitext.append(AlignedSent(['das', 'haus', 'ist', 'ja', 'groß'], ['the', 'house', 'is', 'big']))
>>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']))
>>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house']))
>>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book']))
>>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book']))
>>> ibm2 = IBMModel2(bitext, 5)
>>> print(round(ibm2.translation_table['buch']['book'], 3))
1.0
>>> print(round(ibm2.translation_table['das']['book'], 3))
0.0
>>> print(round(ibm2.translation_table['buch'][None], 3))
0.0
>>> print(round(ibm2.translation_table['ja'][None], 3))
0.0
>>> print(ibm2.alignment_table[1][1][2][2])
0.938...
>>> print(round(ibm2.alignment_table[1][2][2][2], 3))
0.0
>>> print(round(ibm2.alignment_table[2][2][4][5], 3))
1.0
>>> test_sentence = bitext[2]
>>> test_sentence.words
['das', 'buch', 'ist', 'ja', 'klein']
>>> test_sentence.mots
['the', 'book', 'is', 'small']
>>> test_sentence.alignment
Alignment([(0, 0), (1, 1), (2, 2), (3, 2), (4, 3)])
"""
def __init__(self, sentence_aligned_corpus, iterations, probability_tables=None):
"""
Train on ``sentence_aligned_corpus`` and create a lexical
translation model and an alignment model.
Translation direction is from ``AlignedSent.mots`` to
``AlignedSent.words``.
:param sentence_aligned_corpus: Sentence-aligned parallel corpus
:type sentence_aligned_corpus: list(AlignedSent)
:param iterations: Number of iterations to run training algorithm
:type iterations: int
:param probability_tables: Optional. Use this to pass in custom
probability values. If not specified, probabilities will be
set to a uniform distribution, or some other sensible value.
If specified, all the following entries must be present:
``translation_table``, ``alignment_table``.
See ``IBMModel`` for the type and purpose of these tables.
:type probability_tables: dict[str]: object
"""
super(IBMModel2, self).__init__(sentence_aligned_corpus)
if probability_tables is None:
# Get translation probabilities from IBM Model 1
# Run more iterations of training for Model 1, since it is
# faster than Model 2
ibm1 = IBMModel1(sentence_aligned_corpus, 2 * iterations)
self.translation_table = ibm1.translation_table
self.set_uniform_probabilities(sentence_aligned_corpus)
else:
# Set user-defined probabilities
self.translation_table = probability_tables['translation_table']
self.alignment_table = probability_tables['alignment_table']
for n in range(0, iterations):
self.train(sentence_aligned_corpus)
self.align_all(sentence_aligned_corpus)
def set_uniform_probabilities(self, sentence_aligned_corpus):
# a(i | j,l,m) = 1 / (l+1) for all i, j, l, m
l_m_combinations = set()
for aligned_sentence in sentence_aligned_corpus:
l = len(aligned_sentence.mots)
m = len(aligned_sentence.words)
if (l, m) not in l_m_combinations:
l_m_combinations.add((l, m))
initial_prob = 1 / (l + 1)
if initial_prob < IBMModel.MIN_PROB:
warnings.warn(
"A source sentence is too long ("
+ str(l)
+ " words). Results may be less accurate."
)
for i in range(0, l + 1):
for j in range(1, m + 1):
self.alignment_table[i][j][l][m] = initial_prob
def train(self, parallel_corpus):
counts = Model2Counts()
for aligned_sentence in parallel_corpus:
src_sentence = [None] + aligned_sentence.mots
trg_sentence = ['UNUSED'] + aligned_sentence.words # 1-indexed
l = len(aligned_sentence.mots)
m = len(aligned_sentence.words)
# E step (a): Compute normalization factors to weigh counts
total_count = self.prob_all_alignments(src_sentence, trg_sentence)
# E step (b): Collect counts
for j in range(1, m + 1):
t = trg_sentence[j]
for i in range(0, l + 1):
s = src_sentence[i]
count = self.prob_alignment_point(i, j, src_sentence, trg_sentence)
normalized_count = count / total_count[t]
counts.update_lexical_translation(normalized_count, s, t)
counts.update_alignment(normalized_count, i, j, l, m)
# M step: Update probabilities with maximum likelihood estimates
self.maximize_lexical_translation_probabilities(counts)
self.maximize_alignment_probabilities(counts)
def maximize_alignment_probabilities(self, counts):
MIN_PROB = IBMModel.MIN_PROB
for i, j_s in counts.alignment.items():
for j, src_sentence_lengths in j_s.items():
for l, trg_sentence_lengths in src_sentence_lengths.items():
for m in trg_sentence_lengths:
estimate = (
counts.alignment[i][j][l][m]
/ counts.alignment_for_any_i[j][l][m]
)
self.alignment_table[i][j][l][m] = max(estimate, MIN_PROB)
def prob_all_alignments(self, src_sentence, trg_sentence):
"""
Computes the probability of all possible word alignments,
expressed as a marginal distribution over target words t
Each entry in the return value represents the contribution to
the total alignment probability by the target word t.
To obtain probability(alignment | src_sentence, trg_sentence),
simply sum the entries in the return value.
:return: Probability of t for all s in ``src_sentence``
:rtype: dict(str): float
"""
alignment_prob_for_t = defaultdict(lambda: 0.0)
for j in range(1, len(trg_sentence)):
t = trg_sentence[j]
for i in range(0, len(src_sentence)):
alignment_prob_for_t[t] += self.prob_alignment_point(
i, j, src_sentence, trg_sentence
)
return alignment_prob_for_t
def prob_alignment_point(self, i, j, src_sentence, trg_sentence):
"""
Probability that position j in ``trg_sentence`` is aligned to
position i in the ``src_sentence``
"""
l = len(src_sentence) - 1
m = len(trg_sentence) - 1
s = src_sentence[i]
t = trg_sentence[j]
return self.translation_table[t][s] * self.alignment_table[i][j][l][m]
def prob_t_a_given_s(self, alignment_info):
"""
Probability of target sentence and an alignment given the
source sentence
"""
prob = 1.0
l = len(alignment_info.src_sentence) - 1
m = len(alignment_info.trg_sentence) - 1
for j, i in enumerate(alignment_info.alignment):
if j == 0:
continue # skip the dummy zeroeth element
trg_word = alignment_info.trg_sentence[j]
src_word = alignment_info.src_sentence[i]
prob *= (
self.translation_table[trg_word][src_word]
* self.alignment_table[i][j][l][m]
)
return max(prob, IBMModel.MIN_PROB)
def align_all(self, parallel_corpus):
for sentence_pair in parallel_corpus:
self.align(sentence_pair)
def align(self, sentence_pair):
"""
Determines the best word alignment for one sentence pair from
the corpus that the model was trained on.
The best alignment will be set in ``sentence_pair`` when the
method returns. In contrast with the internal implementation of
IBM models, the word indices in the ``Alignment`` are zero-
indexed, not one-indexed.
:param sentence_pair: A sentence in the source language and its
counterpart sentence in the target language
:type sentence_pair: AlignedSent
"""
best_alignment = []
l = len(sentence_pair.mots)
m = len(sentence_pair.words)
for j, trg_word in enumerate(sentence_pair.words):
# Initialize trg_word to align with the NULL token
best_prob = (
self.translation_table[trg_word][None]
* self.alignment_table[0][j + 1][l][m]
)
best_prob = max(best_prob, IBMModel.MIN_PROB)
best_alignment_point = None
for i, src_word in enumerate(sentence_pair.mots):
align_prob = (
self.translation_table[trg_word][src_word]
* self.alignment_table[i + 1][j + 1][l][m]
)
if align_prob >= best_prob:
best_prob = align_prob
best_alignment_point = i
best_alignment.append((j, best_alignment_point))
sentence_pair.alignment = Alignment(best_alignment)
class Model2Counts(Counts):
"""
Data object to store counts of various parameters during training.
Includes counts for alignment.
"""
def __init__(self):
super(Model2Counts, self).__init__()
self.alignment = defaultdict(
lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))
)
self.alignment_for_any_i = defaultdict(
lambda: defaultdict(lambda: defaultdict(lambda: 0.0))
)
def update_lexical_translation(self, count, s, t):
self.t_given_s[t][s] += count
self.any_t_given_s[s] += count
def update_alignment(self, count, i, j, l, m):
self.alignment[i][j][l][m] += count
self.alignment_for_any_i[j][l][m] += count