% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
@PHDTHESIS{Wbker:696036,
author = {Wübker, Jörn},
othercontributors = {Ney, Hermann and van Genabith, Josef},
title = {{E}ffective training and efficient decoding for statistical
machine translation},
school = {RWTH Aachen University},
type = {Dissertation},
address = {Aachen},
reportid = {RWTH-2017-06573},
pages = {1 online resource (xi, 120 pages) : illustrations,
diagrams},
year = {2017},
note = {Published on the publication server of RWTH Aachen
University; Dissertation, RWTH Aachen University, 2017},
abstract = {Statistical machine translation, the task of translating
text from one natural language into another using
statistical models, can be divided into three main problems:
modeling, search and training. This thesis gives a detailed
description of the most popular approach to statistical
machine translation, the phrase-based paradigm, and presents
several improvements to the state of the art in all three of
the aspects mentioned above. Regarding the search problem,
we propose three novel language model look-ahead techniques
that can considerably increase the time efficiency of the
search algorithm, each with a different quality trade-off.
They are evaluated in detail with respect to their effect on
translation quality, translation speed, the number of
language model queries, and the number of nodes generated
within the search graph. We show that our final system
outperforms the popular Moses toolkit in terms of
translation speed. With regard to the modeling problem, we
extend the state of the art with novel smoothing models
based on word classes.
Data sparsity is a common pitfall for statistical models. We
leverage word classes that can be learned in an unsupervised
fashion in order to re-parameterize the standard
phrase-based models, resulting in a smoother probability
distribution and reduced sparsity. The largest part of this
work is dedicated to the training problem. We investigate
both generative and discriminative training methods, two
fundamentally different approaches to learning statistical
models. Our generative procedure is inspired by the
expectation-maximization algorithm and is based on
force-aligning the training data while applying the
leave-one-out technique to avoid overfitting. Its advantage
over the standard heuristic model extraction is that it
provides a framework which uses the same consistent models
in training and search. The initial technique is further
developed into a length-incremental procedure which does not
require initialization with a Viterbi word alignment and is
thus not biased by its inconsistencies. Both the learning
procedure and the resulting models are analyzed in detail.
As a discriminative training procedure, we employ a
gradient-based method to optimize an expected BLEU objective
function. Our novel contribution is the application of the
resilient backpropagation algorithm, which is experimentally
shown to be superior to several previously proposed
techniques. It is also significantly more time- and
memory-efficient than previous work, allowing us to run
training on the largest data set reported in the literature
to date. Our novel techniques are experimentally evaluated
against internal and external results on large-scale
translation tasks and within public evaluation campaigns.
The word-class language model and the discriminative
training procedure in particular prove valuable for
state-of-the-art large-scale translation systems.},
cin = {122010 / 120000},
ddc = {004},
cid = {$I:(DE-82)122010_20140620$ / $I:(DE-82)120000_20140620$},
typ = {PUB:(DE-HGF)11},
doi = {10.18154/RWTH-2017-06573},
url = {https://publications.rwth-aachen.de/record/696036},
}
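% For convenience, a minimal LaTeX document that consumes this entry via
% biblatex with the biber backend (one of the UTF-8-capable tools recommended
% above). This is a sketch only: the file name "references.bib" is a
% placeholder for wherever this entry is stored.
%
%   \documentclass{article}
%   \usepackage[utf8]{inputenc} % needed on pdfLaTeX; omit on XeLaTeX/LuaLaTeX
%   \usepackage[backend=biber]{biblatex}
%   \addbibresource{references.bib}
%   \begin{document}
%   Phrase-based training and decoding are treated in depth
%   in~\cite{Wbker:696036}.
%   \printbibliography
%   \end{document}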