% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
@PHDTHESIS{Wang:956960,
author = {Wang, Weiyue},
othercontributors = {Ney, Hermann and van Genabith, Josef and Hoos, Holger
Hendrik},
title = {{N}eural hidden {M}arkov model for machine translation},
school = {RWTH Aachen University},
type = {Dissertation},
address = {Aachen},
publisher = {RWTH Aachen University},
reportid = {RWTH-2023-04470},
pages = {1 online resource : illustrations},
year = {2023},
note = {Published on the publication server of RWTH Aachen
University; Dissertation, RWTH Aachen University, 2023},
abstract = {Recently, neural machine translation systems have shown
promising performance, outperforming phrase-based systems,
which were the state of the art in statistical machine
translation for more than 10 years. Regardless of whether a
recurrent neural network with long short-term memory, a
convolutional neural network, or a self-attentive transformer
network is used, the attention mechanism is one of the key
components of every modern neural machine translation system.
In this work, we propose a completely novel neural
architecture for machine translation, referred to as a direct
hidden Markov model, as an alternative to attention-based
systems. An attention component helps an encoder-decoder
model attend to
specific positions on the source side to produce a
translation. In this way, the translation performance is
significantly improved. This mechanism captures the
correspondence between the source and target hidden states,
and has a similar functionality to an alignment model in a
phrase-based machine translation system. However, recent
studies have found that using attention weights straight out
of the box to align words results in poor alignment quality.
This inspires us to introduce an explicit alignment model
into the neural architecture in order to improve the
alignment and thus also the translation quality of the
overall system. To this end, we propose to use the concept
of the hidden Markov model from statistical machine
translation, which is made up of a lexicon model and an
alignment model. In the neural hidden Markov model, the
lexicon and alignment probabilities are modeled by neural
networks, and the alignment is modeled in the target-to-source
direction so that it can be used directly in forward
translation as an attention component. The two models are
trained jointly with the forward-backward algorithm, so the
end-to-end training process of a neural machine translation
system is preserved. Various neural network architectures
can be used to model the lexicon and the alignment
probabilities. We start with simple feedforward neural
networks and apply our first model to re-rank n-best lists
generated by phrase-based systems and observe significant
improvements. In order to build a monolithic direct hidden
Markov model, the more powerful recurrent neural networks
with long short-term memory are applied to the architecture,
and a standalone decoder is implemented. By replacing the
attention mechanism with a first-order alignment model, we
achieve comparable performance to the baseline attention
model while significantly improving the alignment quality.
The improvements in alignment quality do not lead to an
improvement in translation performance; one possible reason
is that the attention mechanism aims to find a relevant
context for estimating the next target word, and source words
with high attention weights are therefore not necessarily
translation equivalents of the target word. To
keep pace with the development of neural machine
translation, we also study the possibility of applying the
transformer architecture to the direct hidden Markov model.
Contrary to the recurrent neural network case, here we do
not completely replace the attention mechanism with the
alignment model. Instead, we combine the alignment
information obtained by the hidden Markov model
factorization with the attention mechanism. The experimental
results show that applying the concept of the direct hidden
Markov model significantly improves the performance of the
state-of-the-art self-attentive transformer architecture in
terms of TER and CHARACTER scores. In addition to the work
on the direct hidden Markov model, we propose two novel
metrics for machine translation evaluation, called CHARACTER
and EED. These are easy to use and publicly available. They
perform promisingly in the annual WMT metrics shared tasks
and are consistently among the front runners.},
cin = {122010 / 120000},
ddc = {004},
cid = {$I:(DE-82)122010_20140620$ / $I:(DE-82)120000_20140620$},
typ = {PUB:(DE-HGF)11},
doi = {10.18154/RWTH-2023-04470},
url = {https://publications.rwth-aachen.de/record/956960},
}