% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
@PHDTHESIS{Zeyer:862906,
author = {Zeyer, Albert},
othercontributors = {Ney, Hermann and Watanabe, Shinji and Leibe, Bastian},
title = {{N}eural network based modeling and architectures for
automatic speech recognition and machine translation},
school = {RWTH Aachen University},
type = {Dissertation},
address = {Aachen},
publisher = {RWTH Aachen University},
reportid = {RWTH-2023-00619},
pages = {1 online resource : illustrations},
year = {2022},
note = {Published on the publication server of RWTH Aachen
University 2023; Dissertation, RWTH Aachen University, 2022},
abstract = {Our work aims to advance the field and application of
neural networks, to advance sequence-to-sequence
architectures by extending existing approaches and developing
new ones, and to advance training methods. We perform the first
comprehensive study of long short-term memory (LSTM)
acoustic models and improve over our feed-forward neural
network (FFNN) baseline by $16\%$ relative. We are among the
first to apply bidirectional LSTMs (BLSTMs) to online
recognition. We successfully train convolutional neural
network (CNN) models (ResNet and layer-wise context
expansion with attention (LACE)) which are competitive with
our BLSTM model. We are the first to directly and
comprehensively compare different layer-normalized (LN) LSTM
variants and to study their effect on training stability,
convergence and variance, obtaining improvements of $10\%$
relative over the standard LSTM baseline. We further
perform a comprehensive study on Transformer models in
comparison to LSTMs, and we study Transformer language
models and reach state-of-the-art results with $6\%$
relative improvements over the best LSTM. We aim to advance
beyond the status quo, the hybrid neural network (NN)-hidden
Markov model (HMM) approach, by investigating alternative
sequence-to-sequence architectures such as attention-based
encoder-decoder models. We develop state-of-the-art
attention-based models for machine translation and speech
recognition, operating on byte-pair encoding (BPE) subword
labels. Motivated by introducing monotonicity and potentially
enabling streaming, we propose a simple local windowed
attention variant. We extend this work through a principled
approach with an explicit latent variable, and we introduce
latent attention models, a novel class of segmental models
with hard attention as a special case.
We discover the equivalence of segmental and transducer
models, and propose a novel class of generalized and
extended transducer models, which perform and generalize
better than our attention models. We perform a comprehensive
study on all existing variants from the literature as
special cases of our generalized and extended model and show
the effectiveness of our extensions. We observe that
training strategies play the most important role in achieving
good performance. We investigate training criteria, optimization
techniques, learning rate scheduling, pretraining,
regularization and data augmentation. We propose novel
pretraining schemes for LSTM and end-to-end models, where we
grow the depth and width of the neural network. We
investigate different types of training variance due to
randomness in training, caused by varying random seeds
and non-deterministic training algorithms. We are among the
first to observe and document the high impact of the number
of training epochs. We propose a novel generalized training
procedure for hybrid NN-HMMs where we calculate the full sum
over all alignments, and we identify connectionist temporal
classification (CTC) as a special case of this. We further
provide a mathematical analysis of the peaky behavior of
CTC, making this the first work to explain the peaky
behavior and convergence properties on a mathematical level.
We develop large parts of RETURNN, an efficient and flexible
software framework including beam search, which we use to
perform all the experiments. This framework and most of our results
and baselines are widely used among the team and beyond. All
of our work is published and all code and setups are
available online.},
cin = {122010 / 120000},
ddc = {004},
cid = {$I:(DE-82)122010_20140620$ / $I:(DE-82)120000_20140620$},
pnm = {SEQCLAS - A Sequence Classification Framework for Human
Language Technology (694537)},
pid = {G:(EU-Grant)694537},
typ = {PUB:(DE-HGF)11},
doi = {10.18154/RWTH-2023-00619},
url = {https://publications.rwth-aachen.de/record/862906},
}
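% The local windowed attention variant mentioned in the abstract can be
% illustrated with a short sketch. This is a minimal illustration under
% assumed shapes (q: query vector, K/V: source keys/values) and a
% hypothetical function name, not the implementation from the thesis or
% RETURNN:
%
%   import numpy as np
%
%   def local_windowed_attention(q, K, V, center, win=5):
%       # Scaled dot-product scores over all source positions.
%       scores = K @ q / np.sqrt(K.shape[-1])
%       # Mask out positions outside a window around the (monotonic) center.
%       pos = np.arange(K.shape[0])
%       scores[np.abs(pos - center) > win] = -np.inf
%       alpha = np.exp(scores - scores.max())
%       alpha /= alpha.sum()                 # softmax over the local window
%       return alpha @ V                     # weighted context vector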
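% The pretraining scheme that grows the depth of the network can be sketched
% as follows; a hypothetical PyTorch sketch of the general idea (growing an
% LSTM stack stage by stage and carrying over the trained layers), not
% RETURNN's actual pretraining API:
%
%   import torch
%
%   def grow_lstm(old, num_layers, input_dim=80, hidden=512):
%       # Build a deeper LSTM; copy the trained parameters of the existing
%       # layers, while newly added top layers keep their fresh init.
%       new = torch.nn.LSTM(input_dim, hidden, num_layers=num_layers)
%       if old is not None:
%           new.load_state_dict(old.state_dict(), strict=False)
%       return new
%
%   model = None
%   for depth in (2, 4, 6):   # pretraining stages with growing depth
%       model = grow_lstm(model, depth)
%       # ... train for a few epochs at this depth ...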
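% The generalized full-sum training criterion, of which the abstract
% identifies CTC as a special case, sums over all alignments via the forward
% algorithm. A minimal numpy sketch of the standard CTC case (blank-label
% topology; names are illustrative):
%
%   import numpy as np
%
%   def ctc_full_sum(log_probs, labels, blank=0):
%       # log_probs: (T, C) frame-wise log posteriors; labels: target ids.
%       T = log_probs.shape[0]
%       ext = [blank]
%       for y in labels:
%           ext += [y, blank]          # blank-interleaved label sequence
%       S = len(ext)
%       alpha = np.full((T, S), -np.inf)
%       alpha[0, 0] = log_probs[0, blank]
%       alpha[0, 1] = log_probs[0, ext[1]]
%       for t in range(1, T):
%           for s in range(S):
%               terms = [alpha[t - 1, s]]                # stay
%               if s > 0:
%                   terms.append(alpha[t - 1, s - 1])    # advance
%               if s > 1 and ext[s] != blank and ext[s] != ext[s - 2]:
%                   terms.append(alpha[t - 1, s - 2])    # skip a blank
%               alpha[t, s] = np.logaddexp.reduce(terms) + log_probs[t, ext[s]]
%       # Sum over alignments ending in the last label or the final blank.
%       return -np.logaddexp(alpha[T - 1, S - 1], alpha[T - 1, S - 2])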