% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
% PhD thesis record for Doetsch (2020), RWTH Aachen University.
% Fields such as othercontributors, reportid, physdesc, cin, ddc, cid and typ
% are not standard BibTeX/biblatex fields; they are kept as ignored metadata
% (they appear to come from the repository export -- confirm before relying
% on them). physdesc holds the original physical-description string that was
% previously stored in the pages field; pagetotal carries the page count.
@phdthesis{Doetsch:805289,
  author            = {Doetsch, Patrick},
  othercontributors = {Ney, Hermann and Juan, Alfons},
  title             = {Alignment models for recurrent neural networks},
  school            = {RWTH Aachen University},
  type              = {Dissertation},
  address           = {Aachen},
  reportid          = {RWTH-2020-10873},
  pagetotal         = {xiii, 108},
  physdesc          = {1 Online-Ressource (xiii, 108 Seiten) : Illustrationen,
                       Diagramme},
  year              = {2020},
  note              = {Veröffentlicht auf dem Publikationsserver der RWTH Aachen
                       University; Dissertation, RWTH Aachen University, 2020},
  abstract          = {Modern recognition systems for speech and handwriting make
                       use of neural networks to convert the acoustic signal or
                       handwritten image into text. Neural networks hereby learn
                       the required parameters from transcribed data in a training
                       phase. In the beginning only feed-forward neural networks
                       were used, which had to be initialized with the alignment of
                       observations and labels of a previously trained Gaussian
                       hidden Markov model for good performance. More recently,
                       recurrent neural network architectures have been shown to
                       outperform their non-recurrent counterparts, with Long
                       Short-Term Memories being the most prominent example.
                       Recurrent neural networks can model the temporal nature of
                       the data directly, and thus are able to dynamically change
                       the alignment to better fit the model. In this thesis, we
                       will investigate applications and training techniques of
                       recurrent neural network architectures for speech and
                       handwriting recognition. As part of this thesis we developed
                       a neural network toolkit for hardware accelerated training
                       and recognition of speech and handwriting systems. The
                       software allows to train recurrent neural network
                       architectures as well as traditional feed-forward neural
                       networks and is capable of processing very large amounts of
                       data on multiple computing devices. After training, the
                       models can be loaded into the RWTH Aachen speech
                       recognition toolkit for recognition. Our experiments show
                       that recurrent models outperform feed-forward structures in
                       terms of recognition error and we demonstrate their
                       effectiveness in various experiments on handwriting
                       recognition. Further contributions were made by developing
                       techniques to improve the training performance through
                       optimized data ordering. With our toolkit we then evaluate
                       neural network based methods for handwriting recognition.
                       Our focus is hereby on recurrent topologies that operate on
                       images either in a one-dimensional or two-dimensional
                       fashion, and we investigate various system architectures and
                       implementation techniques. We examine the effectiveness of
                       our proposed solutions on prominent handwriting recognition
                       corpora and compare our systems to other groups in a
                       competitive setting. In the final part of this thesis we
                       investigate the effects of handling the alignment problem
                       within recurrent neural networks. We describe overfitting
                       problems of conventional alignment approaches and study
                       properties of the connectionist temporal classification
                       error criterion. Furthermore, we investigate methods that do
                       not make use of external alignment computations, and instead
                       only rely on a special composition of two recurrent neural
                       networks that is able to transcribe input observations into
                       output symbols directly. Motivated by these results, we
                       develop direct hidden Markov models as a novel inverted
                       alignment method, which is able to overcome some of the
                       limitations we noticed, and we evaluate our method on speech
                       and handwriting recognition tasks.},
  cin               = {122010 / 120000},
  ddc               = {004},
  cid               = {$I:(DE-82)122010_20140620$ / $I:(DE-82)120000_20140620$},
  typ               = {PUB:(DE-HGF)11},
  doi               = {10.18154/RWTH-2020-10873},
  url               = {https://publications.rwth-aachen.de/record/805289},
}