% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
% PhD thesis record exported from the RWTH Aachen publications server
% (record 795683). `othercontributors`, `reportid`, `cin`, `cid`, and `typ`
% are repository-specific fields; standard BibTeX styles silently ignore them.
% The `pages` field carries the library's physical-description string as
% exported by the repository, not a page range.
@phdthesis{Golik:795683,
  author            = {Golik, Pavel},
  othercontributors = {Ney, Hermann and Häb-Umbach, Reinhold},
  title             = {Data-Driven Deep Modeling and Training for Automatic
                       Speech Recognition},
  school            = {RWTH Aachen University},
  type              = {Dissertation},
  address           = {Aachen},
  reportid          = {RWTH-2020-08504},
  pages             = {1 Online-Ressource (xiii, 124 Seiten) : Illustrationen,
                       Diagramme},
  year              = {2020},
  note              = {Veröffentlicht auf dem Publikationsserver der RWTH Aachen
                       University; Dissertation, RWTH Aachen University, 2020},
  abstract          = {Many of today's state-of-the-art automatic speech
                       recognition (ASR) systems are based on hybrid hidden Markov
                       models (HMM) that rely on neural networks to provide
                       acoustic and language model probabilities. A dynamic
                       programming decoder then uses these probabilities to find
                       and output the most likely sequence of words for a given
                       input audio signal. As one of the prerequisites, the
                       acoustic model needs to be trained on a collection of
                       transcribed recordings: this acoustic training is the main
                       focus of this thesis. Commonly, the input signal is
                       represented as a sequence of relatively low-dimensional
                       feature vectors that capture the distribution of short-time
                       energy within a sliding analysis window. The process of
                       conventional feature extraction is based on time-frequency
                       decomposition of the one-dimensional input signal using
                       either Fourier transform or a bank of band pass filters. The
                       first part of this thesis centers around the question: to
                       which extent can the feature extraction be learned by the
                       acoustic model from transcribed audio data, without manually
                       specifying a pre-processing pipeline? We will show that not
                       only can a neural network learn to classify the HMM states
                       from the raw time signal, but also learn to perform the
                       time-frequency decomposition in its input layer. Inspired by
                       this finding, we will replace the fully-connected input
                       layer by a convolutional layer that is well suited to assist
                       the neural network in learning shift-invariant patterns, and
                       whose operation can be interpreted as filtering in time.
                       Such models show very similar error rates, while not yet
                       outperforming the hand-crafted feature extraction pipeline.
                       In the second part, we will investigate the objective
                       function that is optimized during the supervised acoustic
                       training. While it is commonly believed that softmax
                       normalization in the output layer requires the training to
                       minimize the negative log posterior probability of the data
                       (cross entropy), there is no reason to assume that it cannot
                       be done with other objective functions. In fact, both cross
                       entropy and squared error can be shown to be upper bound of
                       the difference between the Bayes error and the model error
                       (i.e. the difference that arises when the decision rule
                       relies on a model instead of the true probability). We will
                       demonstrate how a hybrid acoustic model can be trained using
                       squared error criterion, and its accuracy improved by
                       initializing it with a model pre-trained with the cross
                       entropy criterion. In the third part of this study, we will
                       investigate how i-vectors can be used for acoustic
                       adaptation, and show that they can help obtain a consistent
                       reduction of word error rate on multiple tasks.
                       Additionally, we will outline a careful analysis of
                       different integration strategies as well as of the
                       parameters of the i-vector extraction pipeline. Finally, in
                       the fourth part of this thesis we will apply these and other
                       methods to the task of speech recognition and keyword search
                       on low-resource languages. The limited amount of available
                       resources makes the acoustic training extremely challenging.
                       We will present a series of experiments performed in the
                       scope of the IARPA Babel project that make heavy use of
                       multilingual bottleneck features and explore techniques such
                       as semi-supervised training, graphemic pronunciation
                       modeling, and handling of keywords not observed during
                       training.},
  cin               = {122010 / 120000},
  ddc               = {004},
  cid               = {$I:(DE-82)122010_20140620$ / $I:(DE-82)120000_20140620$},
  typ               = {PUB:(DE-HGF)11},
  doi               = {10.18154/RWTH-2020-08504},
  url               = {https://publications.rwth-aachen.de/record/795683},
}