h1

h2

h3

h4

h5
h6
% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

% PhD thesis record (RWTH Aachen institutional-repository export).
% Non-standard fields (othercontributors, reportid, cin, ddc, cid, typ) are
% repository metadata; standard BibTeX styles silently ignore them.
% Fixed: title had a single-letter brace ({D}ata-driven), which breaks
% kerning/hyphenation and is redundant (the first word keeps its capital
% under sentence-casing styles). Title is now stored in Title Case so any
% style can downcase it as needed.
@PHDTHESIS{Golik:795683,
      author       = {Golik, Pavel},
      othercontributors = {Ney, Hermann and Häb-Umbach, Reinhold},
      title        = {Data-Driven Deep Modeling and Training for Automatic
                      Speech Recognition},
      school       = {RWTH Aachen University},
      type         = {Dissertation},
      address      = {Aachen},
      reportid     = {RWTH-2020-08504},
      pages        = {1 Online-Ressource (xiii, 124 Seiten) : Illustrationen,
                      Diagramme},
      year         = {2020},
      note         = {Veröffentlicht auf dem Publikationsserver der RWTH Aachen
                      University; Dissertation, RWTH Aachen University, 2020},
      abstract     = {Many of today's state-of-the-art automatic speech
                      recognition (ASR) systems are based on hybrid hidden Markov
                      models (HMM) that rely on neural networks to provide
                      acoustic and language model probabilities. A dynamic
                      programming decoder then uses these probabilities to find
                      and output the most likely sequence of words for a given
                      input audio signal. As one of the prerequisites, the
                      acoustic model needs to be trained on a collection of
                      transcribed recordings: this acoustic training is the main
                      focus of this thesis. Commonly, the input signal is
                      represented as a sequence of relatively low-dimensional
                      feature vectors that capture the distribution of short-time
                      energy within a sliding analysis window. The process of
                      conventional feature extraction is based on time-frequency
                      decomposition of the one-dimensional input signal using
                      either Fourier transform or a bank of band pass filters. The
                      first part of this thesis centers around the question: to
                      which extent can the feature extraction be learned by the
                      acoustic model from transcribed audio data, without manually
                      specifying a pre-processing pipeline? We will show that not
                      only can a neural network learn to classify the HMM states
                      from the raw time signal, but also learn to perform the
                      time-frequency decomposition in its input layer. Inspired by
                      this finding, we will replace the fully-connected input
                      layer by a convolutional layer that is well suited to assist
                      the neural network in learning shift-invariant patterns, and
                      whose operation can be interpreted as filtering in time.
                      Such models show very similar error rates, while not yet
                      outperforming the hand-crafted feature extraction pipeline.
                      In the second part, we will investigate the objective
                      function that is optimized during the supervised acoustic
                      training. While it is commonly believed that softmax
                      normalization in the output layer requires the training to
                      minimize the negative log posterior probability of the data
                      (cross entropy), there is no reason to assume that it cannot
                      be done with other objective functions. In fact, both cross
                      entropy and squared error can be shown to be upper bound of
                      the difference between the Bayes error and the model error
                      (i.e. the difference that arises when the decision rule
                      relies on a model instead of the true probability). We will
                      demonstrate how a hybrid acoustic model can be trained using
                      squared error criterion, and its accuracy improved by
                      initializing it with a model pre-trained with the cross
                      entropy criterion. In the third part of this study, we will
                      investigate how i-vectors can be used for acoustic
                      adaptation, and show that they can help obtain a consistent
                      reduction of word error rate on multiple tasks.
                      Additionally, we will outline a careful analysis of
                      different integration strategies as well as of the
                      parameters of the i-vector extraction pipeline. Finally, in
                      the fourth part of this thesis we will apply these and other
                      methods to the task of speech recognition and keyword search
                      on low-resource languages. The limited amount of available
                      resources makes the acoustic training extremely challenging.
                      We will present a series of experiments performed in the
                      scope of the IARPA Babel project that make heavy use of
                      multilingual bottleneck features and explore techniques such
                      as semi-supervised training, graphemic pronunciation
                      modeling, and handling of keywords not observed during
                      training.},
      cin          = {122010 / 120000},
      ddc          = {004},
      cid          = {$I:(DE-82)122010_20140620$ / $I:(DE-82)120000_20140620$},
      typ          = {PUB:(DE-HGF)11},
      doi          = {10.18154/RWTH-2020-08504},
      url          = {https://publications.rwth-aachen.de/record/795683},
}