% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
@phdthesis{Macherey:50613,
  author            = {Macherey, Wolfgang},
  othercontributors = {Ney, Hermann},
  title             = {Discriminative Training and Acoustic Modeling for
                       Automatic Speech Recognition},
  school            = {RWTH Aachen University},
  address           = {Aachen},
  publisher         = {Publikationsserver der RWTH Aachen University},
  reportid          = {RWTH-CONV-113149},
  pages             = {VII, 200 S. : graph. Darst.},
  year              = {2010},
  note              = {Zusammenfassung in engl. und dt. Sprache; Aachen, Techn.
                       Hochsch., Diss., 2010},
  abstract          = {Discriminative training has become an important means for
                       estimating model parameters in many statistical pattern
                       recognition tasks. While standard learning methods based on
                       the Maximum Likelihood criterion aim at optimizing model
                       parameters only class individually, discriminative
                       approaches benefit from taking all competing classes into
                       account, thus leading to enhanced class separability which
                       is often accompanied by reduced error rates and improved
                       system performance. Motivated by learning algorithms evolved
                       from neural networks, discriminative methods established as
                       training methods for classification problems such as complex
                       as automatic speech recognition. In this thesis, an extended
                       unifying approach for a class of discriminative training
                       criteria is suggested that, in addition to the Maximum
                       Mutual Information (MMI) criterion and the Minimum
                       Classification Error (MCE) criterion, also captures other
                       criteria more recently proposed as, for example, the Minimum
                       Word Error (MWE) criterion and the closely related Minimum
                       Phone Error (MPE) criterion. The new approach allows for
                       investigating a large number of different training criteria
                       within a single framework and thus to yield consistent
                       analytical and experimental results about their training
                       behavior and recognition performance. This thesis also
                       presents the first successful implementation of a large
                       scale, lattice-based MCE training. Experiments conducted on
                       several speech recognition corpora show that the MCE
                       criterion yields recognition results that are similar to or
                       even outperform the performance gains obtained with both the
                       MWE and the MPE criterion. The parameter optimization
                       problem is discussed for Gaussian mixture models where the
                       covariance matrices can be subject to arbitrary tying
                       schemes. The re-estimation equations as well as the choice
                       of the iteration constants for controlling the convergence
                       rate are discussed for the case that full or diagonal
                       covariance matrices are used. In case of full covariance
                       matrices, the problem of choosing the iteration constants in
                       the Extended Baum (EB) algorithm is shown to result in the
                       solution of a quadratic eigenvalue problem. Two novel
                       methods on setting the iteration constants are proposed that
                       provide faster convergence rates across different variance
                       tying schemes. This thesis also suggests a novel framework
                       that models the posterior distribution directly as a
                       log-linear model. The direct model follows the principle of
                       Maximum Entropy and can effectively be trained using the
                       Generalized Iterative Scaling (GIS) algorithm. Both the
                       direct model and its optimization via the GIS algorithm are
                       compared analytically and experimentally with the MMI
                       criterion and the EB algorithm. Finally, this thesis
                       presents a novel algorithm to efficiently compute and
                       represent the exact and unsmoothed error surface over all
                       sentence hypotheses that are encoded in a word lattice if
                       all parameter settings of a log-linear model are considered
                       that lie along an arbitrary line in the parameter space.
                       While the number of sentence hypotheses encoded in a word
                       lattice is exponential in the lattice size, the complexity
                       of the error surface is shown to be always linearly bounded
                       in the number of lattice arcs. This bound is independent of
                       the underlying error metric. Experiments were conducted on
                       several standardized speech recognition tasks that capture
                       different levels of difficulty, ranging from elementary
                       digit recognition (SieTill) over read speech (Wall Street
                       Journal and North American Business news texts) up to
                       broadcast news transcription tasks (Hub-4). Questions
                       pursued in this context address the effect that different
                       variance tying schemes have on the recognition performance
                       and to what extent increasing the model complexity affects
                       the performance gain of the discriminative training
                       procedure. All experiments were carried out in the extended,
                       unifying approach for a large number of different training
                       criteria.},
  keywords          = {Automatische Spracherkennung (SWD) / Maschinelles Lernen
                       (SWD) / Mustererkennung (SWD)},
  cin               = {122010 / 120000},
  ddc               = {004},
  cid               = {$I:(DE-82)122010_20140620$ / $I:(DE-82)120000_20140620$},
  typ               = {PUB:(DE-HGF)11},
  urn               = {urn:nbn:de:hbz:82-opus-32093},
  url               = {https://publications.rwth-aachen.de/record/50613},
}