% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
@PHDTHESIS{Nuhn:772331,
author = {Nuhn, Malte},
othercontributors = {Ney, Hermann Josef and Juan, Alfons},
title = {{U}nsupervised training with applications in natural
language processing},
school = {RWTH Aachen University},
type = {Dissertation},
address = {Aachen},
reportid = {RWTH-2019-10638},
  pages = {1 online resource (xiii, 139 pages) : illustrations,
             diagrams},
year = {2019},
  note = {Published on the publication server of RWTH Aachen
             University in 2020; Dissertation, RWTH Aachen University, 2019},
abstract = {The state-of-the-art algorithms for various natural
language processing tasks require large amounts of labeled
training data. At the same time, obtaining labeled data of
high quality is often the most costly step in setting up
             natural language processing systems. In contrast,
             unlabeled data is much cheaper to obtain and is available
             in larger amounts. Currently, only a few training
             algorithms make use of unlabeled data, and in practice,
             training with only unlabeled data is not performed at
             all. In this thesis, we
study how unlabeled data can be used to train a variety of
models used in natural language processing. In particular,
we study models applicable to solving substitution ciphers,
spelling correction, and machine translation. This thesis
lays the groundwork for unsupervised training by presenting
and analyzing the corresponding models and unsupervised
training problems in a consistent manner. We show that the
unsupervised training problem that occurs when breaking
one-to-one substitution ciphers is equivalent to the
             quadratic assignment problem (QAP) when a bigram language
             model is incorporated, and is therefore NP-hard. Based on this
analysis, we present an effective algorithm for unsupervised
training for deterministic substitutions. In the case of
English one-to-one substitution ciphers, we show that our
             novel algorithm achieves results close to the human
             performance reported in [Shannon 49]. Also, using this algorithm, we
present, to the best of our knowledge, the first automatic
decipherment of the second part of the Beale ciphers.
Further, for the task of spelling correction, we work out
             the details of the EM algorithm [Dempster \& Laird+ 77]
and experimentally show that the error rates achieved using
purely unsupervised training reach those of supervised
training. For handling large vocabularies, we introduce a
novel model initialization as well as multiple training
             procedures that significantly speed up training without
             noticeably hurting the performance of the resulting
             models. By incorporating an alignment model, we
further extend this model such that it can be applied to the
task of machine translation. We show that the true lexical
and alignment model parameters can be learned without any
             labeled data: we show experimentally that the corresponding
             likelihood function attains its maximum at the true model
parameters if a sufficient amount of unlabeled data is
available. Further, for the problem of spelling correction
             with symbol substitutions and local swaps, we show
experimentally that the performance achieved with purely
unsupervised EM training reaches that of supervised
training. Finally, using the methods developed in this
thesis, we present results on an unsupervised training task
             for machine translation with a vocabulary ten times larger
             than that of tasks investigated in previous work.},
cin = {122010 / 120000},
ddc = {004},
cid = {$I:(DE-82)122010_20140620$ / $I:(DE-82)120000_20140620$},
typ = {PUB:(DE-HGF)11},
doi = {10.18154/RWTH-2019-10638},
url = {https://publications.rwth-aachen.de/record/772331},
}
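
% Note: the abstract states that deciphering a one-to-one substitution
% cipher under a bigram language model is an instance of the quadratic
% assignment problem (QAP). A minimal sketch of that objective, in our own
% notation (the count matrix N(f, f') and the mapping \phi are our
% assumptions, not taken from the thesis):
%
%   \hat{\phi} = \arg\max_{\phi} \sum_{f, f'} N(f, f') \, \log p(\phi(f') \mid \phi(f))
%
% Here \phi is the unknown one-to-one mapping from cipher symbols to
% plaintext symbols, N(f, f') counts adjacent cipher-symbol pairs in the
% ciphertext, and p(e' \mid e) is the bigram language model. Because each
% term couples two assignments \phi(f) and \phi(f'), maximizing over all
% permutations \phi is a QAP, which is why the problem is NP-hard.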
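% The "purely unsupervised" EM training mentioned for spelling correction
% follows the generic scheme of [Dempster \& Laird+ 77]; a sketch in our
% own notation (observed corrupted strings x, latent correct strings y,
% and parameters \theta are our labels, not the thesis's):
%
%   Q(\theta \mid \theta^{(t)}) = \sum_{y} p(y \mid x; \theta^{(t)}) \, \log p(x, y; \theta)
%   \theta^{(t+1)} = \arg\max_{\theta} Q(\theta \mid \theta^{(t)})
%
% Each iteration is guaranteed not to decrease the likelihood of the
% unlabeled data x, so no labeled pairs (x, y) are required for training.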