% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
@phdthesis{Psarras:945184,
  author            = {Psarras, Christos},
  othercontributors = {Bientinesi, Paolo and van de Geijn, Robert and Morrison,
                       Abigail Joanna Rhodes},
  title             = {Evaluating and improving upon the use of high-performance
                       libraries in linear and multi-linear algebra},
  school            = {RWTH Aachen University},
  type              = {Dissertation},
  address           = {Aachen},
  publisher         = {RWTH Aachen University},
  reportid          = {RWTH-2023-01466},
  pages             = {1 Online-Ressource : Diagramme},
  year              = {2023},
  note              = {Veröffentlicht auf dem Publikationsserver der RWTH Aachen
                       University; Dissertation, RWTH Aachen University, 2023},
  abstract          = {This dissertation aims to improve the process of
                       translating (mapping) expressions containing matrices and
                       tensors to high-performance code and thus contributes to the
                       linear and multi-linear algebra domains, respectively. The
                       two domains are examined separately. On the one hand, in the
                       linear algebra domain, standardized building-block libraries
                       (BLAS/LAPACK) have long existed, offering a set of highly
                       optimized functions called kernels. The existence of those
                       libraries has made the task of evaluating mathematical
                       expressions containing matrices equivalent to mapping those
                       expressions to a sequence of kernel invocations. However,
                       end-users are progressively less likely to go through the
                       time-consuming process of directly using said kernels;
                       instead, languages and libraries, which offer a higher level
                       of abstraction, are becoming increasingly popular. These
                       languages and libraries offer mechanisms that internally map
                       an input program to lower level kernels. Unfortunately, our
                       experience suggests that, in terms of performance, this
                       mapping is typically suboptimal. In this dissertation, we
                       formally define the problem of mapping a linear algebra
                       expression to a set of available building blocks as the
                       "Linear Algebra Mapping Problem" (LAMP). Then, we
                       investigate how effectively a benchmark of test problems is
                       solved by popular high-level programming languages and
                       libraries. Specifically, we consider Matlab, Octave, Julia,
                       R, Armadillo (C++), Eigen (C++), and NumPy (Python); the
                       benchmark tests both compiler optimizations, as well as
                       linear algebra specific optimizations, such as the optimal
                       parenthesization of matrix products. Finally, we provide
                       guidelines to language developers and end-users for how to
                       prioritize different optimizations when solving a LAMP. On
                       the other hand, in the multi-linear algebra domain, the
                       absence of standardized, building-block libraries has led to
                       significant replication of effort when it comes to software
                       development. While we point out the benefit that such
                       libraries would provide to a number of scientific fields, at
                       the same time we identify and focus on cases where such a
                       library-based approach would not suffice. Specifically, we
                       investigate two such cases in one of the most prominent
                       fields in multi-linear algebra, chemometrics: the Canonical
                       Polyadic (CP) tensor decomposition and jackknife resampling,
                       a statistical method used to determine uncertainties of CP
                       decompositions. For the first case, the CP decomposition, a
                       broadly used method for computing it relies on the
                       Alternating Least Squares (ALS) algorithm. When the number
                       of components is small, regardless of its implementation,
                       ALS exhibits low arithmetic intensity, which severely
                       hinders its performance and makes GPU offloading
                       ineffective. We observe that, in practice, experts often
                       have to compute multiple decompositions of the same tensor,
                       each with a small number of components (typically fewer than
                       20). Leveraging this observation, we illustrate how multiple
                       decompositions of the same tensor can be fused together at
                       the algorithmic level to increase the arithmetic intensity.
                       Therefore, both CPUs and GPUs can be utilized more
                       efficiently for further speedups; at the same time the
                       technique is compatible with many enhancements typically
                       used in ALS, such as line search, and non-negativity
                       constraints. We introduce the Concurrent ALS (CALS)
                       algorithm and library, which offers an interface to Matlab,
                       and a mechanism to effectively deal with the issue that
                       decompositions complete at different times. Experimental
                       results on artificial and real datasets demonstrate that the
                       increased arithmetic intensity results in a shorter time to
                       completion. For the second case, we examine jackknife
                       resampling. Upon observation that the jackknife resampled
                       tensors, though different, are nearly identical, we show
                       that it is possible to extend the CALS technique to a
                       jackknife resampling scenario. This extension gives access
                       to the computational efficiency advantage of CALS for the
                       price of a modest increase (typically a few percent) in the
                       number of floating point operations. Numerical experiments
                       on both synthetic and real-world datasets demonstrate that
                       the proposed workflow can be several times faster than a
                       straightforward workflow where the jackknife submodels are
                       processed individually.},
  cin               = {124920 / 120000},
  ddc               = {004},
  cid               = {$I:(DE-82)124920_20200227$ / $I:(DE-82)120000_20140620$},
  pnm               = {GRK 2379 - GRK 2379: Moderne Inverse Probleme: Von
                       Geometrie über Daten hin zu Modellen und Anwendungen
                       (333849990)},
  pid               = {G:(GEPRIS)333849990},
  typ               = {PUB:(DE-HGF)11},
  doi               = {10.18154/RWTH-2023-01466},
  url               = {https://publications.rwth-aachen.de/record/945184},
}