% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
% Bachelor's thesis (type = Bachelorarbeit), RWTH Aachen University, 2023.
% Classic BibTeX has no dedicated bachelor's-thesis entry type, so the
% mastersthesis type is used and the `type' field supplies the printed label.
% In `title', braces protect whole words that must keep their case (the
% acronym POP, the proper noun Spark) under sentence-casing styles.
% Fields such as othercontributors, reportid, cin, ddc, cid and typ are
% repository metadata; standard styles presumably ignore them.
@mastersthesis{Brckner:955544,
  author            = {Brückner, Moritz},
  othercontributors = {Müller, Matthias S. and Geisler, Sandra and Liem, Radita
                       Tapaning Hesti},
  title             = {Performance analysis using {POP} methodology in {Spark} big
                       data applications},
  school            = {RWTH Aachen University},
  type              = {Bachelorarbeit},
  address           = {Aachen},
  publisher         = {RWTH Aachen University},
  reportid          = {RWTH-2023-03557},
  pages             = {1 Online-Ressource : Illustrationen, Diagramme},
  year              = {2023},
  note              = {Veröffentlicht auf dem Publikationsserver der RWTH Aachen
                       University; Bachelorarbeit, RWTH Aachen University, 2023},
  abstract          = {Today’s software applications need to cope with ever
                       increasing amounts of data while processing the data in a
                       reasonable amount of time with limited resources.
                       Specialized frameworks such as Apache Hadoop or Apache Spark
                       are often used to meet those requirements, making it
                       possible to run an application in a distributed and parallel
                       manner on multiple compute nodes in a cluster network. A
                       common issue with these frameworks is that both the
                       configuration of an applications as well as the kind of
                       application and the structure of its data are strongly
                       influencing the application’s performance. In addition to
                       that, there seems to be a current trend of convergence of
                       the originally largely independent disciplines of
                       high-performance computing (HPC) and big data, whose
                       applications increasingly overlap. As a result, the
                       application of Apache Spark on HPC systems is gaining
                       relevance and, consequently, also the study of performance
                       of Spark applications on these systems. In this thesis, the
                       POP methodology, originally developed for analyzing the
                       performance of HPC applications, is applied to Spark big
                       data applications. The core principle of the methodology is
                       to assign a score to individual performance-influencing
                       aspects, which can be used to obtain a comprehensive and
                       direct overview of potential performance bottlenecks of an
                       application. The aim of this thesis is to evaluate selected
                       Spark benchmarks from the HiBench benchmark suite and to use
                       the obtained results to derive POP metrics for Spark
                       applications. Beyond the POP metrics that are used in the
                       HPC context, additional Spark-specific metrics are proposed
                       in order to significantly extend the range of identifiable
                       problems and to allow for a more precise determination of
                       these problems. This thesis comes to the conclusion that, in
                       principle, the POP methodology can be successfully applied
                       to Spark applications, although in some cases certain
                       limitations or assumptions are necessary. Even though it is
                       not possible to verify the correctness and completeness of
                       the proposed metrics beyond any doubt by means of the
                       conducted experiments, the methodology presented in this
                       thesis seems to be suitable for identifying a large number
                       of different performance problems. Yet, further
                       investigations are required in order to eliminate some of
                       the limitations and assumptions made, and to improve and
                       validate both individual metrics as well as the methodology
                       as a whole.},
  cin               = {123010 / 022000 / 120000},
  ddc               = {004},
  cid               = {$I:(DE-82)123010_20140620$ / $I:(DE-82)022000_20140101$ /
                       $I:(DE-82)120000_20140620$},
  typ               = {PUB:(DE-HGF)2},
  doi               = {10.18154/RWTH-2023-03557},
  url               = {https://publications.rwth-aachen.de/record/955544},
}