% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
% Bachelor's thesis by Irmin Mainka (RWTH Aachen University, 2025).
% The thesis is stored as @mastersthesis with `type` overriding the printed
% label ("Bachelorarbeit"). Fields such as othercontributors, reportid, cin,
% ddc, cid and typ are not standard BibTeX fields; they are presumably
% repository-export metadata and are kept verbatim.
% NOTE(review): the title spelling "machinelles" is kept exactly as found in
% this record (standard German would be "maschinelles") -- confirm against the
% thesis title page before changing it.
% NOTE(review): `pages` holds a physical description, not a page range, and
% `note` repeats type/school/year -- confirm how the target style renders them.
@mastersthesis{Mainka:1009964,
  author            = {Mainka, Irmin},
  othercontributors = {Müller, Matthias S. and Kunkel, Julian and Viehhauser,
                       Dominik},
  title             = {Evaluierung von {Optimierungsstrategien} zur
                       {Datensatzspeicherung} für machinelles {Lernen} auf
                       {HPC} {Systemen}},
  school            = {RWTH Aachen University},
  type              = {Bachelorarbeit},
  address           = {Aachen},
  publisher         = {RWTH Aachen University},
  reportid          = {RWTH-2025-03758},
  pages             = {1 Online-Ressource : Illustrationen},
  year              = {2025},
  note              = {Veröffentlicht auf dem Publikationsserver der RWTH Aachen
                       University; Bachelorarbeit, RWTH Aachen University, 2025},
  abstract          = {Traditional Machine Learning Datasets used to train
                       models are often used in a form consisting of a large
                       amount of small files. This property is detrimental to
                       their widespread use on HPC systems due to the way
                       parallel filesystems work. Several other ways to store
                       such datasets can be found in the areas of both HPC and
                       Python programming. Strategies for both storing and
                       loading datasets are tested in experiments in this
                       thesis. These experiments focus on training an Image
                       Classification model. The strategies used in this
                       thesis include the usage of numpy arrays, LMDB, HDF5
                       and Zarr. The results are then used to evaluate how the
                       different strategies compare to each other. The goal of
                       this thesis is to either find a performant strategy
                       using fewer files or validate the usage of the strategy
                       using many small files.},
  cin               = {123010 / 022000 / 120000},
  ddc               = {004},
  cid               = {$I:(DE-82)123010_20140620$ / $I:(DE-82)022000_20140101$ /
                       $I:(DE-82)120000_20140620$},
  typ               = {PUB:(DE-HGF)2},
  doi               = {10.18154/RWTH-2025-03758},
  url               = {https://publications.rwth-aachen.de/record/1009964},
}