@inproceedings{10.1145/3642970.3655824,
  author    = {Nouaji, Rahma and Bitchebe, Stella and Balmau, Oana},
  title     = {{SpeedyLoader}: Efficient Pipelining of Data Preprocessing and Machine Learning Training},
  booktitle = {Proceedings of the 4th Workshop on Machine Learning and Systems},
  series    = {EuroMLSys '24},
  year      = {2024},
  pages     = {65--72},
  numpages  = {8},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  location  = {Athens, Greece},
  isbn      = {9798400705410},
  doi       = {10.1145/3642970.3655824},
  url       = {https://doi.org/10.1145/3642970.3655824},
  keywords  = {Data preprocessing, Dataloader, GPU-CPU overlap, Machine learning, Pipelining, Training},
  abstract  = {Data preprocessing consisting of tasks like sample resizing, cropping, and filtering, is a crucial step in machine learning (ML) workflows. Even though the preprocessing step is largely ignored by work that focuses on optimizing training algorithms, in practice for many workloads preprocessing and training are pipelined. Popular ML frameworks like PyTorch use data loaders to feed data into model training. If the pipeline between preprocessing and training is not done carefully, it can cause significant waiting times on the GPU side. To address this limitation, we introduce SpeedyLoader, a system that overlaps preprocessing and training by leveraging asynchronous data preprocessing and avoiding head-of-line blocking. SpeedyLoader incorporates dedicated data loading threads, which organize preprocessed samples into queues based on their predicted processing times. Concurrently, GPUs fetch samples from these queues, ensuring training is not impeded by preprocessing completion. Compared to the default PyTorch DataLoader, SpeedyLoader reduces training time by up to 30\% and increases GPU usage by 4.3\texttimes{}, all while maintaining a consistent evaluation accuracy of 91\%.},
}