% Park and Egger, ACM TACO 2026. The entry is marked "Just Accepted" and
% carries no volume, number, or pages fields; add them (and drop the note)
% once the final version has been assigned to an issue.
% Acronyms in the title are brace-protected so sentence-casing styles keep
% their capitalisation; month uses the standard bare three-letter macro.
@article{2026.TACO.Park,
  author    = {Park, DaOn and Egger, Bernhard},
  title     = {{CPU-GPU} Workload Distribution during Throughput-Oriented {LLM} Inference on Single-{GPU} Systems},
  journal   = {ACM Transactions on Architecture and Code Optimization},
  year      = {2026},
  month     = may,
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  issn      = {1544-3566},
  doi       = {10.1145/3816433},
  url       = {https://doi.org/10.1145/3816433},
  note      = {Just Accepted},
  keywords  = {CPU offloading, Throughput-latency tradeoff, Large language model generation, CPU-GPU workload distribution, Single commodity-GPU systems}
}