{"id":"https://openalex.org/W4407196897","doi":"https://doi.org/10.1145/3669940.3707265","title":"Forecasting GPU Performance for Deep Learning Training and Inference","display_name":"Forecasting GPU Performance for Deep Learning Training and Inference","publication_year":2025,"publication_date":"2025-02-06","ids":{"openalex":"https://openalex.org/W4407196897","doi":"https://doi.org/10.1145/3669940.3707265"},"language":"en","primary_location":{"id":"doi:10.1145/3669940.3707265","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3669940.3707265","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 1","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3669940.3707265","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5061941075","display_name":"Seonho Lee","orcid":"https://orcid.org/0000-0002-0035-1359"},"institutions":[{"id":"https://openalex.org/I130701444","display_name":"Georgia Institute of Technology","ror":"https://ror.org/01zkghx44","country_code":"US","type":"education","lineage":["https://openalex.org/I130701444"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Seonho Lee","raw_affiliation_strings":["Georgia Institute of Technology, Atlanta, GA, USA"],"affiliations":[{"raw_affiliation_string":"Georgia Institute of Technology, Atlanta, GA, USA","institution_ids":["https://openalex.org/I130701444"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5011102459","display_name":"Amar Phanishayee","orcid":"https://orcid.org/0009-0001-2777-1118"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Amar Phanishayee","raw_affiliation_strings":["Meta, Seattle, WA, USA"],"affiliations":[{"raw_affiliation_string":"Meta, Seattle, WA, USA","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5089590312","display_name":"Divya Mahajan","orcid":"https://orcid.org/0009-0007-8184-0528"},"institutions":[{"id":"https://openalex.org/I130701444","display_name":"Georgia Institute of Technology","ror":"https://ror.org/01zkghx44","country_code":"US","type":"education","lineage":["https://openalex.org/I130701444"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Divya Mahajan","raw_affiliation_strings":["Georgia Institute of Technology, Atlanta, GA, USA"],"affiliations":[{"raw_affiliation_string":"Georgia Institute of Technology, Atlanta, GA, USA","institution_ids":["https://openalex.org/I130701444"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5061941075"],"corresponding_institution_ids":["https://openalex.org/I130701444"],"apc_list":null,"apc_paid":null,"fwci":36.6632,"has_fulltext":false,"cited_by_count":15,"citation_normalized_percentile":{"value":0.99820353,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":97,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"493","last_page":"508"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9983000159263611,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.996399998664856,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.775164008140564},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.6936302185058594},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.6500076651573181},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.637366533279419},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.607830286026001},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.4967084527015686}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.775164008140564},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.6936302185058594},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.6500076651573181},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.637366533279419},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.607830286026001},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4967084527015686},{"id":"https://openalex.org/C153294291","wikidata":"https://www.wikidata.org/wiki/Q25261","display_name":"Meteorology","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3669940.3707265","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3669940.3707265","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 1","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3669940.3707265","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3669940.3707265","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 1","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":17,"referenced_works":["https://openalex.org/W2002555321","https://openalex.org/W2038666141","https://openalex.org/W2152513418","https://openalex.org/W2167334577","https://openalex.org/W2239144794","https://openalex.org/W2612541187","https://openalex.org/W2618530766","https://openalex.org/W2625697499","https://openalex.org/W2883929540","https://openalex.org/W2930604630","https://openalex.org/W2945146780","https://openalex.org/W2969388332","https://openalex.org/W2980200167","https://openalex.org/W4242577057","https://openalex.org/W4281637935","https://openalex.org/W4380874786","https://openalex.org/W4388031315"],"related_works":["https://openalex.org/W2731899572","https://openalex.org/W2961085424","https://openalex.org/W3215138031","https://openalex.org/W4306674287","https://openalex.org/W3009238340","https://openalex.org/W4360585206","https://openalex.org/W4321369474","https://openalex.org/W4285208911","https://openalex.org/W4387369504","https://openalex.org/W3046775127"],"abstract_inverted_index":{"Deep":[0],"learning":[1,129,208,217,242,265,285],"kernels":[2],"exhibit":[3],"a":[4,116,124,206,211,215,239,263,281],"high":[5,180],"level":[6],"of":[7,43,53,59,90,123,127,143,167,201,205,283,307],"predictable":[8],"memory":[9],"accesses":[10],"and":[11,23,46,61,102,134,158,190,267,287,299,312,325],"compute":[12],"patterns,":[13],"making":[14],"GPU's":[15],"architecture":[16],"well-suited":[17],"for":[18,26,131,310],"their":[19],"execution.":[20],"Moreover,":[21],"software":[22,159],"runtime":[24],"system":[25],"GPUs":[27,62,80],"further":[28],"enable":[29],"optimizations":[30,161],"that":[31,172],"aim":[32],"to":[33,73,78,119,162,196,269,301,318,330],"better":[34],"utilize":[35],"the":[36,51,56,74,88,121,144,148,164,198,203,224,231,256,271,288,294,305],"stream":[37],"multiprocessors,":[38],"on-chip":[39],"bandwidth,":[40],"multiple":[41],"levels":[42],"cache":[44],"hierarchy,":[45],"off-chip":[47],"high-bandwidth":[48],"memory.":[49],"In":[50],"context":[52],"deep":[54,128,207,241,284],"learning,":[55],"entire":[57],"space":[58],"models":[60,68,98,189],"is":[63,81],"constantly":[64],"evolving,":[65],"as":[66,193],"newer":[67,79],"emerge":[69],"with":[70,220],"simultaneous":[71],"upgrades":[72],"device.":[75],"However,":[76],"access":[77],"often":[82],"limited,":[83],"raising":[84],"important":[85],"questions":[86],"about":[87],"performance":[89,122,166,186,235],"new":[91,100,103,107,191],"model":[92,104,146,197,309],"architectures":[93,105],"on":[94,99,106,136,147,187,210,255,314],"existing":[95,97],"GPUs,":[96,101,138,192],"GPUs.":[108,291],"To":[109],"address":[110],"these":[111,168],"questions,":[112],"we":[113,222],"introduce":[114],"NeuSight,":[115,221],"forecasting":[117,185],"framework":[118,152],"predict":[120],"diverse":[125],"range":[126],"models,":[130],"both":[132,154],"training":[133,311],"inference,":[135],"unseen":[137,188],"without":[139],"requiring":[140],"actual":[141],"execution":[142],"target":[145,149],"GPU.":[150,257],"The":[151],"leverages":[153],"GPU":[155,212],"hardware":[156],"behavior":[157],"library":[160],"estimate":[163,270],"end-to-end":[165,272],"models.":[169],"We":[170],"observe":[171],"prior":[173,278,320],"work":[174,279],"in":[175,303,316],"this":[176],"area":[177],"suffers":[178],"from":[179,297],"absolute":[181],"error":[182,296],"percentages":[183],"when":[184],"they":[194],"attempt":[195],"complex":[199],"task":[200],"predicting":[202,304],"latency":[204,306],"kernel":[209,243],"directly":[213],"using":[214,262],"machine":[216,264],"approach.":[218],"Instead,":[219],"decompose":[223],"prediction":[225,232,244],"into":[226,245],"smaller":[227,246],"problems,":[228],"while":[229],"bounding":[230],"through":[233],"fundamental":[234],"laws.":[236],"NeuSight":[237,276],"decomposes":[238],"single":[240],"working":[247],"sets":[248],"called":[249],"tiles,":[250],"which":[251],"are":[252,260],"executed":[253],"independently":[254],"Tile-granularity":[258],"predictions":[259],"determined":[261],"approach":[266],"aggregated":[268],"latency.":[273],"As":[274],"such,":[275],"outperforms":[277],"across":[280],"variety":[282],"workloads":[286],"most":[289],"up-to-date":[290],"It":[292],"reduces":[293],"percentage":[295],"121.4%":[298],"30.8%":[300],"2.3%":[302],"GPT3":[308,324],"inference":[313],"H100,":[315],"comparison":[317],"state-of-the-art":[319],"work,":[321],"respectively,":[322],"where":[323],"H100":[326],"were":[327],"not":[328],"used":[329],"train":[331],"any":[332],"framework.":[333]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":14}],"updated_date":"2026-03-12T08:34:05.389933","created_date":"2025-10-10T00:00:00"}
