{"id":"https://openalex.org/W3142875983","doi":"https://doi.org/10.1145/3484505","title":"GPU Domain Specialization via Composable On-Package Architecture","display_name":"GPU Domain Specialization via Composable On-Package Architecture","publication_year":2021,"publication_date":"2021-12-06","ids":{"openalex":"https://openalex.org/W3142875983","doi":"https://doi.org/10.1145/3484505","mag":"3142875983"},"language":"en","primary_location":{"id":"doi:10.1145/3484505","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3484505","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3484505","source":{"id":"https://openalex.org/S26056741","display_name":"ACM Transactions on Architecture and Code Optimization","issn_l":"1544-3566","issn":["1544-3566","1544-3973"],"is_oa":true,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Architecture and Code Optimization","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3484505","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5010195820","display_name":"Yaosheng Fu","orcid":null},"institutions":[{"id":"https://openalex.org/I4210127875","display_name":"Nvidia (United States)","ror":"https://ror.org/03jdj4y14","country_code":"US","type":"company","lineage":["https://openalex.org/I4210127875"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yaosheng Fu","raw_affiliation_strings":["NVIDIA, Santa Clara, CA, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"NVIDIA, Santa Clara, CA, USA","institution_ids":["https://openalex.org/I4210127875"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5034001619","display_name":"Evgeny Bolotin","orcid":null},"institutions":[{"id":"https://openalex.org/I4210127875","display_name":"Nvidia (United States)","ror":"https://ror.org/03jdj4y14","country_code":"US","type":"company","lineage":["https://openalex.org/I4210127875"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Evgeny Bolotin","raw_affiliation_strings":["NVIDIA, Santa Clara, CA, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"NVIDIA, Santa Clara, CA, USA","institution_ids":["https://openalex.org/I4210127875"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039009740","display_name":"Niladrish Chatterjee","orcid":null},"institutions":[{"id":"https://openalex.org/I4210127875","display_name":"Nvidia (United States)","ror":"https://ror.org/03jdj4y14","country_code":"US","type":"company","lineage":["https://openalex.org/I4210127875"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Niladrish Chatterjee","raw_affiliation_strings":["NVIDIA, Santa Clara, CA, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"NVIDIA, Santa Clara, CA, USA","institution_ids":["https://openalex.org/I4210127875"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5031781240","display_name":"David Nellans","orcid":"https://orcid.org/0000-0001-5203-8367"},"institutions":[{"id":"https://openalex.org/I4210127875","display_name":"Nvidia (United States)","ror":"https://ror.org/03jdj4y14","country_code":"US","type":"company","lineage":["https://openalex.org/I4210127875"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"David Nellans","raw_affiliation_strings":["NVIDIA, Santa Clara, CA, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"NVIDIA, Santa Clara, CA, USA","institution_ids":["https://openalex.org/I4210127875"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5063354509","display_name":"Stephen W. Keckler","orcid":"https://orcid.org/0000-0001-6701-6099"},"institutions":[{"id":"https://openalex.org/I4210127875","display_name":"Nvidia (United States)","ror":"https://ror.org/03jdj4y14","country_code":"US","type":"company","lineage":["https://openalex.org/I4210127875"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Stephen W. Keckler","raw_affiliation_strings":["NVIDIA, Santa Clara, CA, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"NVIDIA, Santa Clara, CA, USA","institution_ids":["https://openalex.org/I4210127875"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.7119,"has_fulltext":true,"cited_by_count":10,"citation_normalized_percentile":{"value":0.69027772,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":94,"max":98},"biblio":{"volume":"19","issue":"1","first_page":"1","last_page":"23"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12808","display_name":"Ferroelectric and Negative Capacitance Devices","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12808","display_name":"Ferroelectric and Negative Capacitance Devices","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10502","display_name":"Advanced Memory and Neural Computing","score":0.9986000061035156,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9983999729156494,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8683769702911377},{"id":"https://openalex.org/keywords/memory-bandwidth","display_name":"Memory bandwidth","score":0.724843442440033},{"id":"https://openalex.org/keywords/dram","display_name":"Dram","score":0.6279135346412659},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.5911833047866821},{"id":"https://openalex.org/keywords/computer-architecture","display_name":"Computer architecture","score":0.4975414574146271},{"id":"https://openalex.org/keywords/throughput","display_name":"Throughput","score":0.4754352569580078},{"id":"https://openalex.org/keywords/cache","display_name":"Cache","score":0.4754219353199005},{"id":"https://openalex.org/keywords/supercomputer","display_name":"Supercomputer","score":0.42049503326416016},{"id":"https://openalex.org/keywords/modular-design","display_name":"Modular design","score":0.41290855407714844},{"id":"https://openalex.org/keywords/embedded-system","display_name":"Embedded system","score":0.3239218592643738},{"id":"https://openalex.org/keywords/computer-hardware","display_name":"Computer hardware","score":0.19227632880210876},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.1476910412311554}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8683769702911377},{"id":"https://openalex.org/C188045654","wikidata":"https://www.wikidata.org/wiki/Q17148339","display_name":"Memory bandwidth","level":2,"score":0.724843442440033},{"id":"https://openalex.org/C7366592","wikidata":"https://www.wikidata.org/wiki/Q1255620","display_name":"Dram","level":2,"score":0.6279135346412659},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.5911833047866821},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.4975414574146271},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.4754352569580078},{"id":"https://openalex.org/C115537543","wikidata":"https://www.wikidata.org/wiki/Q165596","display_name":"Cache","level":2,"score":0.4754219353199005},{"id":"https://openalex.org/C83283714","wikidata":"https://www.wikidata.org/wiki/Q121117","display_name":"Supercomputer","level":2,"score":0.42049503326416016},{"id":"https://openalex.org/C101468663","wikidata":"https://www.wikidata.org/wiki/Q1620158","display_name":"Modular design","level":2,"score":0.41290855407714844},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.3239218592643738},{"id":"https://openalex.org/C9390403","wikidata":"https://www.wikidata.org/wiki/Q3966","display_name":"Computer hardware","level":1,"score":0.19227632880210876},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.1476910412311554},{"id":"https://openalex.org/C555944384","wikidata":"https://www.wikidata.org/wiki/Q249","display_name":"Wireless","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3484505","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3484505","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3484505","source":{"id":"https://openalex.org/S26056741","display_name":"ACM Transactions on Architecture and Code Optimization","issn_l":"1544-3566","issn":["1544-3566","1544-3973"],"is_oa":true,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Architecture and Code Optimization","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1145/3484505","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3484505","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3484505","source":{"id":"https://openalex.org/S26056741","display_name":"ACM Transactions on Architecture and Code Optimization","issn_l":"1544-3566","issn":["1544-3566","1544-3973"],"is_oa":true,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Architecture and Code Optimization","raw_type":"journal-article"},"sustainable_development_goals":[{"display_name":"Industry, innovation and infrastructure","score":0.41999998688697815,"id":"https://metadata.un.org/sdg/9"}],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W3142875983.pdf","grobid_xml":"https://content.openalex.org/works/W3142875983.grobid-xml"},"referenced_works_count":65,"referenced_works":["https://openalex.org/W569478347","https://openalex.org/W585658244","https://openalex.org/W1031578623","https://openalex.org/W2024122052","https://openalex.org/W2025516544","https://openalex.org/W2048266589","https://openalex.org/W2065653320","https://openalex.org/W2103325328","https://openalex.org/W2104234755","https://openalex.org/W2124031229","https://openalex.org/W2144172034","https://openalex.org/W2163691666","https://openalex.org/W2193413348","https://openalex.org/W2194775991","https://openalex.org/W2234584938","https://openalex.org/W2267635276","https://openalex.org/W2405920868","https://openalex.org/W2575705757","https://openalex.org/W2583993537","https://openalex.org/W2592428089","https://openalex.org/W2605350416","https://openalex.org/W2606722458","https://openalex.org/W2625200202","https://openalex.org/W2749199255","https://openalex.org/W2760669496","https://openalex.org/W2761710529","https://openalex.org/W2798291715","https://openalex.org/W2802000047","https://openalex.org/W2888422814","https://openalex.org/W2898566692","https://openalex.org/W2903659818","https://openalex.org/W2933483662","https://openalex.org/W2949161920","https://openalex.org/W2949207837","https://openalex.org/W2952232639","https://openalex.org/W2953212265","https://openalex.org/W2963112338","https://openalex.org/W2963341956","https://openalex.org/W2965769762","https://openalex.org/W2969766737","https://openalex.org/W2976081963","https://openalex.org/W2980104813","https://openalex.org/W2992192853","https://openalex.org/W3001279689","https://openalex.org/W3005962614","https://openalex.org/W3015213341","https://openalex.org/W3015590138","https://openalex.org/W3015690986","https://openalex.org/W3016082253","https://openalex.org/W3016212306","https://openalex.org/W3017188964","https://openalex.org/W3036878841","https://openalex.org/W3038103902","https://openalex.org/W3043522163","https://openalex.org/W3043571714","https://openalex.org/W3100944043","https://openalex.org/W3102510044","https://openalex.org/W3150050634","https://openalex.org/W3157055696","https://openalex.org/W3190062760","https://openalex.org/W3197788851","https://openalex.org/W3203992401","https://openalex.org/W4245911027","https://openalex.org/W4248104576","https://openalex.org/W4365806496"],"related_works":["https://openalex.org/W2384867379","https://openalex.org/W3120961607","https://openalex.org/W2329539859","https://openalex.org/W3191490922","https://openalex.org/W2227905990","https://openalex.org/W3148568549","https://openalex.org/W2765823764","https://openalex.org/W3214280620","https://openalex.org/W2794038527","https://openalex.org/W2151092287"],"abstract_inverted_index":{"As":[0],"GPUs":[1,154],"scale":[2],"their":[3],"low-precision":[4],"matrix":[5],"math":[6,19],"throughput":[7,20],"to":[8,33,72,82,91,122,151,162,174],"boost":[9],"deep":[10],"learning":[11],"(DL)":[12],"performance,":[13],"they":[14],"upset":[15],"the":[16,56,78,116,147,157,208],"balance":[17],"between":[18,38],"and":[21,43,131,136,155,166,190,198,203,206],"memory":[22,98],"system":[23,99],"capabilities.":[24],"We":[25,59,104,169],"demonstrate":[26],"that":[27,61,171],"a":[28,62,107,175,179,183],"converged":[29,176],"GPU":[30,69,75,118,177,211],"design":[31,94,149],"trying":[32],"address":[34],"diverging":[35,84],"architectural":[36],"requirements":[37],"FP32":[39],"(or":[40,45],"larger)-based":[41],"HPC":[42],"FP16":[44],"smaller)-based":[46],"DL":[47,164,167],"workloads":[48],"results":[49],"in":[50,215],"sub-optimal":[51],"configurations":[52],"for":[53],"either":[54],"of":[55,115,185,210],"application":[57,102],"domains.":[58],"argue":[60],"C":[63],"omposable":[64],"O":[65],"n-":[66],"PA":[67],"ckage":[68],"(COPA-GPU)":[70],"architecture":[71,119],"provide":[73,161],"domain-specialized":[74],"products":[76,111],"is":[77],"most":[79],"practical":[80],"solution":[81],"these":[83],"requirements.":[85],"A":[86],"COPA-GPU":[87,108,181],"leverages":[88],"multi-chip-module":[89],"disaggregation":[90],"support":[92],"maximal":[93],"reuse,":[95],"along":[96],"with":[97,120],"specialization":[100],"per":[101],"domain.":[103],"show":[105,170],"how":[106],"enables":[109],"DL-specialized":[110],"by":[112,201,213],"modular":[113],"augmentation":[114],"baseline":[117],"up":[121],"4\u00d7":[123],"higher":[124,133,192],"off-die":[125],"bandwidth,":[126],"32\u00d7":[127],"larger":[128,187],"on-package":[129],"cache,":[130],"2.3\u00d7":[132],"DRAM":[134,193],"bandwidth":[135,194],"capacity,":[137],"while":[138],"conveniently":[139],"supporting":[140],"scaled-down":[141],"HPC-oriented":[142],"designs.":[143],"This":[144],"work":[145],"explores":[146],"microarchitectural":[148],"necessary":[150],"enable":[152],"composable":[153],"evaluates":[156],"benefits":[158],"composability":[159],"can":[160],"HPC,":[163],"training,":[165],"inference.":[168],"when":[172],"compared":[173],"design,":[178],"DL-optimized":[180],"featuring":[182],"combination":[184],"16\u00d7":[186],"cache":[188],"capacity":[189],"1.6\u00d7":[191],"scales":[195],"per-GPU":[196],"training":[197,217],"inference":[199],"performance":[200],"31%":[202],"35%,":[204],"respectively,":[205],"reduces":[207],"number":[209],"instances":[212],"50%":[214],"scale-out":[216],"scenarios.":[218]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":3},{"year":2023,"cited_by_count":2},{"year":2022,"cited_by_count":2}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
