{"id":"https://openalex.org/W4320063633","doi":"https://doi.org/10.1145/3559009.3569649","title":"Locality-Aware Optimizations for Improving Remote Memory Latency in Multi-GPU Systems","display_name":"Locality-Aware Optimizations for Improving Remote Memory Latency in Multi-GPU Systems","publication_year":2022,"publication_date":"2022-10-08","ids":{"openalex":"https://openalex.org/W4320063633","doi":"https://doi.org/10.1145/3559009.3569649"},"language":"en","primary_location":{"id":"doi:10.1145/3559009.3569649","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3559009.3569649","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the International Conference on Parallel Architectures and Compilation Techniques","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5029538820","display_name":"Leul Belayneh","orcid":null},"institutions":[{"id":"https://openalex.org/I27837315","display_name":"University of Michigan\u2013Ann Arbor","ror":"https://ror.org/00jmfr291","country_code":"US","type":"education","lineage":["https://openalex.org/I27837315"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Leul Belayneh","raw_affiliation_strings":["University of Michigan"],"affiliations":[{"raw_affiliation_string":"University of Michigan","institution_ids":["https://openalex.org/I27837315"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5028799837","display_name":"Haojie Ye","orcid":"https://orcid.org/0000-0001-5360-5159"},"institutions":[{"id":"https://openalex.org/I27837315","display_name":"University of Michigan\u2013Ann Arbor","ror":"https://ror.org/00jmfr291","country_code":"US","type":"education","lineage":["https://openalex.org/I27837315"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Haojie Ye","raw_affiliation_strings":["University of Michigan"],"affiliations":[{"raw_affiliation_string":"University of Michigan","institution_ids":["https://openalex.org/I27837315"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100714448","display_name":"Kuan-Yu Chen","orcid":"https://orcid.org/0000-0002-4168-6446"},"institutions":[{"id":"https://openalex.org/I27837315","display_name":"University of Michigan\u2013Ann Arbor","ror":"https://ror.org/00jmfr291","country_code":"US","type":"education","lineage":["https://openalex.org/I27837315"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Kuan-Yu Chen","raw_affiliation_strings":["University of Michigan"],"affiliations":[{"raw_affiliation_string":"University of Michigan","institution_ids":["https://openalex.org/I27837315"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5026311377","display_name":"David Blaauw","orcid":"https://orcid.org/0000-0001-6744-7075"},"institutions":[{"id":"https://openalex.org/I27837315","display_name":"University of Michigan\u2013Ann Arbor","ror":"https://ror.org/00jmfr291","country_code":"US","type":"education","lineage":["https://openalex.org/I27837315"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"David Blaauw","raw_affiliation_strings":["University of Michigan"],"affiliations":[{"raw_affiliation_string":"University of Michigan","institution_ids":["https://openalex.org/I27837315"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5037541525","display_name":"Trevor Mudge","orcid":"https://orcid.org/0000-0001-7845-2187"},"institutions":[{"id":"https://openalex.org/I27837315","display_name":"University of Michigan\u2013Ann Arbor","ror":"https://ror.org/00jmfr291","country_code":"US","type":"education","lineage":["https://openalex.org/I27837315"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Trevor Mudge","raw_affiliation_strings":["University of Michigan"],"affiliations":[{"raw_affiliation_string":"University of Michigan","institution_ids":["https://openalex.org/I27837315"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5014250626","display_name":"Ronald Dreslinski","orcid":null},"institutions":[{"id":"https://openalex.org/I27837315","display_name":"University of Michigan\u2013Ann Arbor","ror":"https://ror.org/00jmfr291","country_code":"US","type":"education","lineage":["https://openalex.org/I27837315"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ronald Dreslinski","raw_affiliation_strings":["University of Michigan"],"affiliations":[{"raw_affiliation_string":"University of Michigan","institution_ids":["https://openalex.org/I27837315"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5063661349","display_name":"Nishil Talati","orcid":"https://orcid.org/0000-0002-2457-4119"},"institutions":[{"id":"https://openalex.org/I27837315","display_name":"University of Michigan\u2013Ann Arbor","ror":"https://ror.org/00jmfr291","country_code":"US","type":"education","lineage":["https://openalex.org/I27837315"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Nishil Talati","raw_affiliation_strings":["University of Michigan"],"affiliations":[{"raw_affiliation_string":"University of Michigan","institution_ids":["https://openalex.org/I27837315"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5029538820"],"corresponding_institution_ids":["https://openalex.org/I27837315"],"apc_list":null,"apc_paid":null,"fwci":1.3623,"has_fulltext":false,"cited_by_count":6,"citation_normalized_percentile":{"value":0.8040201,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"304","last_page":"316"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.9962999820709229,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.908491849899292},{"id":"https://openalex.org/keywords/cache","display_name":"Cache","score":0.7535715103149414},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.650196373462677},{"id":"https://openalex.org/keywords/cache-pollution","display_name":"Cache pollution","score":0.6226000785827637},{"id":"https://openalex.org/keywords/locality","display_name":"Locality","score":0.5849761366844177},{"id":"https://openalex.org/keywords/cache-algorithms","display_name":"Cache algorithms","score":0.5699213743209839},{"id":"https://openalex.org/keywords/page-cache","display_name":"Page cache","score":0.5215125679969788},{"id":"https://openalex.org/keywords/cache-coloring","display_name":"Cache coloring","score":0.5056054592132568},{"id":"https://openalex.org/keywords/locality-of-reference","display_name":"Locality of reference","score":0.4603244364261627},{"id":"https://openalex.org/keywords/smart-cache","display_name":"Smart Cache","score":0.42236316204071045},{"id":"https://openalex.org/keywords/bottleneck","display_name":"Bottleneck","score":0.4217662513256073},{"id":"https://openalex.org/keywords/memory-bandwidth","display_name":"Memory bandwidth","score":0.41091787815093994},{"id":"https://openalex.org/keywords/cpu-cache","display_name":"CPU cache","score":0.4022967219352722},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.39479416608810425},{"id":"https://openalex.org/keywords/embedded-system","display_name":"Embedded system","score":0.3356937766075134}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.908491849899292},{"id":"https://openalex.org/C115537543","wikidata":"https://www.wikidata.org/wiki/Q165596","display_name":"Cache","level":2,"score":0.7535715103149414},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.650196373462677},{"id":"https://openalex.org/C113166858","wikidata":"https://www.wikidata.org/wiki/Q5015981","display_name":"Cache pollution","level":5,"score":0.6226000785827637},{"id":"https://openalex.org/C2779808786","wikidata":"https://www.wikidata.org/wiki/Q6664603","display_name":"Locality","level":2,"score":0.5849761366844177},{"id":"https://openalex.org/C38556500","wikidata":"https://www.wikidata.org/wiki/Q13404475","display_name":"Cache algorithms","level":4,"score":0.5699213743209839},{"id":"https://openalex.org/C36340418","wikidata":"https://www.wikidata.org/wiki/Q7124288","display_name":"Page cache","level":5,"score":0.5215125679969788},{"id":"https://openalex.org/C201148951","wikidata":"https://www.wikidata.org/wiki/Q5015976","display_name":"Cache coloring","level":4,"score":0.5056054592132568},{"id":"https://openalex.org/C27602214","wikidata":"https://www.wikidata.org/wiki/Q1868547","display_name":"Locality of reference","level":3,"score":0.4603244364261627},{"id":"https://openalex.org/C167713795","wikidata":"https://www.wikidata.org/wiki/Q352090","display_name":"Smart Cache","level":5,"score":0.42236316204071045},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.4217662513256073},{"id":"https://openalex.org/C188045654","wikidata":"https://www.wikidata.org/wiki/Q17148339","display_name":"Memory bandwidth","level":2,"score":0.41091787815093994},{"id":"https://openalex.org/C189783530","wikidata":"https://www.wikidata.org/wiki/Q352090","display_name":"CPU cache","level":3,"score":0.4022967219352722},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.39479416608810425},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.3356937766075134},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3559009.3569649","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3559009.3569649","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the International Conference on Parallel Architectures and Compilation Techniques","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.5299999713897705,"display_name":"Peace, Justice and strong institutions","id":"https://metadata.un.org/sdg/16"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":36,"referenced_works":["https://openalex.org/W1980136882","https://openalex.org/W1986089612","https://openalex.org/W1989061323","https://openalex.org/W2007511350","https://openalex.org/W2008115889","https://openalex.org/W2018337509","https://openalex.org/W2036953536","https://openalex.org/W2047060659","https://openalex.org/W2082982763","https://openalex.org/W2092406519","https://openalex.org/W2120829734","https://openalex.org/W2149234156","https://openalex.org/W2528784626","https://openalex.org/W2559253174","https://openalex.org/W2566040696","https://openalex.org/W2612387305","https://openalex.org/W2625200202","https://openalex.org/W2730965233","https://openalex.org/W2761710529","https://openalex.org/W2883338591","https://openalex.org/W2883882491","https://openalex.org/W2885000039","https://openalex.org/W2903659818","https://openalex.org/W2910737925","https://openalex.org/W2950969115","https://openalex.org/W2952928793","https://openalex.org/W2984139344","https://openalex.org/W3011293047","https://openalex.org/W3017302221","https://openalex.org/W3036082746","https://openalex.org/W3089681336","https://openalex.org/W3189320660","https://openalex.org/W3195003484","https://openalex.org/W3206003350","https://openalex.org/W4229602664","https://openalex.org/W4235543475"],"related_works":["https://openalex.org/W2133489088","https://openalex.org/W2114386333","https://openalex.org/W2118932116","https://openalex.org/W2363769136","https://openalex.org/W2396934146","https://openalex.org/W2535115842","https://openalex.org/W2148571123","https://openalex.org/W2539712666","https://openalex.org/W1970102182","https://openalex.org/W2029311465"],"abstract_inverted_index":{"With":[0],"generational":[1],"gains":[2],"from":[3,54],"transistor":[4],"scaling,":[5],"GPUs":[6],"have":[7,44,124],"been":[8],"able":[9,28],"to":[10,29,61,89,106,167],"accelerate":[11],"traditional":[12],"computation-intensive":[13],"workloads.":[14,38],"But":[15],"with":[16,212],"the":[17,31,62,80,96,133,138,154,193,199],"obsolescence":[18],"of":[19,36,99,137,149,153,195,217],"Moore's":[20],"Law,":[21],"single":[22],"GPU":[23],"systems":[24,51],"are":[25,52,162],"no":[26],"longer":[27],"satisfy":[30],"computational":[32],"and":[33,112,197],"memory":[34,82,101],"requirements":[35],"emerging":[37],"To":[39],"remedy":[40],"this,":[41],"prior":[42],"works":[43],"proposed":[45],"tightly-coupled":[46],"multi-GPU":[47,50],"systems.":[48],"However,":[49],"hampered":[53],"efficiently":[55,168],"utilizing":[56],"their":[57],"compute":[58],"resources":[59],"due":[60],"Non-Uniform":[63],"Memory":[64],"Access":[65],"(NUMA)":[66],"bottleneck.":[67],"In":[68],"this":[69],"paper,":[70],"we":[71],"propose":[72],"DualOpt,":[73],"a":[74,90,104,146,177,188,209,213],"lightweight":[75],"hardware-only":[76],"solution":[77],"that":[78,127,180],"reduces":[79],"remote":[81,100,150,160,185],"access":[83],"latency":[84],"by":[85,132,206],"delivering":[86],"optimizations":[87],"catered":[88],"workload's":[91],"locality":[92,98,126,183],"profile.":[93],"DualOpt":[94,144,175],"uses":[95],"spatio-temporal":[97,119],"accesses":[102],"as":[103,109,166],"metric":[105],"classify":[107],"workloads":[108,116,123],"cache":[110,135,141,156,179],"insensitive":[111,115,142],"cache-friendly.":[113],"Cache":[114],"exhibit":[117],"low":[118],"locality,":[120],"while":[121],"cache-friendly":[122,173],"ample":[125],"is":[128],"not":[129],"exploited":[130],"well":[131],"conventional":[134,155],"subsystem":[136],"GPU.":[139],"For":[140,172],"workloads,":[143,174],"proposes":[145],"fine-granularity":[147],"transfer":[148],"data":[151,161],"instead":[152],"line":[157],"transfer.":[158],"These":[159],"then":[163],"coalesced":[164],"so":[165],"utilize":[169],"inter-GPU":[170],"bandwidth.":[171],"adds":[176],"remote-only":[178],"can":[181],"exploit":[182],"in":[184],"accesses.":[186],"Finally,":[187],"decision":[189],"engine":[190],"automatically":[191],"identifies":[192],"class":[194],"workload":[196],"delivers":[198],"corresponding":[200],"optimization,":[201],"which":[202],"improves":[203],"overall":[204],"performance":[205],"2.5\u00d7":[207],"on":[208],"4-GPU":[210],"system,":[211],"small":[214],"hardware":[215],"overhead":[216],"0.032%.":[218]},"counts_by_year":[{"year":2025,"cited_by_count":3},{"year":2024,"cited_by_count":2},{"year":2023,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
