{"id":"https://openalex.org/W2082000934","doi":"https://doi.org/10.1109/hpca.2015.7056046","title":"Unlocking bandwidth for GPUs in CC-NUMA systems","display_name":"Unlocking bandwidth for GPUs in CC-NUMA systems","publication_year":2015,"publication_date":"2015-02-01","ids":{"openalex":"https://openalex.org/W2082000934","doi":"https://doi.org/10.1109/hpca.2015.7056046","mag":"2082000934"},"language":"en","primary_location":{"id":"doi:10.1109/hpca.2015.7056046","is_oa":false,"landing_page_url":"https://doi.org/10.1109/hpca.2015.7056046","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2015 IEEE 21st International Symposium on High Performance Computer Architecture (HPCA)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5034317620","display_name":"Neha Agarwal","orcid":"https://orcid.org/0000-0002-9029-4166"},"institutions":[{"id":"https://openalex.org/I27837315","display_name":"University of Michigan","ror":"https://ror.org/00jmfr291","country_code":"US","type":"education","lineage":["https://openalex.org/I27837315"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Neha Agarwal","raw_affiliation_strings":["University of Michigan","(University of Michigan)"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Michigan","institution_ids":["https://openalex.org/I27837315"]},{"raw_affiliation_string":"(University of Michigan)","institution_ids":["https://openalex.org/I27837315"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5031781240","display_name":"David Nellans","orcid":"https://orcid.org/0000-0001-5203-8367"},"institutions":[{"id":"https://openalex.org/I1304085615","display_name":"Nvidia (United Kingdom)","ror":"https://ror.org/02kr42612","country_code":"GB","type":"company","lineage":["https://openalex.org/I1304085615","https://openalex.org/I4210127875"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"David Nellans","raw_affiliation_strings":["NVIDIA","Nvidia"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]},{"raw_affiliation_string":"Nvidia","institution_ids":["https://openalex.org/I1304085615"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5055699570","display_name":"Mike O\u2019Connor","orcid":"https://orcid.org/0000-0003-0944-2393"},"institutions":[{"id":"https://openalex.org/I1304085615","display_name":"Nvidia (United Kingdom)","ror":"https://ror.org/02kr42612","country_code":"GB","type":"company","lineage":["https://openalex.org/I1304085615","https://openalex.org/I4210127875"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Mike O'Connor","raw_affiliation_strings":["NVIDIA","Nvidia"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]},{"raw_affiliation_string":"Nvidia","institution_ids":["https://openalex.org/I1304085615"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5063354509","display_name":"Stephen W. Keckler","orcid":"https://orcid.org/0000-0001-6701-6099"},"institutions":[{"id":"https://openalex.org/I1304085615","display_name":"Nvidia (United Kingdom)","ror":"https://ror.org/02kr42612","country_code":"GB","type":"company","lineage":["https://openalex.org/I1304085615","https://openalex.org/I4210127875"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Stephen W. Keckler","raw_affiliation_strings":["NVIDIA","Nvidia"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]},{"raw_affiliation_string":"Nvidia","institution_ids":["https://openalex.org/I1304085615"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5018949021","display_name":"Thomas F. Wenisch","orcid":"https://orcid.org/0000-0001-9560-2124"},"institutions":[{"id":"https://openalex.org/I27837315","display_name":"University of Michigan","ror":"https://ror.org/00jmfr291","country_code":"US","type":"education","lineage":["https://openalex.org/I27837315"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Thomas F. Wenisch","raw_affiliation_strings":["NVIDIA","(University of Michigan)"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]},{"raw_affiliation_string":"(University of Michigan)","institution_ids":["https://openalex.org/I27837315"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":14.8246,"has_fulltext":false,"cited_by_count":81,"citation_normalized_percentile":{"value":0.9933506,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":95,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"354","last_page":"365"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.9986000061035156,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8964051008224487},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.6368442177772522},{"id":"https://openalex.org/keywords/memory-bandwidth","display_name":"Memory bandwidth","score":0.6238876581192017},{"id":"https://openalex.org/keywords/cache-coherence","display_name":"Cache coherence","score":0.5206036567687988},{"id":"https://openalex.org/keywords/virtual-memory","display_name":"Virtual memory","score":0.5136436820030212},{"id":"https://openalex.org/keywords/cache","display_name":"Cache","score":0.49220600724220276},{"id":"https://openalex.org/keywords/non-uniform-memory-access","display_name":"Non-uniform memory access","score":0.4754338264465332},{"id":"https://openalex.org/keywords/programmer","display_name":"Programmer","score":0.45881587266921997},{"id":"https://openalex.org/keywords/memory-management","display_name":"Memory management","score":0.44034236669540405},{"id":"https://openalex.org/keywords/cpu-cache","display_name":"CPU cache","score":0.42448610067367554},{"id":"https://openalex.org/keywords/software","display_name":"Software","score":0.4119240641593933},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.4099680185317993},{"id":"https://openalex.org/keywords/cache-coloring","display_name":"Cache coloring","score":0.23782271146774292},{"id":"https://openalex.org/keywords/cache-algorithms","display_name":"Cache algorithms","score":0.21804237365722656},{"id":"https://openalex.org/keywords/overlay","display_name":"Overlay","score":0.19927895069122314}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8964051008224487},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.6368442177772522},{"id":"https://openalex.org/C188045654","wikidata":"https://www.wikidata.org/wiki/Q17148339","display_name":"Memory bandwidth","level":2,"score":0.6238876581192017},{"id":"https://openalex.org/C141917322","wikidata":"https://www.wikidata.org/wiki/Q1025017","display_name":"Cache coherence","level":5,"score":0.5206036567687988},{"id":"https://openalex.org/C76399640","wikidata":"https://www.wikidata.org/wiki/Q189401","display_name":"Virtual memory","level":4,"score":0.5136436820030212},{"id":"https://openalex.org/C115537543","wikidata":"https://www.wikidata.org/wiki/Q165596","display_name":"Cache","level":2,"score":0.49220600724220276},{"id":"https://openalex.org/C133371097","wikidata":"https://www.wikidata.org/wiki/Q868014","display_name":"Non-uniform memory access","level":5,"score":0.4754338264465332},{"id":"https://openalex.org/C2778514511","wikidata":"https://www.wikidata.org/wiki/Q1374194","display_name":"Programmer","level":2,"score":0.45881587266921997},{"id":"https://openalex.org/C176649486","wikidata":"https://www.wikidata.org/wiki/Q2308807","display_name":"Memory management","level":3,"score":0.44034236669540405},{"id":"https://openalex.org/C189783530","wikidata":"https://www.wikidata.org/wiki/Q352090","display_name":"CPU cache","level":3,"score":0.42448610067367554},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.4119240641593933},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.4099680185317993},{"id":"https://openalex.org/C201148951","wikidata":"https://www.wikidata.org/wiki/Q5015976","display_name":"Cache coloring","level":4,"score":0.23782271146774292},{"id":"https://openalex.org/C38556500","wikidata":"https://www.wikidata.org/wiki/Q13404475","display_name":"Cache algorithms","level":4,"score":0.21804237365722656},{"id":"https://openalex.org/C136085584","wikidata":"https://www.wikidata.org/wiki/Q910289","display_name":"Overlay","level":2,"score":0.19927895069122314}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/hpca.2015.7056046","is_oa":false,"landing_page_url":"https://doi.org/10.1109/hpca.2015.7056046","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2015 IEEE 21st International Symposium on High Performance Computer Architecture (HPCA)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"},{"id":"https://openalex.org/F4320306084","display_name":"U.S. Department of Energy","ror":"https://ror.org/01bj3aw27"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":45,"referenced_works":["https://openalex.org/W48345810","https://openalex.org/W1564523715","https://openalex.org/W1979306141","https://openalex.org/W1979527452","https://openalex.org/W1985539519","https://openalex.org/W1990962327","https://openalex.org/W1994023216","https://openalex.org/W1994717635","https://openalex.org/W2020733012","https://openalex.org/W2029764709","https://openalex.org/W2037291093","https://openalex.org/W2047390994","https://openalex.org/W2059290792","https://openalex.org/W2062430565","https://openalex.org/W2077076195","https://openalex.org/W2080592089","https://openalex.org/W2095954861","https://openalex.org/W2097773861","https://openalex.org/W2098278566","https://openalex.org/W2100926301","https://openalex.org/W2102843684","https://openalex.org/W2104305170","https://openalex.org/W2106342588","https://openalex.org/W2119473230","https://openalex.org/W2123815799","https://openalex.org/W2140455011","https://openalex.org/W2147926533","https://openalex.org/W2156299749","https://openalex.org/W2157802978","https://openalex.org/W2164645802","https://openalex.org/W2167414164","https://openalex.org/W2169665207","https://openalex.org/W2170666978","https://openalex.org/W2175443198","https://openalex.org/W2237817213","https://openalex.org/W2337228275","https://openalex.org/W3112651258","https://openalex.org/W3138617740","https://openalex.org/W4240262711","https://openalex.org/W4253494365","https://openalex.org/W6601941235","https://openalex.org/W6633941944","https://openalex.org/W6684730004","https://openalex.org/W6690040212","https://openalex.org/W6703177460"],"related_works":["https://openalex.org/W2113103358","https://openalex.org/W1558545464","https://openalex.org/W146324612","https://openalex.org/W1485698457","https://openalex.org/W1579918296","https://openalex.org/W1555268760","https://openalex.org/W1995350263","https://openalex.org/W2517309779","https://openalex.org/W2082000934","https://openalex.org/W1712297789"],"abstract_inverted_index":{"Historically,":[0],"GPU-based":[1],"HPC":[2],"applications":[3],"have":[4,53],"had":[5],"a":[6,26,45,144,149,173],"substantial":[7],"memory":[8,42,69,101,160,208,223],"bandwidth":[9,164],"advantage":[10],"over":[11],"CPU-based":[12],"workloads":[13],"due":[14],"to":[15,70,123,157,168,197],"using":[16,119,152,203],"GDDR":[17],"rather":[18],"than":[19,193],"DDR":[20],"memory.":[21],"However,":[22],"past":[23],"GPUs":[24,52],"required":[25,167],"restricted":[27],"programming":[28],"model":[29],"where":[30,83],"application":[31],"data":[32],"was":[33],"allocated":[34],"up":[35],"front":[36],"and":[37,57,67,91,110,147,206,210],"explicitly":[38],"copied":[39],"into":[40],"GPU":[41,46,68,98,129,198,207],"before":[43],"launching":[44],"kernel":[47],"by":[48,187,201],"the":[49,75,97,108,125,194,221],"programmer.":[50],"Recently,":[51],"eased":[54],"this":[55,104],"requirement":[56],"now":[58],"can":[59,94],"employ":[60],"on-demand":[61],"software":[62,84,120,174],"page":[63,85,121,134,139,216],"migration":[64,86,122,135],"between":[65],"CPU":[66,100,196,205],"obviate":[71],"explicit":[72],"copying.":[73],"In":[74,103],"near":[76],"future,":[77],"CC-NUMA":[78],"GPU-CPU":[79],"systems":[80],"will":[81],"appear":[82],"is":[87,166],"an":[88],"optional":[89],"choice":[90],"hardware":[92,115,179],"cache-coherence":[93,116],"also":[95],"support":[96,180],"accessing":[99],"directly.":[102],"work,":[105],"we":[106],"describe":[107],"trade-offs":[109],"considerations":[111],"in":[112],"relying":[113],"on":[114,138,182],"mechanisms":[117],"versus":[118],"optimize":[124],"performance":[126],"of":[127,214,225],"memory-intensive":[128],"workloads.":[130],"We":[131,171],"show":[132],"that":[133,148],"decisions":[136],"based":[137],"access":[140],"frequency":[141],"alone":[142],"are":[143],"poor":[145],"solution":[146,151],"broader":[150],"virtual":[153],"address-based":[154],"program":[155],"locality":[156],"enable":[158],"aggressive":[159],"prefetching":[161],"combined":[162],"with":[163],"balancing":[165],"maximize":[169],"performance.":[170],"present":[172],"runtime":[175],"system":[176],"requiring":[177],"minimal":[178],"that,":[181],"average,":[183],"outperforms":[184],"CC-NUMA-based":[185],"accesses":[186],"1.95":[188],"\u00d7,":[189],"performs":[190],"6%":[191],"better":[192],"legacy":[195],"memcpy":[199],"regime":[200],"intelligently":[202],"both":[204],"bandwidth,":[209],"comes":[211],"within":[212],"28%":[213],"oracular":[215],"placement,":[217],"all":[218],"while":[219],"maintaining":[220],"relaxed":[222],"semantics":[224],"modern":[226],"GPUs.":[227]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":8},{"year":2023,"cited_by_count":3},{"year":2022,"cited_by_count":3},{"year":2021,"cited_by_count":4},{"year":2020,"cited_by_count":7},{"year":2019,"cited_by_count":8},{"year":2018,"cited_by_count":9},{"year":2017,"cited_by_count":20},{"year":2016,"cited_by_count":12},{"year":2015,"cited_by_count":4}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
