{"id":"https://openalex.org/W4409248577","doi":"https://doi.org/10.1109/hpca61900.2025.00125","title":"NearFetch: Saving Inter-Module Bandwidth in Many-Chip-Module GPUs","display_name":"NearFetch: Saving Inter-Module Bandwidth in Many-Chip-Module GPUs","publication_year":2025,"publication_date":"2025-03-01","ids":{"openalex":"https://openalex.org/W4409248577","doi":"https://doi.org/10.1109/hpca61900.2025.00125"},"language":"en","primary_location":{"id":"doi:10.1109/hpca61900.2025.00125","is_oa":false,"landing_page_url":"https://doi.org/10.1109/hpca61900.2025.00125","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Symposium on High Performance Computer Architecture (HPCA)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5072444630","display_name":"Xia Zhao","orcid":"https://orcid.org/0000-0001-6479-9200"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Xia Zhao","raw_affiliation_strings":["Defense Innovation Institute,Academy of Military Science"],"affiliations":[{"raw_affiliation_string":"Defense Innovation Institute,Academy of Military Science","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5049705266","display_name":"Guangda Zhang","orcid":"https://orcid.org/0000-0003-4732-9674"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guangda Zhang","raw_affiliation_strings":["Defense Innovation Institute,Academy of Military Science"],"affiliations":[{"raw_affiliation_string":"Defense Innovation Institute,Academy of Military Science","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100364462","display_name":"Lu Wang","orcid":"https://orcid.org/0000-0002-5881-6139"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lu Wang","raw_affiliation_strings":["Defense Innovation Institute,Academy of Military Science"],"affiliations":[{"raw_affiliation_string":"Defense Innovation Institute,Academy of Military Science","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101891026","display_name":"Shiqing Zhang","orcid":"https://orcid.org/0000-0002-6690-3718"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shiqing Zhang","raw_affiliation_strings":["Defense Innovation Institute,Academy of Military Science"],"affiliations":[{"raw_affiliation_string":"Defense Innovation Institute,Academy of Military Science","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5112124785","display_name":"Huadong Dai","orcid":"https://orcid.org/0000-0002-9722-5454"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huadong Dai","raw_affiliation_strings":["Defense Innovation Institute,Academy of Military Science"],"affiliations":[{"raw_affiliation_string":"Defense Innovation Institute,Academy of Military Science","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5072444630"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":5.1163,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.93700202,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":91,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1693","last_page":"1706"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9150999784469604,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9150999784469604,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7315573692321777},{"id":"https://openalex.org/keywords/bandwidth","display_name":"Bandwidth (computing)","score":0.6032143831253052},{"id":"https://openalex.org/keywords/chip","display_name":"Chip","score":0.5228571891784668},{"id":"https://openalex.org/keywords/system-on-a-chip","display_name":"System on a chip","score":0.46493077278137207},{"id":"https://openalex.org/keywords/embedded-system","display_name":"Embedded system","score":0.43247929215431213},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.3954419493675232},{"id":"https://openalex.org/keywords/computer-architecture","display_name":"Computer architecture","score":0.37895432114601135},{"id":"https://openalex.org/keywords/computer-network","display_name":"Computer network","score":0.14540129899978638},{"id":"https://openalex.org/keywords/telecommunications","display_name":"Telecommunications","score":0.08627656102180481}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7315573692321777},{"id":"https://openalex.org/C2776257435","wikidata":"https://www.wikidata.org/wiki/Q1576430","display_name":"Bandwidth (computing)","level":2,"score":0.6032143831253052},{"id":"https://openalex.org/C165005293","wikidata":"https://www.wikidata.org/wiki/Q1074500","display_name":"Chip","level":2,"score":0.5228571891784668},{"id":"https://openalex.org/C118021083","wikidata":"https://www.wikidata.org/wiki/Q610398","display_name":"System on a chip","level":2,"score":0.46493077278137207},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.43247929215431213},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.3954419493675232},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.37895432114601135},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.14540129899978638},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.08627656102180481}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/hpca61900.2025.00125","is_oa":false,"landing_page_url":"https://doi.org/10.1109/hpca61900.2025.00125","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Symposium on High Performance Computer Architecture (HPCA)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Affordable and clean energy","score":0.8100000023841858,"id":"https://metadata.un.org/sdg/7"}],"awards":[],"funders":[{"id":"https://openalex.org/F4320326444","display_name":"Nova","ror":null}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":39,"referenced_works":["https://openalex.org/W1979527452","https://openalex.org/W2034861439","https://openalex.org/W2080592089","https://openalex.org/W2093043622","https://openalex.org/W2118068686","https://openalex.org/W2128120785","https://openalex.org/W2157802978","https://openalex.org/W2401377454","https://openalex.org/W2416722775","https://openalex.org/W2441593292","https://openalex.org/W2612695082","https://openalex.org/W2625200202","https://openalex.org/W2626312854","https://openalex.org/W2761710529","https://openalex.org/W2795081729","https://openalex.org/W2883882491","https://openalex.org/W2884108789","https://openalex.org/W2886156724","https://openalex.org/W2903659818","https://openalex.org/W2929862812","https://openalex.org/W2933483662","https://openalex.org/W2980104813","https://openalex.org/W2984139344","https://openalex.org/W3017188964","https://openalex.org/W3017302221","https://openalex.org/W3089469538","https://openalex.org/W3089681336","https://openalex.org/W3099807387","https://openalex.org/W3100877701","https://openalex.org/W3102510044","https://openalex.org/W3205535571","https://openalex.org/W4254918363","https://openalex.org/W4280589640","https://openalex.org/W4297097318","https://openalex.org/W4297097426","https://openalex.org/W4308083836","https://openalex.org/W4318541666","https://openalex.org/W4390415044","https://openalex.org/W6764609847"],"related_works":["https://openalex.org/W2065289416","https://openalex.org/W2017236304","https://openalex.org/W3142211975","https://openalex.org/W2115579119","https://openalex.org/W2136854845","https://openalex.org/W1879443270","https://openalex.org/W2018912978","https://openalex.org/W2130914040","https://openalex.org/W2119122672","https://openalex.org/W4292904049"],"abstract_inverted_index":{"As":[0],"Graphics":[1],"Processing":[2,56],"Units":[3,57],"(GPUs)":[4],"face":[5],"increasing":[6],"computing":[7],"demands":[8],"that":[9,100,178,299],"surpass":[10],"single-module":[11],"capabilities":[12],"due":[13,174],"to":[14,68,83,125,169,175,189,273,319],"transistor":[15],"scaling":[16],"and":[17,149,159,229,308],"lithography":[18],"constraints,":[19,85],"the":[20,24,36,87,116,138,151,157,176,182,190,221,242,257,271,285],"necessity":[21],"for":[22,145,204,240,246,325],"expanding":[23],"module":[25,123,193],"count":[26,154],"within":[27,284],"GPUs":[28,42,62,80,108],"grows.":[29],"This":[30,77],"escalation":[31],"faces":[32],"a":[33,166,213,225,230,252],"significant":[34],"challenge:":[35],"total":[37],"inter-module":[38,92,267],"bandwidth":[39,84,93,112,120,147,268],"in":[40,48,106,224,249],"many-chip-module":[41,107,288],"is":[43],"limited":[44],"by":[45,303,311],"manufacturing":[46],"constraints":[47],"organic":[49],"substrates":[50],"or":[51],"silicon":[52],"interposers.":[53],"Unlike":[54],"Central":[55],"(CPUs),":[58],"which":[59,114],"are":[60,185],"latency-sensitive,":[61],"leverage":[63],"their":[64],"high":[65],"thread-level":[66],"parallelism":[67],"effectively":[69],"hide":[70],"memory":[71,105,243,278],"access":[72],"latency":[73],"through":[74],"simultaneous":[75],"multithreading.":[76],"attribute":[78],"makes":[79],"inherently":[81],"sensitive":[82],"making":[86],"efficient":[88],"exploitation":[89],"of":[90,202,209,251,282,287,295,314],"available":[91],"important.":[94],"In":[95],"this":[96,134],"paper,":[97],"we":[98,136,163],"identify":[99],"fetching":[101,126],"data":[102,119,127,203,214,218,222,253,258,275,296],"from":[103,128,276],"faraway":[104],"can":[109,198],"easily":[110],"cause":[111],"contention":[113],"degrades":[115],"real":[117],"achieved":[118],"per":[121,141],"GPU":[122,179,192,196,227,262],"compared":[124],"nearby":[129],"memory.":[130],"To":[131],"further":[132],"analyze":[133],"problem,":[135],"introduce":[137],"Inter-Module":[139],"Bandwidth":[140],"Access":[142],"(IBPA)":[143],"metric":[144],"quantifying":[146],"usage":[148],"finding":[150],"network":[152,160],"hop":[153],"directly":[155],"impacts":[156],"IBPA":[158,302],"contention.":[161],"Next,":[162],"propose":[164],"NearFetch,":[165],"routing-based":[167],"solution":[168],"reduce":[170],"IBPA.":[171],"NearFetch":[172,207,264,283],"works":[173],"fact":[177],"modules":[180,197],"along":[181],"routing":[183],"path":[184],"typically":[186],"much":[187],"closer":[188],"source":[191],"while":[194],"these":[195],"supply":[199],"$29.1":[200],"\\%$":[201,307,316,323],"high-sharing":[205,326],"applications.":[206],"consists":[208],"two":[210],"primary":[211],"components:":[212],"forwarding":[215,219],"scheme,":[216,238],"enabling":[217],"when":[220],"resides":[223],"remote":[226],"module,":[228],"topology-aware":[231],"Miss":[232],"Status":[233],"Handling":[234],"Register":[235],"(MSHR)":[236],"coalescing":[237],"responsible":[239],"recording":[241],"address":[244],"information":[245],"future":[247],"use":[248],"case":[250],"miss.":[254],"By":[255],"leveraging":[256],"locality":[259],"among":[260],"various":[261],"modules,":[263],"substantially":[265],"minimizes":[266],"usage,":[269],"eliminating":[270],"need":[272],"fetch":[274],"distant":[277],"partitions.":[279],"Our":[280],"evaluation":[281],"context":[286],"GPUs,":[289],"across":[290],"applications":[291],"exhibiting":[292],"diverse":[293],"degrees":[294],"locality,":[297],"reveals":[298],"it":[300],"reduces":[301],"$4":[304],"2.":[305],"6":[306],"enhances":[309],"performance":[310],"an":[312],"average":[313],"$52.2":[315],"(with":[317],"up":[318],"$9":[320],"8.":[321],"1":[322],"improvement)":[324],"workloads.":[327]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":1}],"updated_date":"2026-03-06T13:50:29.536080","created_date":"2025-10-10T00:00:00"}
