{"id":"https://openalex.org/W4360831776","doi":"https://doi.org/10.1109/hpca56546.2023.10070949","title":"FinePack: Transparently Improving the Efficiency of Fine-Grained Transfers in Multi-GPU Systems","display_name":"FinePack: Transparently Improving the Efficiency of Fine-Grained Transfers in Multi-GPU Systems","publication_year":2023,"publication_date":"2023-02-01","ids":{"openalex":"https://openalex.org/W4360831776","doi":"https://doi.org/10.1109/hpca56546.2023.10070949"},"language":"en","primary_location":{"id":"doi:10.1109/hpca56546.2023.10070949","is_oa":false,"landing_page_url":"https://doi.org/10.1109/hpca56546.2023.10070949","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2023 IEEE International Symposium on High-Performance Computer Architecture (HPCA)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5006855402","display_name":"Harikrishnan Muthukrishnan","orcid":"https://orcid.org/0009-0001-7938-9623"},"institutions":[{"id":"https://openalex.org/I27837315","display_name":"University of Michigan\u2013Ann Arbor","ror":"https://ror.org/00jmfr291","country_code":"US","type":"education","lineage":["https://openalex.org/I27837315"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Harini Muthukrishnan","raw_affiliation_strings":["NVIDIA","University of Michigan"],"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]},{"raw_affiliation_string":"University of Michigan","institution_ids":["https://openalex.org/I27837315"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5050711769","display_name":"Daniel C. Lustig","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Daniel Lustig","raw_affiliation_strings":["NVIDIA"],"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111373927","display_name":"Oreste Villa","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Oreste Villa","raw_affiliation_strings":["NVIDIA"],"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5018949021","display_name":"Thomas F. Wenisch","orcid":"https://orcid.org/0000-0001-9560-2124"},"institutions":[{"id":"https://openalex.org/I27837315","display_name":"University of Michigan\u2013Ann Arbor","ror":"https://ror.org/00jmfr291","country_code":"US","type":"education","lineage":["https://openalex.org/I27837315"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Thomas Wenisch","raw_affiliation_strings":["University of Michigan"],"affiliations":[{"raw_affiliation_string":"University of Michigan","institution_ids":["https://openalex.org/I27837315"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5031781240","display_name":"David Nellans","orcid":"https://orcid.org/0000-0001-5203-8367"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"David Nellans","raw_affiliation_strings":["NVIDIA"],"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5006855402"],"corresponding_institution_ids":["https://openalex.org/I27837315"],"apc_list":null,"apc_paid":null,"fwci":2.7721,"has_fulltext":false,"cited_by_count":9,"citation_normalized_percentile":{"value":0.90201005,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":89,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"516","last_page":"529"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.9976999759674072,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8858336806297302},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.7390718460083008},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.7048074007034302},{"id":"https://openalex.org/keywords/pci-express","display_name":"PCI Express","score":0.5710238814353943},{"id":"https://openalex.org/keywords/cuda","display_name":"CUDA","score":0.48654261231422424},{"id":"https://openalex.org/keywords/programming-paradigm","display_name":"Programming paradigm","score":0.4799066483974457},{"id":"https://openalex.org/keywords/overhead","display_name":"Overhead (engineering)","score":0.44733935594558716},{"id":"https://openalex.org/keywords/embedded-system","display_name":"Embedded system","score":0.29161641001701355},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.21395975351333618},{"id":"https://openalex.org/keywords/field-programmable-gate-array","display_name":"Field-programmable gate array","score":0.13273030519485474}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8858336806297302},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.7390718460083008},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.7048074007034302},{"id":"https://openalex.org/C64270927","wikidata":"https://www.wikidata.org/wiki/Q206924","display_name":"PCI Express","level":3,"score":0.5710238814353943},{"id":"https://openalex.org/C2778119891","wikidata":"https://www.wikidata.org/wiki/Q477690","display_name":"CUDA","level":2,"score":0.48654261231422424},{"id":"https://openalex.org/C34165917","wikidata":"https://www.wikidata.org/wiki/Q188267","display_name":"Programming paradigm","level":2,"score":0.4799066483974457},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.44733935594558716},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.29161641001701355},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.21395975351333618},{"id":"https://openalex.org/C42935608","wikidata":"https://www.wikidata.org/wiki/Q190411","display_name":"Field-programmable gate array","level":2,"score":0.13273030519485474},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/hpca56546.2023.10070949","is_oa":false,"landing_page_url":"https://doi.org/10.1109/hpca56546.2023.10070949","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2023 IEEE International Symposium on High-Performance Computer Architecture (HPCA)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":53,"referenced_works":["https://openalex.org/W1086484114","https://openalex.org/W1497413785","https://openalex.org/W1542643269","https://openalex.org/W1902903569","https://openalex.org/W1979717209","https://openalex.org/W1986239736","https://openalex.org/W1994999558","https://openalex.org/W1997352364","https://openalex.org/W2035080386","https://openalex.org/W2080371694","https://openalex.org/W2082482658","https://openalex.org/W2102843684","https://openalex.org/W2102850792","https://openalex.org/W2119407198","https://openalex.org/W2129290990","https://openalex.org/W2157802978","https://openalex.org/W2159184943","https://openalex.org/W2245726859","https://openalex.org/W2532617160","https://openalex.org/W2546702912","https://openalex.org/W2613380095","https://openalex.org/W2621152531","https://openalex.org/W2624632852","https://openalex.org/W2625200202","https://openalex.org/W2761710529","https://openalex.org/W2767248915","https://openalex.org/W2767308239","https://openalex.org/W2794598778","https://openalex.org/W2903659818","https://openalex.org/W2903901007","https://openalex.org/W2926767350","https://openalex.org/W2945649828","https://openalex.org/W2953357877","https://openalex.org/W2962911728","https://openalex.org/W2971357983","https://openalex.org/W2979340153","https://openalex.org/W3011293047","https://openalex.org/W3017188964","https://openalex.org/W3017302221","https://openalex.org/W3036082746","https://openalex.org/W3037847693","https://openalex.org/W3101708369","https://openalex.org/W3157055696","https://openalex.org/W3189320660","https://openalex.org/W3206003350","https://openalex.org/W4245836145","https://openalex.org/W4280604807","https://openalex.org/W6627141550","https://openalex.org/W6632471700","https://openalex.org/W6649080255","https://openalex.org/W6745833556","https://openalex.org/W6753584990","https://openalex.org/W6767312673"],"related_works":["https://openalex.org/W2161462353","https://openalex.org/W1569389315","https://openalex.org/W2503642292","https://openalex.org/W3120511008","https://openalex.org/W2066208787","https://openalex.org/W4296337698","https://openalex.org/W1997955449","https://openalex.org/W164750744","https://openalex.org/W2292897598","https://openalex.org/W2769126660"],"abstract_inverted_index":{"Recent":[0],"studies":[1],"have":[2,96],"shown":[3],"that":[4,95,113,122,155],"using":[5,46],"fine-grained":[6],"peer-to-peer":[7,116,132,199],"(P2P)":[8],"stores":[9,42,117,200],"to":[10,21,76,80,90,118,164,170,191],"communicate":[11],"among":[12],"devices":[13],"in":[14,54,205],"multi-GPU":[15,55,216],"systems":[16,56],"is":[17,51,161],"a":[18,70,103,131,151,180,187],"promising":[19],"path":[20],"achieve":[22,119],"strong":[23,207,226],"performance":[24,98,209],"scaling.":[25],"In":[26],"many":[27],"irregular":[28],"applications,":[29],"such":[30],"as":[31],"graph":[32],"algorithms":[33],"and":[34,109,146,166,218],"sparse":[35],"linear":[36],"algebra,":[37],"small":[38,67,84,115,148,198],"sub-cache":[39],"line":[40],"(4-32B)":[41],"arise":[43],"naturally":[44],"when":[45],"the":[47,128,138,171,223],"P2P":[48],"paradigm.":[49],"This":[50,203],"particularly":[52],"problematic":[53],"because":[57],"inter-GPU":[58,92],"interconnects":[59],"are":[60],"optimized":[61],"for":[62,197],"bulk":[63,91,124],"transfers":[64,94,125],"rather":[65],"than":[66,212],"operations.":[68],"As":[69],"consequence,":[71],"application":[72],"developers":[73],"either":[74],"resort":[75],"complex":[77],"programming":[78,135,217],"techniques":[79],"work":[81],"around":[82],"this":[83],"transfer":[85],"inefficiency":[86],"or":[87],"fall":[88],"back":[89],"DMA":[93,214],"limited":[97,106],"scalability.":[99],"We":[100,176],"propose":[101],"FinePack,":[102],"set":[104],"of":[105,130,222],"I/O":[107,153],"interconnect":[108,120,190,195],"GPU":[110],"hardware":[111],"enhancements":[112],"enable":[114],"efficiency":[121,196],"rivals":[123],"while":[126],"maintaining":[127],"simplicity":[129],"memory":[133,141,174],"access":[134],"model.":[136],"Exploiting":[137],"GPU\u2019s":[139,172],"weak":[140],"model,":[142],"FinePack":[143,160,178,193],"dynamically":[144],"coalesces":[145],"compresses":[147],"writes":[149],"into":[150],"larger":[152],"message":[154],"reduces":[156],"link-level":[157],"protocol":[158],"overhead.":[159],"fully":[162],"transparent":[163],"software":[165],"requires":[167],"no":[168],"changes":[169],"virtual":[173],"system.":[175],"evaluate":[177],"on":[179,186],"system":[181],"comprising":[182],"4":[183],"Volta":[184],"GPUs":[185],"PCIe":[188],"4.0":[189],"show":[192],"improves":[194],"by":[201],"3\u00d7.":[202],"results":[204],"4-GPU":[206],"scaling":[208,227],"1.4\u00d7":[210],"better":[211],"traditional":[213],"based":[215],"comes":[219],"within":[220],"71%":[221],"maximum":[224],"achievable":[225],"performance.":[228]},"counts_by_year":[{"year":2025,"cited_by_count":6},{"year":2024,"cited_by_count":2},{"year":2023,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
