{"id":"https://openalex.org/W7133558841","doi":"https://doi.org/10.1109/hpca68181.2026.11408440","title":"QuCo: Efficient and Flexible Hardware-Driven Automatic Configuration of Tile Transfers in GPUs","display_name":"QuCo: Efficient and Flexible Hardware-Driven Automatic Configuration of Tile Transfers in GPUs","publication_year":2026,"publication_date":"2026-01-31","ids":{"openalex":"https://openalex.org/W7133558841","doi":"https://doi.org/10.1109/hpca68181.2026.11408440"},"language":null,"primary_location":{"id":"doi:10.1109/hpca68181.2026.11408440","is_oa":false,"landing_page_url":"https://doi.org/10.1109/hpca68181.2026.11408440","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2026 IEEE International Symposium on High Performance Computer Architecture (HPCA)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5117528024","display_name":"Nicol\u00e1s Meseguer","orcid":"https://orcid.org/0000-0002-2243-0744"},"institutions":[{"id":"https://openalex.org/I80180929","display_name":"Universidad de Murcia","ror":"https://ror.org/03p3aeb86","country_code":"ES","type":"education","lineage":["https://openalex.org/I80180929"]}],"countries":["ES"],"is_corresponding":true,"raw_author_name":"Nicol\u00e1s Meseguer","raw_affiliation_strings":["Universidad de Murcia"],"affiliations":[{"raw_affiliation_string":"Universidad de Murcia","institution_ids":["https://openalex.org/I80180929"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5108149719","display_name":"Daoxuan Xu","orcid":"https://orcid.org/0009-0003-7093-5530"},"institutions":[{"id":"https://openalex.org/I16285277","display_name":"William & Mary","ror":"https://ror.org/03hsf0573","country_code":"US","type":"education","lineage":["https://openalex.org/I16285277"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Daoxuan Xu","raw_affiliation_strings":["William &#x0026; Mary"],"affiliations":[{"raw_affiliation_string":"William &#x0026; Mary","institution_ids":["https://openalex.org/I16285277"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128075756","display_name":"Yifan Sun","orcid":null},"institutions":[{"id":"https://openalex.org/I16285277","display_name":"William & Mary","ror":"https://ror.org/03hsf0573","country_code":"US","type":"education","lineage":["https://openalex.org/I16285277"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yifan Sun","raw_affiliation_strings":["William &#x0026; Mary"],"affiliations":[{"raw_affiliation_string":"William &#x0026; Mary","institution_ids":["https://openalex.org/I16285277"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5009910914","display_name":"Michael Pellauer","orcid":"https://orcid.org/0000-0002-5305-4307"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Michael Pellauer","raw_affiliation_strings":["NVIDIA"],"affiliations":[{"raw_affiliation_string":"NVIDIA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5007832959","display_name":"Jos\u00e9 Luis Abell\u00e1n","orcid":"https://orcid.org/0000-0003-3550-720X"},"institutions":[{"id":"https://openalex.org/I80180929","display_name":"Universidad de Murcia","ror":"https://ror.org/03p3aeb86","country_code":"ES","type":"education","lineage":["https://openalex.org/I80180929"]}],"countries":["ES"],"is_corresponding":false,"raw_author_name":"Jos\u00e9 L. Abell\u00e1n","raw_affiliation_strings":["Universidad de Murcia"],"affiliations":[{"raw_affiliation_string":"Universidad de Murcia","institution_ids":["https://openalex.org/I80180929"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5095426546","display_name":"Manuel Acacio","orcid":null},"institutions":[{"id":"https://openalex.org/I80180929","display_name":"Universidad de Murcia","ror":"https://ror.org/03p3aeb86","country_code":"ES","type":"education","lineage":["https://openalex.org/I80180929"]}],"countries":["ES"],"is_corresponding":false,"raw_author_name":"Manuel E. Acacio","raw_affiliation_strings":["Universidad de Murcia"],"affiliations":[{"raw_affiliation_string":"Universidad de Murcia","institution_ids":["https://openalex.org/I80180929"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5117528024"],"corresponding_institution_ids":["https://openalex.org/I80180929"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.41200231,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"14"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.1882999986410141,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.1882999986410141,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12029","display_name":"DNA and Biological Computing","score":0.13019999861717224,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T10502","display_name":"Advanced Memory and Neural Computing","score":0.1290999948978424,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/tile","display_name":"Tile","score":0.2865999937057495},{"id":"https://openalex.org/keywords/software","display_name":"Software","score":0.27549999952316284},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.2599000036716461},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.25769999623298645}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6514000296592712},{"id":"https://openalex.org/C121684516","wikidata":"https://www.wikidata.org/wiki/Q7600677","display_name":"Computer graphics (images)","level":1,"score":0.33169999718666077},{"id":"https://openalex.org/C2780728851","wikidata":"https://www.wikidata.org/wiki/Q468402","display_name":"Tile","level":2,"score":0.2865999937057495},{"id":"https://openalex.org/C459310","wikidata":"https://www.wikidata.org/wiki/Q117801","display_name":"Computational science","level":1,"score":0.2833000123500824},{"id":"https://openalex.org/C199639397","wikidata":"https://www.wikidata.org/wiki/Q1788588","display_name":"Engineering drawing","level":1,"score":0.2827000021934509},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.27549999952316284},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.26739999651908875},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.2599000036716461},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.25769999623298645},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.2329999953508377}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/hpca68181.2026.11408440","is_oa":false,"landing_page_url":"https://doi.org/10.1109/hpca68181.2026.11408440","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2026 IEEE International Symposium on High Performance Computer Architecture (HPCA)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/7","score":0.7824560403823853,"display_name":"Affordable and clean energy"}],"awards":[{"id":"https://openalex.org/G7062980579","display_name":null,"funder_award_id":"PID2022-136315OB-I00,MICIU/AEI/10.13039/501100011033","funder_id":"https://openalex.org/F4320309359","funder_display_name":"Elon University"},{"id":"https://openalex.org/G8214853840","display_name":null,"funder_award_id":"2246035,2402804","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"},{"id":"https://openalex.org/F4320309359","display_name":"Elon University","ror":"https://ror.org/01szgyb91"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":35,"referenced_works":["https://openalex.org/W2002257715","https://openalex.org/W2049875313","https://openalex.org/W2098118692","https://openalex.org/W2152956697","https://openalex.org/W2171399035","https://openalex.org/W2252007067","https://openalex.org/W2294600019","https://openalex.org/W2608738900","https://openalex.org/W2794887017","https://openalex.org/W2895553128","https://openalex.org/W2941684109","https://openalex.org/W2952928793","https://openalex.org/W2981758446","https://openalex.org/W2997740413","https://openalex.org/W3042016589","https://openalex.org/W3102510044","https://openalex.org/W3174357167","https://openalex.org/W3181975841","https://openalex.org/W4231970275","https://openalex.org/W4244089596","https://openalex.org/W4250482878","https://openalex.org/W4280626493","https://openalex.org/W4306179717","https://openalex.org/W4308968707","https://openalex.org/W4311145589","https://openalex.org/W4312258136","https://openalex.org/W4318256787","https://openalex.org/W4324292875","https://openalex.org/W4389476360","https://openalex.org/W4393407047","https://openalex.org/W4400409880","https://openalex.org/W4400960979","https://openalex.org/W4404787869","https://openalex.org/W4409133352","https://openalex.org/W4411260261"],"related_works":[],"abstract_inverted_index":{"The":[0],"growing":[1],"complexity":[2],"and":[3,22,51,59,69,86,133,135,148,174],"parallelism":[4],"demands":[5],"of":[6,55,77,167,172],"modern":[7],"GPU":[8,62,109,120,137],"workloads":[9],"have":[10],"driven":[11],"architectural":[12,143],"innovations":[13],"toward":[14],"asynchronous":[15],"tile":[16,47],"transfers":[17],"(ATTs)":[18],"to":[19,74],"overlap":[20],"computation":[21],"data":[23],"movement.":[24],"While":[25],"ATT":[26,67,114],"units":[27],"such":[28],"as":[29],"the":[30,108,113,146,165,168],"NVIDIA's":[31],"Tensor":[32],"Memory":[33],"Accelerator":[34],"(TMA)":[35],"introduce":[36],"high-throughput":[37],"memory":[38,129],"transfers,":[39],"programmers":[40],"must":[41],"deal":[42],"with":[43,152],"wavefront":[44],"specialization,":[45],"select":[46],"sizes,":[48],"queue":[49,158],"slots,":[50],"synchronization":[52],"primitives,":[53],"all":[54],"which":[56,82],"are":[57],"hardware-specific":[58],"workloaddependent.":[60],"Existing":[61],"libraries":[63],"fall":[64],"short-offering":[65],"limited":[66],"support":[68],"configurability-so":[70],"developers":[71],"still":[72],"resort":[73],"manual":[75],"exploration":[76],"this":[78,94],"vast":[79],"parameter":[80],"space,":[81],"is":[83],"laborious,":[84],"error-prone,":[85],"fundamentally":[87],"limits":[88],"performance":[89,181],"portability":[90],"across":[91],"GPUs.":[92],"In":[93],"work,":[95],"we":[96],"present":[97],"QuCo":[98,122,155],"(Queue":[99],"Configurator),":[100],"a":[101,124,136],"single":[102],"lightweight":[103],"hardware":[104],"unit":[105],"embedded":[106],"in":[107],"that":[110],"fully":[111],"automates":[112],"configuration":[115],"process.":[116],"Inspired":[117],"by":[118],"Blackwell":[119],"design,":[121],"includes":[123],"compact":[125],"RISC-V":[126],"processor,":[127],"small":[128],"structures":[130],"for":[131],"instructions":[132],"data,":[134],"Specification":[138],"Table":[139],"(GST)":[140],"storing":[141],"key":[142],"parameters.":[144],"Using":[145],"GST":[147],"workload":[149],"characteristics,":[150],"along":[151],"built-in":[153],"heuristics,":[154],"computes":[156],"optimal":[157],"configurations":[159],"at":[160],"kernel":[161],"launch.":[162],"This":[163],"relieves":[164],"programmer":[166],"tedious,":[169],"time-consuming":[170],"task":[171],"tuning":[173],"offline":[175],"profiling,":[176],"while":[177],"simultaneously":[178],"increasing":[179],"post-compilation":[180],"portability.":[182]},"counts_by_year":[],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2026-03-05T00:00:00"}
