{"id":"https://openalex.org/W4417403177","doi":"https://doi.org/10.1109/pact65351.2025.00046","title":"CoroAMU: Unleashing Memory-Driven Coroutines through Latency-Aware Decoupled Operations","display_name":"CoroAMU: Unleashing Memory-Driven Coroutines through Latency-Aware Decoupled Operations","publication_year":2025,"publication_date":"2025-11-03","ids":{"openalex":"https://openalex.org/W4417403177","doi":"https://doi.org/10.1109/pact65351.2025.00046"},"language":null,"primary_location":{"id":"doi:10.1109/pact65351.2025.00046","is_oa":false,"landing_page_url":"https://doi.org/10.1109/pact65351.2025.00046","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 34th International Conference on Parallel Architectures and Compilation Techniques (PACT)","raw_type":"proceedings-article"},"type":"article","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2511.14990","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5020517480","display_name":"Zhuolun Jiang","orcid":"https://orcid.org/0009-0006-9434-7399"},"institutions":[{"id":"https://openalex.org/I4210090176","display_name":"Institute of Computing Technology","ror":"https://ror.org/0090r4d87","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210090176"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Zhuolun Jiang","raw_affiliation_strings":["Institute of Computing Technology Chinese Academy of Sciences,State Key Lab of Processors"],"affiliations":[{"raw_affiliation_string":"Institute of Computing Technology Chinese Academy of Sciences,State Key Lab of Processors","institution_ids":["https://openalex.org/I4210090176"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5034245729","display_name":"Songyue Wang","orcid":"https://orcid.org/0009-0003-2732-7820"},"institutions":[{"id":"https://openalex.org/I4210090176","display_name":"Institute of Computing Technology","ror":"https://ror.org/0090r4d87","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210090176"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Songyue Wang","raw_affiliation_strings":["Institute of Computing Technology Chinese Academy of Sciences,State Key Lab of Processors"],"affiliations":[{"raw_affiliation_string":"Institute of Computing Technology Chinese Academy of Sciences,State Key Lab of Processors","institution_ids":["https://openalex.org/I4210090176"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5000276367","display_name":"X. Pei","orcid":"https://orcid.org/0000-0003-3847-6162"},"institutions":[{"id":"https://openalex.org/I4210090176","display_name":"Institute of Computing Technology","ror":"https://ror.org/0090r4d87","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210090176"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaokun Pei","raw_affiliation_strings":["Institute of Computing Technology Chinese Academy of Sciences,State Key Lab of Processors"],"affiliations":[{"raw_affiliation_string":"Institute of Computing Technology Chinese Academy of Sciences,State Key Lab of Processors","institution_ids":["https://openalex.org/I4210090176"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120823938","display_name":"Tianyue","orcid":null},"institutions":[{"id":"https://openalex.org/I4210090176","display_name":"Institute of Computing Technology","ror":"https://ror.org/0090r4d87","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210090176"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Tianyue","raw_affiliation_strings":["Institute of Computing Technology Chinese Academy of Sciences,State Key Lab of Processors"],"affiliations":[{"raw_affiliation_string":"Institute of Computing Technology Chinese Academy of Sciences,State Key Lab of Processors","institution_ids":["https://openalex.org/I4210090176"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101645579","display_name":"Mingyu Chen","orcid":"https://orcid.org/0000-0003-4469-1037"},"institutions":[{"id":"https://openalex.org/I4210090176","display_name":"Institute of Computing Technology","ror":"https://ror.org/0090r4d87","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210090176"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Mingyu Chen","raw_affiliation_strings":["Institute of Computing Technology Chinese Academy of Sciences,State Key Lab of Processors"],"affiliations":[{"raw_affiliation_string":"Institute of Computing Technology Chinese Academy of Sciences,State Key Lab of Processors","institution_ids":["https://openalex.org/I4210090176"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5020517480"],"corresponding_institution_ids":["https://openalex.org/I4210090176"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.46648252,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"444","last_page":"457"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.838699996471405,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.838699996471405,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T14347","display_name":"Big Data and Digital Economy","score":0.03280000016093254,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.03240000084042549,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/compiler","display_name":"Compiler","score":0.6366999745368958},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.5465999841690063},{"id":"https://openalex.org/keywords/interleaving","display_name":"Interleaving","score":0.516700029373169},{"id":"https://openalex.org/keywords/interleaved-memory","display_name":"Interleaved memory","score":0.5022000074386597},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.47269999980926514},{"id":"https://openalex.org/keywords/asynchronous-communication","display_name":"Asynchronous communication","score":0.462799996137619},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.4498000144958496},{"id":"https://openalex.org/keywords/cas-latency","display_name":"CAS latency","score":0.43689998984336853},{"id":"https://openalex.org/keywords/memory-map","display_name":"Memory map","score":0.4221000075340271}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8847000002861023},{"id":"https://openalex.org/C169590947","wikidata":"https://www.wikidata.org/wiki/Q47506","display_name":"Compiler","level":2,"score":0.6366999745368958},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.5465999841690063},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.517300009727478},{"id":"https://openalex.org/C28034677","wikidata":"https://www.wikidata.org/wiki/Q17092530","display_name":"Interleaving","level":2,"score":0.516700029373169},{"id":"https://openalex.org/C63511323","wikidata":"https://www.wikidata.org/wiki/Q908936","display_name":"Interleaved memory","level":4,"score":0.5022000074386597},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.47269999980926514},{"id":"https://openalex.org/C151319957","wikidata":"https://www.wikidata.org/wiki/Q752739","display_name":"Asynchronous communication","level":2,"score":0.462799996137619},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.4498000144958496},{"id":"https://openalex.org/C189930140","wikidata":"https://www.wikidata.org/wiki/Q1112878","display_name":"CAS latency","level":4,"score":0.43689998984336853},{"id":"https://openalex.org/C74426580","wikidata":"https://www.wikidata.org/wiki/Q719484","display_name":"Memory map","level":3,"score":0.4221000075340271},{"id":"https://openalex.org/C93446704","wikidata":"https://www.wikidata.org/wiki/Q449328","display_name":"Registered memory","level":3,"score":0.3499999940395355},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.3472999930381775},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.33709999918937683},{"id":"https://openalex.org/C190902152","wikidata":"https://www.wikidata.org/wiki/Q1325106","display_name":"Optimizing compiler","level":3,"score":0.3346000015735626},{"id":"https://openalex.org/C153247305","wikidata":"https://www.wikidata.org/wiki/Q835713","display_name":"Memory address","level":3,"score":0.33390000462532043},{"id":"https://openalex.org/C9390403","wikidata":"https://www.wikidata.org/wiki/Q3966","display_name":"Computer hardware","level":1,"score":0.3001999855041504},{"id":"https://openalex.org/C57863822","wikidata":"https://www.wikidata.org/wiki/Q905488","display_name":"Flat memory model","level":4,"score":0.2896000146865845},{"id":"https://openalex.org/C128916667","wikidata":"https://www.wikidata.org/wiki/Q1343660","display_name":"Register allocation","level":3,"score":0.28369998931884766},{"id":"https://openalex.org/C53838383","wikidata":"https://www.wikidata.org/wiki/Q541148","display_name":"Conventional memory","level":5,"score":0.2773999869823456},{"id":"https://openalex.org/C176649486","wikidata":"https://www.wikidata.org/wiki/Q2308807","display_name":"Memory management","level":3,"score":0.2770000100135803},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.27160000801086426},{"id":"https://openalex.org/C42935608","wikidata":"https://www.wikidata.org/wiki/Q190411","display_name":"Field-programmable gate array","level":2,"score":0.2669000029563904},{"id":"https://openalex.org/C171675096","wikidata":"https://www.wikidata.org/wiki/Q1143380","display_name":"Extended memory","level":4,"score":0.2612000107765198},{"id":"https://openalex.org/C37724790","wikidata":"https://www.wikidata.org/wiki/Q210813","display_name":"Direct memory access","level":3,"score":0.25760000944137573},{"id":"https://openalex.org/C200833197","wikidata":"https://www.wikidata.org/wiki/Q333707","display_name":"Compile time","level":3,"score":0.2540000081062317},{"id":"https://openalex.org/C1793878","wikidata":"https://www.wikidata.org/wiki/Q1153762","display_name":"Out-of-order execution","level":2,"score":0.2529999911785126}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/pact65351.2025.00046","is_oa":false,"landing_page_url":"https://doi.org/10.1109/pact65351.2025.00046","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 34th International Conference on Parallel Architectures and Compilation Techniques (PACT)","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2511.14990","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2511.14990","pdf_url":"https://arxiv.org/pdf/2511.14990","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2511.14990","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2511.14990","pdf_url":"https://arxiv.org/pdf/2511.14990","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Modern":[0],"data-intensive":[1],"applications":[2],"face":[3],"memory":[4,10,25,71,87,138],"latency":[5,164],"challenges":[6],"exacerbated":[7],"by":[8,85],"disaggregated":[9,156],"systems.":[11],"Recent":[12],"work":[13],"shows":[14],"that":[15,51,112],"coroutines":[16],"are":[17],"promising":[18],"in":[19],"effectively":[20],"interleaving":[21],"tasks":[22],"and":[23,58,89,101,144,161],"hiding":[24],"latency,":[26],"but":[27],"they":[28],"struggle":[29],"to":[30,79],"balance":[31],"latency-hiding":[32],"efficiency":[33],"with":[34,62,99,133],"runtime":[35],"overhead.":[36],"We":[37],"present":[38],"CoroAMU,":[39],"a":[40,63,90,117],"hardware-software":[41],"co-designed":[42],"system":[43],"for":[44],"memory-centric":[45],"coroutines.":[46],"It":[47,96],"introduces":[48],"compiler":[49,115],"procedures":[50],"optimize":[52],"coroutine":[53,83,125],"code":[54],"generation,":[55],"minimize":[56],"context,":[57],"coalesce":[59],"requests,":[60],"paired":[61],"simple":[64],"interface.":[65],"With":[66],"hardware":[67,135],"support":[68],"of":[69,136],"decoupled":[70,137],"operations,":[72],"we":[73],"enhance":[74],"the":[75,107,113,151],"Asynchronous":[76],"Memory":[77],"Unit":[78],"further":[80],"exploit":[81],"dynamic":[82],"schedulers":[84],"coroutine-specific":[86],"operations":[88],"novel":[91],"memory-guided":[92],"branch":[93],"prediction":[94],"mechanism.":[95],"is":[97],"implemented":[98],"LLVM":[100],"open-source":[102],"XiangShan":[103],"RISCV":[104],"processor":[105,153],"over":[106,123,150],"FPGA":[108],"platform.":[109],"Experiments":[110],"demonstrate":[111],"CoroAMU":[114],"achieves":[116],"$\\mathbf{1.":[118],"5":[119],"1}":[120],"\\boldsymbol{\\times}$":[121],"speedup":[122],"state-of-the-art":[124],"methods":[126],"on":[127,154],"Intel":[128],"server":[129],"processors.":[130],"When":[131],"combined":[132],"optimized":[134],"access,":[139],"it":[140],"delivers":[141],"$3.39":[142],"\\times$":[143,146],"$4.87":[145],"average":[147],"performance":[148],"improvements":[149],"baseline":[152],"FPGA-emulated":[155],"systems":[157],"under":[158],"200":[159],"ns":[160,163],"800":[162],"respectively.":[165]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-11-23T00:00:00"}
