{"id":"https://openalex.org/W7134937293","doi":"https://doi.org/10.1145/3779212.3790209","title":"RedFuser: An Automatic Operator Fusion Framework for Cascaded Reductions on AI Accelerators","display_name":"RedFuser: An Automatic Operator Fusion Framework for Cascaded Reductions on AI Accelerators","publication_year":2026,"publication_date":"2026-03-10","ids":{"openalex":"https://openalex.org/W7134937293","doi":"https://doi.org/10.1145/3779212.3790209"},"language":null,"primary_location":{"id":"doi:10.1145/3779212.3790209","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3779212.3790209","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 31st ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2","raw_type":"proceedings-article"},"type":"article","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3779212.3790209","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5128797846","display_name":"Xinsheng Tang","orcid":null},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Xinsheng Tang","raw_affiliation_strings":["Alibaba Cloud Computing, Shanghai, China"],"raw_orcid":"https://orcid.org/0009-0008-8826-3775","affiliations":[{"raw_affiliation_string":"Alibaba Cloud Computing, Shanghai, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128732734","display_name":"Yangcheng Li","orcid":null},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yangcheng Li","raw_affiliation_strings":["Alibaba Cloud Computing, Shanghai, China"],"raw_orcid":"https://orcid.org/0009-0004-8902-0442","affiliations":[{"raw_affiliation_string":"Alibaba Cloud Computing, Shanghai, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101816252","display_name":"Nan Wang","orcid":"https://orcid.org/0009-0002-8276-1868"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Nan Wang","raw_affiliation_strings":["Alibaba Cloud Computing, Shanghai, China"],"raw_orcid":"https://orcid.org/0009-0002-8276-1868","affiliations":[{"raw_affiliation_string":"Alibaba Cloud Computing, Shanghai, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5021602251","display_name":"ZhiYi Shu","orcid":null},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhiyi Shu","raw_affiliation_strings":["Alibaba Cloud Computing, Shanghai, China"],"raw_orcid":"https://orcid.org/0009-0002-2789-3630","affiliations":[{"raw_affiliation_string":"Alibaba Cloud Computing, Shanghai, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128750379","display_name":"Xingyu Ling","orcid":null},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xingyu Ling","raw_affiliation_strings":["Alibaba Cloud Computing, Shanghai, China"],"raw_orcid":"https://orcid.org/0009-0006-9854-5149","affiliations":[{"raw_affiliation_string":"Alibaba Cloud Computing, Shanghai, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Junna Xing","orcid":"https://orcid.org/0009-0005-9238-0745"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Junna Xing","raw_affiliation_strings":["Alibaba Cloud Computing, Shanghai, China"],"raw_orcid":"https://orcid.org/0009-0005-9238-0745","affiliations":[{"raw_affiliation_string":"Alibaba Cloud Computing, Shanghai, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103090571","display_name":"Peng Zhou","orcid":"https://orcid.org/0009-0000-0869-9782"},"institutions":[{"id":"https://openalex.org/I4210095624","display_name":"Alibaba Group (United States)","ror":"https://ror.org/00rn0m335","country_code":"US","type":"company","lineage":["https://openalex.org/I4210095624","https://openalex.org/I45928872"]},{"id":"https://openalex.org/I4210144487","display_name":"Cloud Computing Center","ror":"https://ror.org/04aa0zm65","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210144487"]},{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN","US"],"is_corresponding":false,"raw_author_name":"Peng Zhou","raw_affiliation_strings":["Alibaba Cloud Computing, Sunnyvale, USA"],"raw_orcid":"https://orcid.org/0009-0000-0869-9782","affiliations":[{"raw_affiliation_string":"Alibaba Cloud Computing, Sunnyvale, USA","institution_ids":["https://openalex.org/I4210095624","https://openalex.org/I4210144487","https://openalex.org/I45928872"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5128736532","display_name":"Qiang Liu","orcid":null},"institutions":[{"id":"https://openalex.org/I4210144487","display_name":"Cloud Computing Center","ror":"https://ror.org/04aa0zm65","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210144487"]},{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Qiang Liu","raw_affiliation_strings":["Alibaba Cloud Computing, Shenzhen, China"],"raw_orcid":"https://orcid.org/0009-0006-5792-322X","affiliations":[{"raw_affiliation_string":"Alibaba Cloud Computing, Shenzhen, China","institution_ids":["https://openalex.org/I4210144487","https://openalex.org/I45928872"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5128797846"],"corresponding_institution_ids":["https://openalex.org/I45928872"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.43508597,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1566","last_page":"1588"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.2786000072956085,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.2786000072956085,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10904","display_name":"Embedded Systems Design Techniques","score":0.10249999910593033,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10126","display_name":"Logic, programming, and type systems","score":0.09570000320672989,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.5232999920845032},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.5217999815940857},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.5174999833106995},{"id":"https://openalex.org/keywords/reduction","display_name":"Reduction (mathematics)","score":0.5117999911308289},{"id":"https://openalex.org/keywords/compiler","display_name":"Compiler","score":0.5094000101089478},{"id":"https://openalex.org/keywords/fuse","display_name":"Fuse (electrical)","score":0.5058000087738037},{"id":"https://openalex.org/keywords/operator","display_name":"Operator (biology)","score":0.41929998993873596},{"id":"https://openalex.org/keywords/softmax-function","display_name":"Softmax function","score":0.4041999876499176},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.4004000127315521}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7919999957084656},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.5232999920845032},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.5217999815940857},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.5174999833106995},{"id":"https://openalex.org/C111335779","wikidata":"https://www.wikidata.org/wiki/Q3454686","display_name":"Reduction (mathematics)","level":2,"score":0.5117999911308289},{"id":"https://openalex.org/C169590947","wikidata":"https://www.wikidata.org/wiki/Q47506","display_name":"Compiler","level":2,"score":0.5094000101089478},{"id":"https://openalex.org/C141353440","wikidata":"https://www.wikidata.org/wiki/Q182221","display_name":"Fuse (electrical)","level":2,"score":0.5058000087738037},{"id":"https://openalex.org/C17020691","wikidata":"https://www.wikidata.org/wiki/Q139677","display_name":"Operator (biology)","level":5,"score":0.41929998993873596},{"id":"https://openalex.org/C188441871","wikidata":"https://www.wikidata.org/wiki/Q7554146","display_name":"Softmax function","level":3,"score":0.4041999876499176},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.4004000127315521},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.39719998836517334},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.382999986410141},{"id":"https://openalex.org/C74193536","wikidata":"https://www.wikidata.org/wiki/Q574844","display_name":"Kernel (algebra)","level":2,"score":0.367900013923645},{"id":"https://openalex.org/C158525013","wikidata":"https://www.wikidata.org/wiki/Q2593739","display_name":"Fusion","level":2,"score":0.367900013923645},{"id":"https://openalex.org/C2780767217","wikidata":"https://www.wikidata.org/wiki/Q5532421","display_name":"Generality","level":2,"score":0.3529999852180481},{"id":"https://openalex.org/C117251300","wikidata":"https://www.wikidata.org/wiki/Q1849855","display_name":"Parametric statistics","level":2,"score":0.3513000011444092},{"id":"https://openalex.org/C82653869","wikidata":"https://www.wikidata.org/wiki/Q6675821","display_name":"Loop fusion","level":3,"score":0.3328000009059906},{"id":"https://openalex.org/C24858836","wikidata":"https://www.wikidata.org/wiki/Q844718","display_name":"Theory of computation","level":2,"score":0.32510000467300415},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.3109000027179718},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.3109000027179718},{"id":"https://openalex.org/C2778971668","wikidata":"https://www.wikidata.org/wiki/Q5510284","display_name":"Fusion rules","level":4,"score":0.3100999891757965},{"id":"https://openalex.org/C33954974","wikidata":"https://www.wikidata.org/wiki/Q486494","display_name":"Sensor fusion","level":2,"score":0.2912999987602234},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.2856000065803528},{"id":"https://openalex.org/C68859911","wikidata":"https://www.wikidata.org/wiki/Q1503724","display_name":"Pattern matching","level":2,"score":0.27790001034736633},{"id":"https://openalex.org/C34413123","wikidata":"https://www.wikidata.org/wiki/Q170978","display_name":"Robotics","level":3,"score":0.2759999930858612},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.2630000114440918},{"id":"https://openalex.org/C111498074","wikidata":"https://www.wikidata.org/wiki/Q173326","display_name":"Formal verification","level":2,"score":0.258899986743927},{"id":"https://openalex.org/C105339364","wikidata":"https://www.wikidata.org/wiki/Q2297740","display_name":"Software deployment","level":2,"score":0.2531999945640564}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1145/3779212.3790209","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3779212.3790209","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 31st ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2603.10026","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2603.10026","pdf_url":"https://arxiv.org/pdf/2603.10026","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"}],"best_oa_location":{"id":"doi:10.1145/3779212.3790209","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3779212.3790209","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 31st ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":16,"referenced_works":["https://openalex.org/W2023415862","https://openalex.org/W2417863416","https://openalex.org/W2808498336","https://openalex.org/W2935331687","https://openalex.org/W2954698171","https://openalex.org/W2979434206","https://openalex.org/W3012872813","https://openalex.org/W3044913359","https://openalex.org/W3047237654","https://openalex.org/W3092664912","https://openalex.org/W3174529902","https://openalex.org/W4212986322","https://openalex.org/W4360831828","https://openalex.org/W4394998532","https://openalex.org/W4404133600","https://openalex.org/W4405756380"],"related_works":[],"abstract_inverted_index":{"Operator":[0],"fusion,":[1],"as":[2,40],"a":[3,113,126,145],"key":[4],"performance":[5,181],"optimization":[6],"technique":[7],"in":[8,23,75,92,101],"the":[9,41,86,180],"deployment":[10],"of":[11,88,182],"AI":[12,25,176],"models,":[13,95],"significantly":[14],"improves":[15],"execution":[16],"efficiency":[17],"and":[18,56,77,104,129,154,178],"has":[19],"been":[20],"widely":[21],"adopted":[22],"modern":[24],"compilers.":[26],"However,":[27],"for":[28,117],"cascaded":[29,119,151],"reduction":[30,152],"operations":[31],"involving":[32],"multiple":[33],"loops":[34],"with":[35],"inter-loop":[36],"data":[37],"dependencies,":[38],"such":[39,89],"safe":[42],"softmax":[43],"followed":[44],"by":[45],"GEMM":[46],"within":[47],"attention":[48],"mechanisms,":[49],"existing":[50],"compilers":[51,177],"lack":[52],"effective":[53],"automated":[54,105],"fusion":[55,69,106],"kernel":[57],"generation":[58],"capabilities.":[59],"Although":[60],"some":[61],"works":[62],"have":[63],"addressed":[64],"specific":[65],"instances":[66],"through":[67],"hand-crafted":[68],"strategies,":[70],"their":[71],"solutions":[72],"are":[73],"limited":[74],"generality":[76],"difficult":[78],"to":[79,81,169,171],"extend":[80],"other":[82],"similar":[83],"structures.":[84],"Given":[85],"prevalence":[87],"computational":[90],"patterns":[91,153],"deep":[93],"learning":[94],"there":[96],"remains":[97],"significant":[98],"untapped":[99],"potential":[100],"achieving":[102,167],"general":[103],"optimization.":[107],"In":[108],"this":[109,137],"paper,":[110],"we":[111,139],"present":[112],"formal":[114],"theoretical":[115],"methodology":[116],"analyzing":[118],"reductions":[120],"which":[121],"can":[122],"fuse":[123],"them":[124],"into":[125],"single":[127],"loop":[128],"introduce":[130],"an":[131],"incremental":[132],"computation":[133],"form.":[134],"Based":[135],"on":[136],"methodology,":[138],"design":[140],"Red":[141],"uction":[142],"Fuser":[143],"(RedFuser),":[144],"framework":[146],"that":[147,161],"automatically":[148],"identifies":[149],"supported":[150],"generates":[155],"optimized":[156,184],"fused":[157],"kernels.":[158,186],"Experiments":[159],"show":[160],"RedFuser":[162],"successfully":[163],"fuses":[164],"diverse":[165],"workloads,":[166],"up":[168],"2\u00d7":[170],"5\u00d7":[172],"speedup":[173],"over":[174],"state-of-the-art":[175],"matching":[179],"highly":[183],"hand-written":[185]},"counts_by_year":[],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2026-03-12T00:00:00"}
