{"id":"https://openalex.org/W7135028674","doi":"https://doi.org/10.48550/arxiv.2603.10026","title":"RedFuser: An Automatic Operator Fusion Framework for Cascaded Reductions on AI Accelerators","display_name":"RedFuser: An Automatic Operator Fusion Framework for Cascaded Reductions on AI Accelerators","publication_year":2026,"publication_date":"2026-02-24","ids":{"openalex":"https://openalex.org/W7135028674","doi":"https://doi.org/10.48550/arxiv.2603.10026"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.10026","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.10026","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.10026","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5128797846","display_name":"Xinsheng Tang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Tang, Xinsheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128732734","display_name":"Yangcheng Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Yangcheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128876813","display_name":"Nan Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Nan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5021602251","display_name":"ZhiYi Shu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shu, Zhiyi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128750379","display_name":"Xingyu Ling","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ling, Xingyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5035512142","display_name":"Junna Xing","orcid":"https://orcid.org/0000-0001-6752-5678"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xing, Junna","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128848876","display_name":"Peng Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Peng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5128893843","display_name":"Qiang Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Qiang","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5128797846"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.23549999296665192,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.23549999296665192,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10126","display_name":"Logic, programming, and type systems","score":0.11219999939203262,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10904","display_name":"Embedded Systems Design Techniques","score":0.0820000022649765,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/reduction","display_name":"Reduction (mathematics)","score":0.6292999982833862},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.5665000081062317},{"id":"https://openalex.org/keywords/compiler","display_name":"Compiler","score":0.5392000079154968},{"id":"https://openalex.org/keywords/fuse","display_name":"Fuse (electrical)","score":0.4986000061035156},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.4945000112056732},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.4839000105857849},{"id":"https://openalex.org/keywords/softmax-function","display_name":"Softmax function","score":0.44350001215934753},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.44130000472068787},{"id":"https://openalex.org/keywords/operator","display_name":"Operator (biology)","score":0.43549999594688416}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7821999788284302},{"id":"https://openalex.org/C111335779","wikidata":"https://www.wikidata.org/wiki/Q3454686","display_name":"Reduction (mathematics)","level":2,"score":0.6292999982833862},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.5665000081062317},{"id":"https://openalex.org/C169590947","wikidata":"https://www.wikidata.org/wiki/Q47506","display_name":"Compiler","level":2,"score":0.5392000079154968},{"id":"https://openalex.org/C141353440","wikidata":"https://www.wikidata.org/wiki/Q182221","display_name":"Fuse (electrical)","level":2,"score":0.4986000061035156},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.4945000112056732},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.4839000105857849},{"id":"https://openalex.org/C188441871","wikidata":"https://www.wikidata.org/wiki/Q7554146","display_name":"Softmax function","level":3,"score":0.44350001215934753},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.44130000472068787},{"id":"https://openalex.org/C17020691","wikidata":"https://www.wikidata.org/wiki/Q139677","display_name":"Operator (biology)","level":5,"score":0.43549999594688416},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.4318000078201294},{"id":"https://openalex.org/C74193536","wikidata":"https://www.wikidata.org/wiki/Q574844","display_name":"Kernel (algebra)","level":2,"score":0.42329999804496765},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.38109999895095825},{"id":"https://openalex.org/C82653869","wikidata":"https://www.wikidata.org/wiki/Q6675821","display_name":"Loop fusion","level":3,"score":0.3779999911785126},{"id":"https://openalex.org/C158525013","wikidata":"https://www.wikidata.org/wiki/Q2593739","display_name":"Fusion","level":2,"score":0.3578000068664551},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.3474999964237213},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.3440000116825104},{"id":"https://openalex.org/C117251300","wikidata":"https://www.wikidata.org/wiki/Q1849855","display_name":"Parametric statistics","level":2,"score":0.322299987077713},{"id":"https://openalex.org/C2780767217","wikidata":"https://www.wikidata.org/wiki/Q5532421","display_name":"Generality","level":2,"score":0.30550000071525574},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.30469998717308044},{"id":"https://openalex.org/C2778971668","wikidata":"https://www.wikidata.org/wiki/Q5510284","display_name":"Fusion rules","level":4,"score":0.3005000054836273},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.2892000079154968},{"id":"https://openalex.org/C33954974","wikidata":"https://www.wikidata.org/wiki/Q486494","display_name":"Sensor fusion","level":2,"score":0.2799000144004822},{"id":"https://openalex.org/C70437156","wikidata":"https://www.wikidata.org/wiki/Q7228652","display_name":"Pooling","level":2,"score":0.27970001101493835},{"id":"https://openalex.org/C2778820799","wikidata":"https://www.wikidata.org/wiki/Q3454688","display_name":"Cost reduction","level":2,"score":0.2732999920845032},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.27090001106262207}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.10026","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.10026","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.10026","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.10026","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Operator":[0],"fusion,":[1],"as":[2,40],"a":[3,113,126,144],"key":[4],"performance":[5,180],"optimization":[6],"technique":[7],"in":[8,23,75,92,101],"the":[9,41,86,179],"deployment":[10],"of":[11,88,181],"AI":[12,25,175],"models,":[13,95],"significantly":[14],"improves":[15],"execution":[16],"efficiency":[17],"and":[18,56,77,104,129,153,177],"has":[19],"been":[20],"widely":[21],"adopted":[22],"modern":[24],"compilers.":[26],"However,":[27],"for":[28,117],"cascaded":[29,119,150],"reduction":[30,151],"operations":[31],"involving":[32],"multiple":[33],"loops":[34],"with":[35],"inter-loop":[36],"data":[37],"dependencies,":[38],"such":[39,89],"safe":[42],"softmax":[43],"followed":[44],"by":[45],"GEMM":[46],"within":[47],"attention":[48],"mechanisms,":[49],"existing":[50],"compilers":[51,176],"lack":[52],"effective":[53],"automated":[54,105],"fusion":[55,69,106],"kernel":[57],"generation":[58],"capabilities.":[59],"Although":[60],"some":[61],"works":[62],"have":[63],"addressed":[64],"specific":[65],"instances":[66],"through":[67],"hand-crafted":[68],"strategies,":[70],"their":[71],"solutions":[72],"are":[73],"limited":[74],"generality":[76],"difficult":[78],"to":[79,81,168,170],"extend":[80],"other":[82],"similar":[83],"structures.":[84],"Given":[85],"prevalence":[87],"computational":[90],"patterns":[91,152],"deep":[93],"learning":[94],"there":[96],"remains":[97],"significant":[98],"untapped":[99],"potential":[100],"achieving":[102,166],"general":[103],"optimization.":[107],"In":[108],"this":[109,137],"paper,":[110],"we":[111,139],"present":[112],"formal":[114],"theoretical":[115],"methodology":[116],"analyzing":[118],"reductions":[120],"which":[121],"can":[122],"fuse":[123],"them":[124],"into":[125],"single":[127],"loop":[128],"introduce":[130],"an":[131],"incremental":[132],"computation":[133],"form.":[134],"Based":[135],"on":[136],"methodology,":[138],"design":[140],"Reduction":[141],"Fuser":[142],"(RedFuser),":[143],"framework":[145],"that":[146,160],"automatically":[147],"identifies":[148],"supported":[149],"generates":[154],"optimized":[155,183],"fused":[156],"kernels.":[157,185],"Experiments":[158],"show":[159],"RedFuser":[161],"successfully":[162],"fuses":[163],"diverse":[164],"workloads,":[165],"up":[167],"2$\\times$":[169],"5$\\times$":[171],"speedup":[172],"over":[173],"state-of-the-art":[174],"matching":[178],"highly":[182],"hand-written":[184],"The":[186],"code":[187],"is":[188],"available":[189],"at":[190],"https://github.com/alibaba/redfuser":[191]},"counts_by_year":[],"updated_date":"2026-03-13T14:25:03.468858","created_date":"2026-03-13T00:00:00"}
