{"id":"https://openalex.org/W7138063307","doi":"https://doi.org/10.48550/arxiv.2603.15530","title":"DUET: Disaggregated Hybrid Mamba-Transformer LLMs with Prefill and Decode-Specific Packages","display_name":"DUET: Disaggregated Hybrid Mamba-Transformer LLMs with Prefill and Decode-Specific Packages","publication_year":2026,"publication_date":"2026-03-16","ids":{"openalex":"https://openalex.org/W7138063307","doi":"https://doi.org/10.48550/arxiv.2603.15530"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.15530","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.15530","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.15530","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5006318342","display_name":"Alish Kanani","orcid":"https://orcid.org/0009-0000-8585-9241"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Kanani, Alish","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129738052","display_name":"Sangwan Lee","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lee, Sangwan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129672139","display_name":"Han Lyu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lyu, Han","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129694942","display_name":"Jiahao Lin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lin, Jiahao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129640803","display_name":"Jaehyun Park","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Park, Jaehyun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5084255924","display_name":"\u00dcmit Y. Ogras","orcid":"https://orcid.org/0000-0002-5045-5535"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ogras, Umit Y.","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5006318342"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.8270000219345093,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.8270000219345093,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10904","display_name":"Embedded Systems Design Techniques","score":0.03689999878406525,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.02319999970495701,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/homogeneous","display_name":"Homogeneous","score":0.5777000188827515},{"id":"https://openalex.org/keywords/architecture","display_name":"Architecture","score":0.41749998927116394},{"id":"https://openalex.org/keywords/memory-management","display_name":"Memory management","score":0.4153999984264374},{"id":"https://openalex.org/keywords/space","display_name":"Space (punctuation)","score":0.3732999861240387},{"id":"https://openalex.org/keywords/matrix","display_name":"Matrix (chemical analysis)","score":0.367900013923645},{"id":"https://openalex.org/keywords/state","display_name":"State (computer science)","score":0.3555000126361847},{"id":"https://openalex.org/keywords/point","display_name":"Point (geometry)","score":0.34769999980926514}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7124999761581421},{"id":"https://openalex.org/C66882249","wikidata":"https://www.wikidata.org/wiki/Q169336","display_name":"Homogeneous","level":2,"score":0.5777000188827515},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.4778999984264374},{"id":"https://openalex.org/C123657996","wikidata":"https://www.wikidata.org/wiki/Q12271","display_name":"Architecture","level":2,"score":0.41749998927116394},{"id":"https://openalex.org/C176649486","wikidata":"https://www.wikidata.org/wiki/Q2308807","display_name":"Memory management","level":3,"score":0.4153999984264374},{"id":"https://openalex.org/C2778572836","wikidata":"https://www.wikidata.org/wiki/Q380933","display_name":"Space (punctuation)","level":2,"score":0.3732999861240387},{"id":"https://openalex.org/C106487976","wikidata":"https://www.wikidata.org/wiki/Q685816","display_name":"Matrix (chemical analysis)","level":2,"score":0.367900013923645},{"id":"https://openalex.org/C48103436","wikidata":"https://www.wikidata.org/wiki/Q599031","display_name":"State (computer science)","level":2,"score":0.3555000126361847},{"id":"https://openalex.org/C28719098","wikidata":"https://www.wikidata.org/wiki/Q44946","display_name":"Point (geometry)","level":2,"score":0.34769999980926514},{"id":"https://openalex.org/C17349429","wikidata":"https://www.wikidata.org/wiki/Q1049914","display_name":"Matrix multiplication","level":3,"score":0.34700000286102295},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.34299999475479126},{"id":"https://openalex.org/C50897621","wikidata":"https://www.wikidata.org/wiki/Q2665508","display_name":"Hybrid system","level":2,"score":0.3158999979496002},{"id":"https://openalex.org/C72434380","wikidata":"https://www.wikidata.org/wiki/Q230930","display_name":"State space","level":2,"score":0.3027999997138977},{"id":"https://openalex.org/C2779602883","wikidata":"https://www.wikidata.org/wiki/Q15544750","display_name":"Memory architecture","level":2,"score":0.29670000076293945},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.2897999882698059},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.2775000035762787},{"id":"https://openalex.org/C94375191","wikidata":"https://www.wikidata.org/wiki/Q11205","display_name":"Arithmetic","level":1,"score":0.26100000739097595},{"id":"https://openalex.org/C188198153","wikidata":"https://www.wikidata.org/wiki/Q1613840","display_name":"Limiting","level":2,"score":0.25769999623298645},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.25540000200271606},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.25130000710487366}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.15530","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.15530","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.15530","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.15530","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"language":[1],"models":[2,16,107],"operate":[3],"in":[4],"distinct":[5],"compute-bound":[6],"prefill":[7,58],"followed":[8],"by":[9],"memory":[10,74,92],"bandwidth-bound":[11],"decode":[12,60],"phases.":[13],"Hybrid":[14],"Mamba-Transformer":[15],"inherit":[17],"this":[18],"asymmetry":[19],"while":[20],"adding":[21],"state":[22],"space":[23],"model":[24],"(SSM)":[25],"recurrences":[26],"and":[27,59,80,97,111,118,136],"element-wise":[28],"operations":[29],"that":[30,42,56,124],"map":[31],"poorly":[32],"to":[33,62,93,104,130],"matmul-centric":[34],"accelerators.":[35],"This":[36],"mismatch":[37],"causes":[38],"performance":[39],"bottlenecks,":[40],"showing":[41],"a":[43,53],"homogeneous":[44],"architecture":[45],"cannot":[46],"satisfy":[47],"all":[48],"requirements.":[49],"We":[50],"introduce":[51],"DUET,":[52],"disaggregated":[54],"accelerator":[55],"assigns":[57],"phases":[61],"specialized":[63],"packages.":[64],"The":[65,83],"Prefill":[66],"package":[67,85],"utilizes":[68,86],"systolic":[69],"array":[70],"chiplets":[71],"with":[72,89,108],"off-package":[73],"for":[75],"efficient":[76],"large":[77],"matrix":[78],"multiplications":[79],"long-sequence":[81],"SSMs.":[82],"Decode":[84],"vector-unit":[87],"arrays":[88],"high-bandwidth":[90],"in-package":[91],"accelerate":[94],"token-by-token":[95],"SSM":[96],"vector-matrix":[98],"multiplications.":[99],"Both":[100],"architectures":[101],"are":[102],"runtime-configurable":[103],"support":[105],"hybrid":[106],"mixed":[109],"Mamba":[110],"attention":[112],"layers.":[113],"Evaluations":[114],"on":[115],"Nemotron-H-56B,":[116],"Zamba2-7B,":[117],"Llama3-8B":[119],"across":[120],"four":[121],"workloads":[122],"show":[123],"DUET":[125],"achieves":[126],"4x":[127],"faster":[128],"time":[129,139],"first":[131],"token,":[132],"1.4x":[133],"higher":[134],"throughput,":[135],"1.5x":[137],"lower":[138],"between":[140],"tokens":[141],"over":[142],"the":[143],"B200":[144],"GPU.":[145]},"counts_by_year":[],"updated_date":"2026-03-18T06:31:55.123368","created_date":"2026-03-18T00:00:00"}
