{"id":"https://openalex.org/W7154711729","doi":"https://doi.org/10.48550/arxiv.2604.14191","title":"Attention to Mamba: A Recipe for Cross-Architecture Distillation","display_name":"Attention to Mamba: A Recipe for Cross-Architecture Distillation","publication_year":2026,"publication_date":"2026-04-01","ids":{"openalex":"https://openalex.org/W7154711729","doi":"https://doi.org/10.48550/arxiv.2604.14191"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.14191","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.14191","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.14191","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5119950580","display_name":"Abhinav Moudgil","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Moudgil, Abhinav","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5109717247","display_name":"Ningyuan Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Ningyuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5044545212","display_name":"Eeshan Gunesh Dhekane","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dhekane, Eeshan Gunesh","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103191934","display_name":"Pau Rodr\u00edguez","orcid":"https://orcid.org/0000-0002-1689-8084"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rodr\u00edguez, Pau","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5043365442","display_name":"Luca Zappella","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zappella, Luca","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5035740107","display_name":"Federico Danieli","orcid":"https://orcid.org/0000-0002-0866-2485"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Danieli, Federico","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5119950580"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.23690000176429749,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.23690000176429749,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10142","display_name":"Formal Methods in Verification","score":0.06509999930858612,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.05559999868273735,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/recipe","display_name":"Recipe","score":0.842199981212616},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.7534000277519226},{"id":"https://openalex.org/keywords/perplexity","display_name":"Perplexity","score":0.5975000262260437},{"id":"https://openalex.org/keywords/distillation","display_name":"Distillation","score":0.4618000090122223},{"id":"https://openalex.org/keywords/scaling","display_name":"Scaling","score":0.4343000054359436}],"concepts":[{"id":"https://openalex.org/C2778671685","wikidata":"https://www.wikidata.org/wiki/Q219239","display_name":"Recipe","level":2,"score":0.842199981212616},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.7534000277519226},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6398000121116638},{"id":"https://openalex.org/C100279451","wikidata":"https://www.wikidata.org/wiki/Q372193","display_name":"Perplexity","level":3,"score":0.5975000262260437},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.47429999709129333},{"id":"https://openalex.org/C204030448","wikidata":"https://www.wikidata.org/wiki/Q101017","display_name":"Distillation","level":2,"score":0.4618000090122223},{"id":"https://openalex.org/C99844830","wikidata":"https://www.wikidata.org/wiki/Q102441924","display_name":"Scaling","level":2,"score":0.4343000054359436},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.42750000953674316},{"id":"https://openalex.org/C2777767291","wikidata":"https://www.wikidata.org/wiki/Q1080291","display_name":"Sizing","level":2,"score":0.37950000166893005},{"id":"https://openalex.org/C46686674","wikidata":"https://www.wikidata.org/wiki/Q466303","display_name":"Boosting (machine learning)","level":2,"score":0.2700999975204468},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.25099998712539673}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.14191","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.14191","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.14191","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.14191","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.49028879404067993}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"State":[0],"Space":[1],"Models":[2],"(SSMs)":[3],"such":[4],"as":[5],"Mamba":[6,104,134,188,200],"have":[7],"become":[8],"a":[9,40,82,97,112,136,154,163,167,215,255],"popular":[10],"alternative":[11],"to":[12,16,27,47,71,76,103,106,204,220],"Transformer":[13,53,165,209],"models,":[14],"due":[15],"their":[17,28],"reduced":[18],"memory":[19],"consumption":[20],"and":[21,50,121,250,254],"higher":[22],"throughput":[23],"at":[24,235],"generation":[25],"compared":[26],"Attention-based":[29,79],"counterparts.":[30],"On":[31],"the":[32,35,60,108,176,182,198,206,221,226],"other":[33],"hand,":[34],"community":[36],"has":[37,93],"built":[38],"up":[39],"considerable":[41],"body":[42],"of":[43,62,170,175,217,228],"knowledge":[44,161],"on":[45,88,247,258],"how":[46],"train":[48],"Transformers,":[49,68],"many":[51],"pretrained":[52,67],"models":[54],"are":[55],"readily":[56],"available.":[57],"To":[58,149,224],"facilitate":[59],"adoption":[61],"SSMs":[63],"while":[64],"leveraging":[65],"existing":[66],"we":[69,139,152,159,180,231],"aim":[70],"identify":[72],"an":[73,78,142,173,186],"effective":[74],"recipe":[75,145],"distill":[77,160,181],"model":[80,189,201,248],"into":[81,166,185],"Mamba-like":[83],"architecture.":[84],"In":[85],"prior":[86],"work":[87,129],"cross-architecture":[89],"distillation,":[90],"however,":[91],"it":[92],"been":[94],"shown":[95],"that":[96,190],"na\u00efve":[98],"distillation":[99,252],"procedure":[100],"from":[101,127,162],"Transformers":[102],"fails":[105],"preserve":[107,205],"original":[109,207],"teacher":[110],"performance,":[111],"limitation":[113],"often":[114],"overcome":[115],"with":[116,135,238],"hybrid":[117],"solutions":[118],"combining":[119],"Attention":[120,195],"SSM":[122],"blocks.":[123],"The":[124],"key":[125],"argument":[126],"our":[128,229],"is":[130,202],"that,":[131],"by":[132],"equipping":[133],"principled":[137,155],"initialization,":[138],"can":[140],"recover":[141],"overall":[143],"better":[144],"for":[146],"cross-architectural":[147],"distillation.":[148],"this":[150],"end,":[151],"propose":[153],"two-stage":[156],"approach:":[157],"first,":[158],"traditional":[164],"linearized":[168,183],"version":[169,184],"Attention,":[171],"using":[172],"adaptation":[174],"kernel":[177],"trick.":[178],"Then,":[179],"adapted":[187],"does":[191],"not":[192],"use":[193],"any":[194],"block.":[196],"Overall,":[197],"distilled":[199],"able":[203],"Pythia-1B":[208],"performance":[210],"in":[211],"downstream":[212],"tasks,":[213],"maintaining":[214],"perplexity":[216],"14.11":[218],"close":[219],"teacher's":[222],"13.86.":[223],"show":[225],"efficacy":[227],"recipe,":[230],"conduct":[232],"thorough":[233],"ablations":[234],"1B":[236],"scale":[237],"10B":[239],"tokens":[240,259],"varying":[241],"sequence":[242],"mixer":[243],"architecture,":[244],"scaling":[245],"analysis":[246,257],"sizes":[249],"total":[251],"tokens,":[253],"sensitivity":[256],"allocation":[260],"between":[261],"stages.":[262]},"counts_by_year":[],"updated_date":"2026-05-04T08:30:34.212998","created_date":"2026-04-18T00:00:00"}
