{"id":"https://openalex.org/W7131641651","doi":"https://doi.org/10.48550/arxiv.2602.21371","title":"Interleaved Head Attention","display_name":"Interleaved Head Attention","publication_year":2026,"publication_date":"2026-02-24","ids":{"openalex":"https://openalex.org/W7131641651","doi":"https://doi.org/10.48550/arxiv.2602.21371"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.21371","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5035307363","display_name":"Sai Surya Duvvuri","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Duvvuri, Sai Surya","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5086331456","display_name":"Chanakya Ekbote","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ekbote, Chanakya","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039871170","display_name":"Rachit Bansal","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bansal, Rachit","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126942968","display_name":"Rishabh Tiwari","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tiwari, Rishabh","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5077706355","display_name":"Devvrit Khatri","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Khatri, Devvrit","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5037978510","display_name":"David Brandfonbrener","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Brandfonbrener, David","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126855586","display_name":"Paul Liang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liang, Paul","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126856166","display_name":"Inderjit Dhillon","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dhillon, Inderjit","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5126910001","display_name":"Manzil Zaheer","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zaheer, Manzil","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5035307363"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.4357999861240387,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.4357999861240387,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.2671000063419342,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.07999999821186066,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/head","display_name":"Head (geology)","score":0.644599974155426},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5961999893188477},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.5896999835968018},{"id":"https://openalex.org/keywords/scaling","display_name":"Scaling","score":0.48489999771118164},{"id":"https://openalex.org/keywords/overhead","display_name":"Overhead (engineering)","score":0.47040000557899475},{"id":"https://openalex.org/keywords/polynomial","display_name":"Polynomial","score":0.42669999599456787},{"id":"https://openalex.org/keywords/computational-complexity-theory","display_name":"Computational complexity theory","score":0.3912000060081482},{"id":"https://openalex.org/keywords/mixing","display_name":"Mixing (physics)","score":0.3402999937534332}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7283999919891357},{"id":"https://openalex.org/C2780312720","wikidata":"https://www.wikidata.org/wiki/Q5689100","display_name":"Head (geology)","level":2,"score":0.644599974155426},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5961999893188477},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.5896999835968018},{"id":"https://openalex.org/C99844830","wikidata":"https://www.wikidata.org/wiki/Q102441924","display_name":"Scaling","level":2,"score":0.48489999771118164},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.47780001163482666},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.47040000557899475},{"id":"https://openalex.org/C90119067","wikidata":"https://www.wikidata.org/wiki/Q43260","display_name":"Polynomial","level":2,"score":0.42669999599456787},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.4065999984741211},{"id":"https://openalex.org/C179799912","wikidata":"https://www.wikidata.org/wiki/Q205084","display_name":"Computational complexity theory","level":2,"score":0.3912000060081482},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.38999998569488525},{"id":"https://openalex.org/C138777275","wikidata":"https://www.wikidata.org/wiki/Q6884054","display_name":"Mixing (physics)","level":2,"score":0.3402999937534332},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.33340001106262207},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.31630000472068787},{"id":"https://openalex.org/C311688","wikidata":"https://www.wikidata.org/wiki/Q2393193","display_name":"Time complexity","level":2,"score":0.3154999911785126},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.31130000948905945},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.30630001425743103},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.30480000376701355},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.29190000891685486},{"id":"https://openalex.org/C183322885","wikidata":"https://www.wikidata.org/wiki/Q17007702","display_name":"Context model","level":3,"score":0.27059999108314514},{"id":"https://openalex.org/C6802819","wikidata":"https://www.wikidata.org/wiki/Q1072174","display_name":"Linear system","level":2,"score":0.26969999074935913},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.26109999418258667},{"id":"https://openalex.org/C111335779","wikidata":"https://www.wikidata.org/wiki/Q3454686","display_name":"Reduction (mathematics)","level":2,"score":0.2605000138282776},{"id":"https://openalex.org/C94375191","wikidata":"https://www.wikidata.org/wiki/Q11205","display_name":"Arithmetic","level":1,"score":0.25760000944137573},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.2551000118255615},{"id":"https://openalex.org/C165743212","wikidata":"https://www.wikidata.org/wiki/Q104555","display_name":"Ruler","level":2,"score":0.25099998712539673}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.21371","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.21371","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.21371","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.21371","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Multi-Head":[0],"Attention":[1,77],"(MHA)":[2],"is":[3,95],"the":[4,57,141,155],"core":[5],"computational":[6],"primitive":[7],"underlying":[8],"modern":[9],"Large":[10],"Language":[11],"Models":[12],"(LLMs).":[13],"However,":[14],"MHA":[15],"suffers":[16],"from":[17,53],"a":[18,65,96],"fundamental":[19],"linear":[20,98],"scaling":[21],"limitation:":[22],"$H$":[23,28,102],"attention":[24,30,38,119],"heads":[25,36,114,163],"produce":[26],"exactly":[27],"independent":[29],"matrices,":[31],"with":[32,123],"no":[33],"communication":[34],"between":[35,110],"during":[37],"computation.":[39],"This":[40],"becomes":[41],"problematic":[42],"for":[43,151,166,183],"multi-step":[44],"reasoning,":[45],"where":[46,91],"correct":[47],"answers":[48],"depend":[49],"on":[50,140,154,175,185],"aggregating":[51],"evidence":[52],"multiple":[54],"parts":[55],"of":[56,67,100,136,138],"context":[58],"and":[59,106,112,153,191],"composing":[60],"latent":[61],"token-to-token":[62],"relations":[63],"over":[64,197],"chain":[66],"intermediate":[68],"inferences.":[69],"To":[70],"address":[71],"this,":[72],"we":[73],"propose":[74],"Interleaved":[75],"Head":[76],"(IHA),":[78],"which":[79],"enables":[80],"cross-head":[81],"mixing":[82],"by":[83,177,189,193],"constructing":[84],"$P$":[85],"pseudo-heads":[86],"per":[87,121],"head":[88,122],"(typically":[89],"$P=H$),":[90],"each":[92],"pseudo":[93],"query/key/value":[94],"learned":[97],"combination":[99],"all":[101],"original":[103],"queries,":[104],"keys":[105],"values":[107],"respectively.":[108],"Interactions":[109],"pseudo-query":[111],"pseudo-key":[113],"induce":[115],"up":[116],"to":[117],"$P^2$":[118],"patterns":[120],"modest":[124],"parameter":[125],"overhead":[126],"$\\mathcal{O}(H^2P)$.":[127],"We":[128],"provide":[129],"theory":[130],"showing":[131],"improved":[132],"efficiency":[133],"in":[134],"terms":[135],"number":[137],"parameters":[139,148],"synthetic":[142,156],"Polynomial":[143],"task":[144,159],"(IHA":[145,160],"uses":[146,161],"$\u0398(\\sqrt{k}n^2)$":[147],"vs.":[149,164],"$\u0398(kn^2)$":[150],"MHA)":[152],"order-sensitive":[157],"CPM-3":[158],"$\\lceil\\sqrt{N_{\\max}}\\rceil$":[162],"$N_{\\max}$":[165],"MHA).":[167],"On":[168],"real-world":[169],"benchmarks,":[170],"IHA":[171],"improves":[172,187],"Multi-Key":[173],"retrieval":[174],"RULER":[176],"10-20%":[178],"(4k-16k)":[179],"and,":[180],"after":[181],"fine-tuning":[182],"reasoning":[184],"OpenThoughts,":[186],"GSM8K":[188],"5.8%":[190],"MATH-500":[192],"2.8%":[194],"(Majority":[195],"Vote)":[196],"full":[198],"attention.":[199]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-02-27T00:00:00"}
