{"id":"https://openalex.org/W4415252515","doi":"https://doi.org/10.48550/arxiv.2509.16686","title":"EG-MLA: Embedding-Gated Multi-head Latent Attention for Scalable and Efficient LLMs","display_name":"EG-MLA: Embedding-Gated Multi-head Latent Attention for Scalable and Efficient LLMs","publication_year":2025,"publication_date":"2025-09-20","ids":{"openalex":"https://openalex.org/W4415252515","doi":"https://doi.org/10.48550/arxiv.2509.16686"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2509.16686","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2509.16686","pdf_url":"https://arxiv.org/pdf/2509.16686","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2509.16686","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5103086131","display_name":"Zhijun Cai","orcid":"https://orcid.org/0000-0003-3728-8100"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Cai, Zhengge","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5068420027","display_name":"Haowen Hou","orcid":"https://orcid.org/0000-0003-3850-3722"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hou, Haowen","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5103086131"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.986299991607666,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10215","display_name":"Semantic Web and Ontologies","score":0.9465000033378601,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.6941999793052673},{"id":"https://openalex.org/keywords/cache","display_name":"Cache","score":0.5807999968528748},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.5493999719619751},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.5325000286102295},{"id":"https://openalex.org/keywords/cpu-cache","display_name":"CPU cache","score":0.42250001430511475},{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.38429999351501465},{"id":"https://openalex.org/keywords/cache-algorithms","display_name":"Cache algorithms","score":0.37279999256134033},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.336899995803833}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8242999911308289},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.6941999793052673},{"id":"https://openalex.org/C115537543","wikidata":"https://www.wikidata.org/wiki/Q165596","display_name":"Cache","level":2,"score":0.5807999968528748},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.5493999719619751},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.5325000286102295},{"id":"https://openalex.org/C189783530","wikidata":"https://www.wikidata.org/wiki/Q352090","display_name":"CPU cache","level":3,"score":0.42250001430511475},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.38429999351501465},{"id":"https://openalex.org/C38556500","wikidata":"https://www.wikidata.org/wiki/Q13404475","display_name":"Cache algorithms","level":4,"score":0.37279999256134033},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3684999942779541},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.3537999987602234},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3440000116825104},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.336899995803833},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.32829999923706055},{"id":"https://openalex.org/C113166858","wikidata":"https://www.wikidata.org/wiki/Q5015981","display_name":"Cache pollution","level":5,"score":0.2985999882221222},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.2870999872684479},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.2822999954223633},{"id":"https://openalex.org/C166052673","wikidata":"https://www.wikidata.org/wiki/Q83021","display_name":"Empirical evidence","level":2,"score":0.26919999718666077},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.26829999685287476},{"id":"https://openalex.org/C99844830","wikidata":"https://www.wikidata.org/wiki/Q102441924","display_name":"Scaling","level":2,"score":0.26759999990463257},{"id":"https://openalex.org/C120936955","wikidata":"https://www.wikidata.org/wiki/Q2155640","display_name":"Empirical research","level":2,"score":0.26510000228881836},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.2644999921321869},{"id":"https://openalex.org/C180016635","wikidata":"https://www.wikidata.org/wiki/Q2712821","display_name":"Compression (physics)","level":2,"score":0.26409998536109924},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.2581000030040741}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2509.16686","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2509.16686","pdf_url":"https://arxiv.org/pdf/2509.16686","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2509.16686","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2509.16686","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2509.16686","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2509.16686","pdf_url":"https://arxiv.org/pdf/2509.16686","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Reducing":[0],"the":[1,73,117],"key-value":[2],"(KV)":[3],"cache":[4,63,71,102,141],"size":[5,103,142],"is":[6],"a":[7,52,57,93,110,213],"crucial":[8],"step":[9],"toward":[10],"enabling":[11,120],"efficient":[12],"inference":[13,223],"in":[14,116,139,224],"large":[15],"language":[16],"models":[17],"(LLMs),":[18],"especially":[19],"under":[20],"latency":[21],"and":[22,62,178,187,215],"memory":[23,36,165],"constraints.":[24],"While":[25,65],"Multi-Head":[26],"Attention":[27,43,91],"(MHA)":[28],"offers":[29],"strong":[30],"representational":[31,106],"power,":[32],"it":[33],"incurs":[34],"significant":[35,69],"overhead.":[37],"Recent":[38],"work":[39],"on":[40],"Multi-head":[41,89],"Latent":[42,90],"(MLA)":[44],"mitigates":[45],"this":[46,84],"by":[47],"compressing":[48],"KV":[49,70,101,125,140],"representations":[50],"into":[51],"shared":[53],"latent":[54,118],"space,":[55,119],"achieving":[56,160],"better":[58],"trade-off":[59],"between":[60],"performance":[61,81,145],"efficiency.":[64],"MLA":[66,97],"already":[67],"achieves":[68,135],"reduction,":[72],"scope":[74],"for":[75,204],"further":[76,99],"compression":[77,188],"remains":[78],"limited":[79],"without":[80],"loss.":[82],"In":[83],"paper,":[85],"we":[86,191],"propose":[87],"\\textbf{Embedding-Gated":[88],"(EG-MLA)},":[92],"novel":[94],"extension":[95],"of":[96,123],"that":[98,219],"reduces":[100],"while":[104,159],"enhancing":[105],"expressiveness.":[107],"EG-MLA":[108,134,150,194,211],"introduces":[109],"token-specific":[111],"embedding":[112,172],"gating":[113,173],"mechanism":[114,218],"applied":[115],"fine-grained":[121],"modulation":[122],"compressed":[124],"vectors":[126],"with":[127,143],"minimal":[128],"additional":[129,164],"computation.":[130],"Compared":[131],"to":[132,148,162,195],"MHA,":[133],"over":[136,196],"91.6\\%":[137],"reduction":[138],"negligible":[144],"degradation.":[146],"Relative":[147],"MLA,":[149],"consistently":[151],"improves":[152],"task":[153],"accuracy":[154],"across":[155,184],"diverse":[156],"reasoning":[157],"benchmarks":[158],"up":[161],"59.9\\%":[163],"savings.":[166],"Our":[167],"theoretical":[168],"analysis":[169],"highlights":[170],"how":[171],"induces":[174],"implicit":[175],"high-order":[176],"interactions,":[177],"empirical":[179],"evaluations":[180],"demonstrate":[181],"robust":[182],"generalization":[183],"model":[185],"scales":[186],"regimes.":[189],"Notably,":[190],"successfully":[192],"scale":[193],"1":[197],"billion":[198],"parameters,":[199],"demonstrating":[200],"its":[201],"practical":[202],"viability":[203],"large-scale":[205],"LLM":[206],"deployment.":[207],"These":[208],"results":[209],"establish":[210],"as":[212],"memory-":[214],"compute-efficient":[216],"attention":[217],"enables":[220],"scalable,":[221],"high-performance":[222],"modern":[225],"LLMs.":[226]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-10-16T00:00:00"}
