{"id":"https://openalex.org/W4399657245","doi":"https://doi.org/10.48550/arxiv.2406.07887","title":"An Empirical Study of Mamba-based Language Models","display_name":"An Empirical Study of Mamba-based Language Models","publication_year":2024,"publication_date":"2024-06-12","ids":{"openalex":"https://openalex.org/W4399657245","doi":"https://doi.org/10.48550/arxiv.2406.07887"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2406.07887","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2406.07887","pdf_url":"https://arxiv.org/pdf/2406.07887","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":null},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2406.07887","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5039817377","display_name":"Roger Waleffe","orcid":"https://orcid.org/0000-0002-3795-4997"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Waleffe, Roger","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5025693806","display_name":"Wonmin Byeon","orcid":"https://orcid.org/0000-0002-4780-4749"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Byeon, Wonmin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5099124114","display_name":"Duncan Riach","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Riach, Duncan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5048591183","display_name":"Brandon Norick","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Norick, Brandon","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5085834120","display_name":"Vijay Anand Korthikanti","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Korthikanti, Vijay","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5091734792","display_name":"Tri Dao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dao, Tri","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5025386668","display_name":"Albert Gu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gu, Albert","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5028525720","display_name":"Ali Hatamizadeh","orcid":"https://orcid.org/0000-0002-5349-1996"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hatamizadeh, Ali","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5083716814","display_name":"Sudhakar Singh","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Singh, Sudhakar","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5079514101","display_name":"Deepak Narayanan","orcid":"https://orcid.org/0000-0002-3020-2848"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Narayanan, Deepak","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5099124113","display_name":"Garvit Kulshreshtha","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kulshreshtha, Garvit","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101530882","display_name":"Vartika Singh","orcid":"https://orcid.org/0000-0002-9291-6221"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Singh, Vartika","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5010396318","display_name":"Jared Casper","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Casper, Jared","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5063511312","display_name":"Jan Kautz","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kautz, Jan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5072436307","display_name":"Mohammad Shoeybi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shoeybi, Mohammad","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5066242985","display_name":"Bryan Catanzaro","orcid":"https://orcid.org/0000-0003-0034-7728"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Catanzaro, Bryan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":16,"corresponding_author_ids":["https://openalex.org/A5039817377"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":9,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T13912","display_name":"Language, Linguistics, Cultural Analysis","score":0.6866000294685364,"subfield":{"id":"https://openalex.org/subfields/1203","display_name":"Language and Linguistics"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T13912","display_name":"Language, Linguistics, Cultural Analysis","score":0.6866000294685364,"subfield":{"id":"https://openalex.org/subfields/1203","display_name":"Language and Linguistics"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.39409688115119934}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.39409688115119934}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2406.07887","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2406.07887","pdf_url":"https://arxiv.org/pdf/2406.07887","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":null},{"id":"doi:10.48550/arxiv.2406.07887","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2406.07887","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2406.07887","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2406.07887","pdf_url":"https://arxiv.org/pdf/2406.07887","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":null},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052","https://openalex.org/W2382290278","https://openalex.org/W4395014643"],"abstract_inverted_index":{"Selective":[0],"state-space":[1],"models":[2,96,111,140,280],"(SSMs)":[3],"like":[4],"Mamba":[5,139],"overcome":[6],"some":[7],"of":[8,11,45,79,102,117,131,137,231,283],"the":[9,27,41,75,99,135,188,192,232,250,259,269,274],"shortcomings":[10],"Transformers,":[12,46],"such":[13],"as":[14,271,273,281],"quadratic":[15],"computational":[16],"complexity":[17],"with":[18],"sequence":[19],"length":[20],"and":[21,77,94,122,206,234,241],"large":[22],"inference-time":[23],"memory":[24],"requirements":[25],"from":[26],"key-value":[28],"cache.":[29],"Moreover,":[30],"recent":[31],"studies":[32,60],"have":[33,63],"shown":[34],"that":[35,151,187],"SSMs":[36,70,154],"can":[37,141],"match":[38,142,155,256],"or":[39,156,172,180,257],"exceed":[40,157,258],"language":[42],"modeling":[43],"capabilities":[44],"making":[47],"them":[48],"an":[49,245],"attractive":[50],"alternative.":[51],"In":[52,183],"a":[53,87,113,128],"controlled":[54],"setting":[55],"(e.g.,":[56,176],"same":[57,100],"data),":[58],"however,":[59],"so":[61],"far":[62],"only":[64],"presented":[65],"small":[66],"scale":[67],"experiments":[68,228],"comparing":[69],"to":[71,104,112,209,212,237,254,277],"Transformers.":[72],"To":[73,221,263],"understand":[74],"strengths":[76],"weaknesses":[78],"these":[80,110],"architectures":[81],"at":[82,144,218],"larger":[83,145],"scales,":[84],"we":[85,133,185,200,225,267],"present":[86],"direct":[88],"comparison":[89],"between":[90],"8B-parameter":[91],"Mamba,":[92],"Mamba-2,":[93,119],"Transformer":[95,194,235,260],"trained":[97],"on":[98,159,166,195,204,261],"datasets":[101],"up":[103,211],"3.5T":[105],"tokens.":[106],"We":[107],"also":[108],"compare":[109],"hybrid":[114,251],"architecture":[115],"consisting":[116],"43%":[118],"7%":[120],"attention,":[121],"50%":[123],"MLP":[124],"layers":[125],"(Mamba-2-Hybrid).":[126],"Using":[127],"diverse":[129],"set":[130],"tasks,":[132,161,249],"answer":[134],"question":[136],"whether":[138],"Transformers":[143,158,165],"training":[146],"budgets.":[147],"Our":[148],"results":[149],"show":[150],"while":[152],"pure":[153],"many":[160],"they":[162],"lag":[163],"behind":[164],"tasks":[167,199],"which":[168],"require":[169],"strong":[170],"copying":[171],"in-context":[173],"learning":[174],"abilities":[175],"5-shot":[177],"MMLU,":[178],"Phonebook)":[179],"long-context":[181,223,248],"reasoning.":[182],"contrast,":[184],"find":[186],"8B":[189,193],"Mamba-2-Hybrid":[190,233],"exceeds":[191],"all":[196],"12":[197],"standard":[198],"evaluated":[201],"(+2.65":[202],"points":[203],"average)":[205],"is":[207],"predicted":[208],"be":[210],"8x":[213],"faster":[214],"when":[215],"generating":[216],"tokens":[217],"inference":[219],"time.":[220],"validate":[222],"capabilities,":[224],"provide":[226],"additional":[227,246],"evaluating":[229],"variants":[230],"extended":[236],"support":[238],"16K,":[239],"32K,":[240],"128K":[242],"sequences.":[243],"On":[244],"23":[247],"model":[252],"continues":[253],"closely":[255],"average.":[262],"enable":[264],"further":[265],"study,":[266],"release":[268],"checkpoints":[270],"well":[272],"code":[275],"used":[276],"train":[278],"our":[279],"part":[282],"NVIDIA's":[284],"Megatron-LM":[285],"project.":[286]},"counts_by_year":[{"year":2025,"cited_by_count":7},{"year":2024,"cited_by_count":2}],"updated_date":"2026-05-04T08:30:34.212998","created_date":"2025-10-10T00:00:00"}
