{"id":"https://openalex.org/W7151352031","doi":"https://doi.org/10.48550/arxiv.2604.03444","title":"Olmo Hybrid: From Theory to Practice and Back","display_name":"Olmo Hybrid: From Theory to Practice and Back","publication_year":2026,"publication_date":"2026-04-03","ids":{"openalex":"https://openalex.org/W7151352031","doi":"https://doi.org/10.48550/arxiv.2604.03444"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.03444","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.03444","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.03444","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5133119031","display_name":"William Merrill","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Merrill, William","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133082795","display_name":"Yanhong Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Yanhong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120845550","display_name":"Tyler Romero","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Romero, Tyler","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5071101707","display_name":"Anej Svete","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Svete, Anej","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5119284903","display_name":"Caia Costello","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Costello, Caia","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5029074038","display_name":"Pradeep Dasigi","orcid":"https://orcid.org/0000-0001-7127-1316"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dasigi, Pradeep","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059265033","display_name":"Dirk Groeneveld","orcid":"https://orcid.org/0000-0002-8274-768X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Groeneveld, Dirk","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133079851","display_name":"David Heineman","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Heineman, David","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5037115259","display_name":"Bailey Kuehl","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kuehl, Bailey","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5083548887","display_name":"Nathan Lambert","orcid":"https://orcid.org/0000-0002-9997-6817"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lambert, Nathan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133138804","display_name":"Jacob Morrison","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Chuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133110226","display_name":"Luca Soldaini","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lo, Kyle","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Malik, Saumya","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Malik, Saumya","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Matusz, DJ","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Matusz, DJ","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Minixhofer, Benjamin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Minixhofer, Benjamin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Morrison, Jacob","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Morrison, Jacob","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Soldaini, Luca","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Soldaini, Luca","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Timbers, Finbarr","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Timbers, Finbarr","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Walsh, Pete","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Walsh, Pete","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Smith, Noah A.","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Smith, Noah A.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Hajishirzi, Hannaneh","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hajishirzi, Hannaneh","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Sabharwal, Ashish","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sabharwal, Ashish","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":22,"corresponding_author_ids":["https://openalex.org/A5133119031"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.29760000109672546,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.29760000109672546,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.07329999655485153,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.062199998646974564,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.6284999847412109},{"id":"https://openalex.org/keywords/scaling","display_name":"Scaling","score":0.5509999990463257},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.4478999972343445},{"id":"https://openalex.org/keywords/extension","display_name":"Extension (predicate logic)","score":0.37880000472068787},{"id":"https://openalex.org/keywords/clarity","display_name":"CLARITY","score":0.36809998750686646},{"id":"https://openalex.org/keywords/dual","display_name":"Dual (grammatical number)","score":0.35339999198913574},{"id":"https://openalex.org/keywords/counterfactual-thinking","display_name":"Counterfactual thinking","score":0.3197000026702881}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6978999972343445},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.6284999847412109},{"id":"https://openalex.org/C99844830","wikidata":"https://www.wikidata.org/wiki/Q102441924","display_name":"Scaling","level":2,"score":0.5509999990463257},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.4478999972343445},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4352000057697296},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.38690000772476196},{"id":"https://openalex.org/C2778029271","wikidata":"https://www.wikidata.org/wiki/Q5421931","display_name":"Extension (predicate logic)","level":2,"score":0.37880000472068787},{"id":"https://openalex.org/C2777146004","wikidata":"https://www.wikidata.org/wiki/Q14949826","display_name":"CLARITY","level":2,"score":0.36809998750686646},{"id":"https://openalex.org/C2780980858","wikidata":"https://www.wikidata.org/wiki/Q110022","display_name":"Dual (grammatical number)","level":2,"score":0.35339999198913574},{"id":"https://openalex.org/C108650721","wikidata":"https://www.wikidata.org/wiki/Q1783253","display_name":"Counterfactual thinking","level":2,"score":0.3197000026702881},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.3154999911785126},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.313400000333786},{"id":"https://openalex.org/C2984842247","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep neural networks","level":3,"score":0.30320000648498535},{"id":"https://openalex.org/C92811239","wikidata":"https://www.wikidata.org/wiki/Q20998670","display_name":"Expressivity","level":2,"score":0.3009999990463257},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.2847000062465668},{"id":"https://openalex.org/C50897621","wikidata":"https://www.wikidata.org/wiki/Q2665508","display_name":"Hybrid system","level":2,"score":0.28369998931884766},{"id":"https://openalex.org/C2779714256","wikidata":"https://www.wikidata.org/wiki/Q25305062","display_name":"Multiple Models","level":2,"score":0.2581000030040741},{"id":"https://openalex.org/C188147891","wikidata":"https://www.wikidata.org/wiki/Q147638","display_name":"Cognitive science","level":1,"score":0.25110000371932983}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.03444","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.03444","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.03444","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.03444","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.465425580739975,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Recent":[0],"work":[1],"has":[2],"demonstrated":[3],"the":[4,31,39,54,76,113,137,150,158,210,229],"potential":[5,32],"of":[6,34,43,56,78,139],"non-transformer":[7],"language":[8,230],"models,":[9],"especially":[10],"linear":[11,81],"recurrent":[12,222],"neural":[13],"networks":[14],"(RNNs)":[15],"and":[16,22,41,80,133,198,221],"hybrid":[17,57,70,140,151,217],"models":[18,58,71,141,218,249],"that":[19,69,124,149,216,250],"mix":[20],"recurrence":[21],"attention.":[23],"Yet":[24],"there":[25],"is":[26],"no":[27],"consensus":[28],"on":[29,62,170,182],"whether":[30],"benefits":[33],"these":[35],"new":[36],"architectures":[37],"justify":[38],"risk":[40],"effort":[42],"scaling":[44,178,207],"them":[45],"up.":[46],"To":[47,189],"address":[48],"this,":[49],"we":[50,67,98,194],"provide":[51],"evidence":[52],"for":[53],"advantages":[55],"over":[59],"pure":[60],"transformers":[61,79],"several":[63],"fronts.":[64],"First,":[65],"theoretically,":[66],"show":[68,123],"do":[72],"not":[73,233],"merely":[74,234],"inherit":[75],"expressivity":[77,169,202],"RNNs,":[82],"but":[83,111,240],"can":[84],"express":[85],"tasks":[86,184],"beyond":[87],"both,":[88],"such":[89],"as":[90,241],"code":[91],"execution.":[92],"Putting":[93],"this":[94,191],"theory":[95,197],"to":[96,107,186,196,205,228,235,245],"practice,":[97],"train":[99],"Olmo":[100,108,125,128],"Hybrid,":[101],"a":[102,143,225,242],"7B-parameter":[103],"model":[104,152],"largely":[105],"comparable":[106],"3":[109,129],"7B":[110],"with":[112],"sliding":[114],"window":[115],"layers":[116,223],"replaced":[117],"by":[118],"Gated":[119],"DeltaNet":[120],"layers.":[121],"We":[122,147],"Hybrid":[126],"outperforms":[127],"across":[130],"standard":[131],"pretraining":[132],"mid-training":[134],"evaluations,":[135],"demonstrating":[136],"benefit":[138],"in":[142,176],"controlled,":[144],"large-scale":[145],"setting.":[146],"find":[148],"scales":[153],"significantly":[154],"more":[155,247],"efficiently":[156],"than":[157],"transformer,":[159],"explaining":[160],"its":[161,165],"higher":[162],"performance.":[163],"However,":[164],"unclear":[166],"why":[167,200],"greater":[168],"specific":[171],"formal":[172],"problems":[173],"should":[174,203],"result":[175],"better":[177,206,252],"or":[179],"superior":[180],"performance":[181],"downstream":[183],"unrelated":[185],"those":[187],"problems.":[188],"explain":[190],"apparent":[192],"gap,":[193],"return":[195],"argue":[199],"increased":[201],"translate":[204],"efficiency,":[208],"completing":[209],"loop.":[211],"Overall,":[212],"our":[213],"results":[214],"suggest":[215],"mixing":[219],"attention":[220],"are":[224],"powerful":[226],"extension":[227],"modeling":[231],"paradigm:":[232],"reduce":[236],"memory":[237],"during":[238,253],"inference,":[239],"fundamental":[243],"way":[244],"obtain":[246],"expressive":[248],"scale":[251],"pretraining.":[254]},"counts_by_year":[],"updated_date":"2026-04-16T08:26:57.006410","created_date":"2026-04-08T00:00:00"}
