{"id":"https://openalex.org/W7157223413","doi":"https://doi.org/10.48550/arxiv.2604.24715","title":"Long-Context Aware Upcycling: A New Frontier for Hybrid LLM Scaling","display_name":"Long-Context Aware Upcycling: A New Frontier for Hybrid LLM Scaling","publication_year":2026,"publication_date":"2026-04-27","ids":{"openalex":"https://openalex.org/W7157223413","doi":"https://doi.org/10.48550/arxiv.2604.24715"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.24715","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.24715","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.24715","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5069269180","display_name":"Parsa Ashrafi Fashi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fashi, Parsa Ashrafi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134796178","display_name":"Utkarsh Saxena","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Saxena, Utkarsh","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5028862918","display_name":"Mehdi Rezagholizadeh","orcid":"https://orcid.org/0000-0003-4014-6007"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rezagholizadeh, Mehdi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5078959979","display_name":"Aref Jafari","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jafari, Aref","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134814955","display_name":"Akash Haridas","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Haridas, Akash","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134780707","display_name":"Mingyu Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Mingyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5038603539","display_name":"Vansh Bhatia","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bhatia, Vansh","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5108578601","display_name":"Guihong Li","orcid":"https://orcid.org/0000-0001-8537-8632"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Guihong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120362351","display_name":"Vikram Appia","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Appia, Vikram","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5134781363","display_name":"Emad Barsoum","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Barsoum, Emad","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":10,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.33959999680519104,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.33959999680519104,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.10180000215768814,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11948","display_name":"Machine Learning in Materials Science","score":0.07530000060796738,"subfield":{"id":"https://openalex.org/subfields/2505","display_name":"Materials Chemistry"},"field":{"id":"https://openalex.org/fields/25","display_name":"Materials Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.7351999878883362},{"id":"https://openalex.org/keywords/reuse","display_name":"Reuse","score":0.6775000095367432},{"id":"https://openalex.org/keywords/usable","display_name":"USable","score":0.6007000207901001},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.5210999846458435},{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.4875999987125397},{"id":"https://openalex.org/keywords/sequence","display_name":"Sequence (biology)","score":0.35839998722076416},{"id":"https://openalex.org/keywords/recipe","display_name":"Recipe","score":0.3578999936580658}],"concepts":[{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.7351999878883362},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7336999773979187},{"id":"https://openalex.org/C206588197","wikidata":"https://www.wikidata.org/wiki/Q846574","display_name":"Reuse","level":2,"score":0.6775000095367432},{"id":"https://openalex.org/C2780615836","wikidata":"https://www.wikidata.org/wiki/Q2471869","display_name":"USable","level":2,"score":0.6007000207901001},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.5210999846458435},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.4875999987125397},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4334000051021576},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3734999895095825},{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.35839998722076416},{"id":"https://openalex.org/C2778671685","wikidata":"https://www.wikidata.org/wiki/Q219239","display_name":"Recipe","level":2,"score":0.3578999936580658},{"id":"https://openalex.org/C28855332","wikidata":"https://www.wikidata.org/wiki/Q198099","display_name":"Quantization (signal processing)","level":2,"score":0.3343999981880188},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.32019999623298645},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.28610000014305115},{"id":"https://openalex.org/C99844830","wikidata":"https://www.wikidata.org/wiki/Q102441924","display_name":"Scaling","level":2,"score":0.28360000252723694},{"id":"https://openalex.org/C50897621","wikidata":"https://www.wikidata.org/wiki/Q2665508","display_name":"Hybrid system","level":2,"score":0.28290000557899475},{"id":"https://openalex.org/C165005293","wikidata":"https://www.wikidata.org/wiki/Q1074500","display_name":"Chip","level":2,"score":0.2700999975204468},{"id":"https://openalex.org/C2781235140","wikidata":"https://www.wikidata.org/wiki/Q275131","display_name":"Scratch","level":2,"score":0.262800008058548},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.25920000672340393},{"id":"https://openalex.org/C2777735758","wikidata":"https://www.wikidata.org/wiki/Q817765","display_name":"Path (computing)","level":2,"score":0.25450000166893005}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.24715","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.24715","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.24715","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.24715","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.5272955894470215}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Hybrid":[0],"sequence":[1,10],"models":[2],"that":[3,69],"combine":[4],"efficient":[5,74,109],"Transformer":[6,33,45,75],"components":[7],"with":[8,73,89],"linear":[9,82],"modeling":[11],"blocks":[12,83],"are":[13,22],"a":[14,39,65],"promising":[15],"alternative":[16],"to":[17,30,42,106,121],"pure":[18],"Transformers,":[19],"but":[20],"most":[21],"still":[23],"pretrained":[24,44],"from":[25],"scratch":[26],"and":[27,54,81,93,111,124,144,148,156,159,195],"therefore":[28],"fail":[29],"reuse":[31],"existing":[32],"checkpoints.":[34],"We":[35,58],"study":[36],"upcycling":[37,67],"as":[38,170],"practical":[40],"path":[41],"convert":[43],"LLMs":[46],"into":[47],"hybrid":[48,164],"architectures":[49],"while":[50,131],"preserving":[51],"short-context":[52],"quality":[53],"improving":[55],"long-context":[56,66,91,157,167],"capability.":[57],"call":[59],"our":[60,127],"solution":[61],"\\emph{HyLo}":[62],"(HYbrid":[63],"LOng-context):":[64],"recipe":[68],"combines":[70],"architectural":[71],"adaptation":[72],"blocks,":[76],"Multi-Head":[77],"Latent":[78],"Attention":[79],"(MLA),":[80],"(Mamba2":[84],"or":[85],"Gated":[86],"DeltaNet),":[87],"together":[88],"staged":[90],"training":[92],"teacher-guided":[94],"distillation":[95],"for":[96],"stable":[97],"optimization.":[98],"HyLo":[99,151],"extends":[100],"usable":[101],"context":[102],"length":[103],"by":[104,115],"up":[105,120],"$32\\times$":[107],"through":[108],"post-training":[110],"reduces":[112],"KV-cache":[113],"memory":[114,138],"more":[116],"than":[117],"$90\\%$,":[118],"enabling":[119],"2M-token":[122],"prefill":[123],"decoding":[125],"in":[126],"\\texttt{vLLM}":[128],"inference":[129],"stack,":[130],"comparable":[132],"Llama":[133],"baselines":[134,165],"run":[135],"out":[136],"of":[137],"beyond":[139],"64K":[140],"context.":[141],"Across":[142],"1B-":[143],"3B-scale":[145],"settings":[146],"(Llama-":[147],"Qwen-based":[149],"variants),":[150],"delivers":[152],"consistently":[153],"strong":[154],"short-":[155],"performance":[158],"significantly":[160,182],"outperforms":[161,183],"state-of-the-art":[162],"upcycled":[163],"on":[166,178,186,189],"evaluations":[168],"such":[169],"RULER.":[171],"Notably,":[172],"at":[173],"similar":[174],"scale,":[175],"HyLo-Qwen-1.7B":[176],"trained":[177],"only":[179],"10B":[180],"tokens":[181],"JetNemotron":[184],"(trained":[185],"400B":[187],"tokens)":[188],"GSM8K,":[190],"Lm-Harness":[191],"common":[192],"sense":[193],"reasoning":[194],"RULER-64K.":[196]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-29T00:00:00"}
