{"id":"https://openalex.org/W7161811117","doi":"https://doi.org/10.48550/arxiv.2605.20035","title":"Stage-adaptive Token Selection for Efficient Omni-modal LLMs","display_name":"Stage-adaptive Token Selection for Efficient Omni-modal LLMs","publication_year":2026,"publication_date":"2026-05-19","ids":{"openalex":"https://openalex.org/W7161811117","doi":"https://doi.org/10.48550/arxiv.2605.20035"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.20035","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.20035","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.20035","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5113398435","display_name":"Zijie Xin","orcid":"https://orcid.org/0000-0002-9220-8735"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xin, Zijie","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136596561","display_name":"Jie Yang","orcid":"https://orcid.org/0009-0002-1860-8367"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Jie","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136539284","display_name":"Ruixiang Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Ruixiang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136603744","display_name":"Tianyi Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Tianyi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5076304703","display_name":"Fengyun Rao","orcid":"https://orcid.org/0000-0002-2868-2088"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rao, Fengyun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136579718","display_name":"Jing Lyu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lyu, Jing","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5060270456","display_name":"Xirong Li","orcid":"https://orcid.org/0000-0002-0220-8310"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Xirong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.46630001068115234,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.46630001068115234,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.13339999318122864,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.11729999631643295,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.8722000122070312},{"id":"https://openalex.org/keywords/selection","display_name":"Selection (genetic algorithm)","score":0.5493999719619751},{"id":"https://openalex.org/keywords/redundancy","display_name":"Redundancy (engineering)","score":0.535099983215332},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.531499981880188},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.527400016784668},{"id":"https://openalex.org/keywords/rapid-serial-visual-presentation","display_name":"Rapid serial visual presentation","score":0.3953000009059906},{"id":"https://openalex.org/keywords/token-passing","display_name":"Token passing","score":0.3919999897480011},{"id":"https://openalex.org/keywords/relevance","display_name":"Relevance (law)","score":0.36390000581741333},{"id":"https://openalex.org/keywords/encoding","display_name":"Encoding (memory)","score":0.3490999937057495}],"concepts":[{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.8722000122070312},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8360000252723694},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.5493999719619751},{"id":"https://openalex.org/C152124472","wikidata":"https://www.wikidata.org/wiki/Q1204361","display_name":"Redundancy (engineering)","level":2,"score":0.535099983215332},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.531499981880188},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.527400016784668},{"id":"https://openalex.org/C2779668609","wikidata":"https://www.wikidata.org/wiki/Q623092","display_name":"Rapid serial visual presentation","level":3,"score":0.3953000009059906},{"id":"https://openalex.org/C115067241","wikidata":"https://www.wikidata.org/wiki/Q1639854","display_name":"Token passing","level":3,"score":0.3919999897480011},{"id":"https://openalex.org/C158154518","wikidata":"https://www.wikidata.org/wiki/Q7310970","display_name":"Relevance (law)","level":2,"score":0.36390000581741333},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.3490999937057495},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.34389999508857727},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3357999920845032},{"id":"https://openalex.org/C111335779","wikidata":"https://www.wikidata.org/wiki/Q3454686","display_name":"Reduction (mathematics)","level":2,"score":0.32659998536109924},{"id":"https://openalex.org/C2780186347","wikidata":"https://www.wikidata.org/wiki/Q11414","display_name":"Subnetwork","level":2,"score":0.32260000705718994},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.3116999864578247},{"id":"https://openalex.org/C162307627","wikidata":"https://www.wikidata.org/wiki/Q204833","display_name":"Enhanced Data Rates for GSM Evolution","level":2,"score":0.2994000017642975},{"id":"https://openalex.org/C124304363","wikidata":"https://www.wikidata.org/wiki/Q673661","display_name":"Abstraction","level":2,"score":0.28700000047683716},{"id":"https://openalex.org/C2775973920","wikidata":"https://www.wikidata.org/wiki/Q3252726","display_name":"Selection algorithm","level":3,"score":0.27810001373291016},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.27630001306533813},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.2736999988555908},{"id":"https://openalex.org/C2778751112","wikidata":"https://www.wikidata.org/wiki/Q835016","display_name":"Window (computing)","level":2,"score":0.2660999894142151},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.2612999975681305},{"id":"https://openalex.org/C138236772","wikidata":"https://www.wikidata.org/wiki/Q25098575","display_name":"Edge device","level":3,"score":0.2558000087738037},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.2533999979496002},{"id":"https://openalex.org/C19768560","wikidata":"https://www.wikidata.org/wiki/Q320727","display_name":"Dependency (UML)","level":2,"score":0.251800000667572}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.20035","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.20035","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.20035","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.20035","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Omni-modal":[0],"large":[1],"language":[2],"models":[3],"(om-LLMs)":[4],"achieve":[5],"unified":[6],"audio-visual":[7],"understanding":[8],"by":[9,115],"encoding":[10],"video":[11],"and":[12,91,98,151,183,197,206],"audio":[13,92,198],"into":[14],"temporally":[15],"aligned":[16],"token":[17,39,69,83,124],"sequences":[18],"interleaved":[19],"at":[20],"the":[21,31,58,81,132,143,154,215],"window":[22],"level.":[23],"However,":[24],"processing":[25],"these":[26],"dense":[27],"non-textual":[28,107,173],"tokens":[29,55,108,148,174],"throughout":[30],"LLM":[32,59],"incurs":[33],"substantial":[34],"computational":[35],"overhead.":[36],"Although":[37],"training-free":[38],"selection":[40,125],"can":[41],"reduce":[42],"this":[43,76,116],"cost,":[44],"existing":[45],"methods":[46],"either":[47],"focus":[48],"on":[49,181],"visual-only":[50],"inputs":[51],"or":[52],"prune":[53],"om-LLM":[54,129],"only":[56,193],"before":[57],"with":[60,101],"fixed":[61],"per-modality":[62],"ratios,":[63],"failing":[64],"to":[65,160],"capture":[66],"how":[67],"cross-modal":[68,112,176],"importance":[70],"evolves":[71],"across":[72,149],"layers.":[73],"To":[74],"address":[75],"limitation,":[77],"we":[78,118],"first":[79],"analyze":[80],"layer-wise":[82],"dependency":[84],"of":[85,195,214],"om-LLMs.":[86],"We":[87],"find":[88],"that":[89,104,186],"visual":[90,196],"dependencies":[93],"follow":[94],"a":[95,121,202,207],"block-wise":[96],"pattern":[97],"gradually":[99],"weaken":[100],"depth,":[102],"indicating":[103],"many":[105],"late-layer":[106],"become":[109],"redundant":[110],"after":[111],"fusion.":[113],"Motivated":[114],"observation,":[117],"propose":[119],"SEATS,":[120],"training-free,":[122],"stage-adaptive":[123],"method":[126],"for":[127],"efficient":[128],"inference.":[130],"Before":[131],"LLM,":[133,144],"SEATS":[134,187],"removes":[135,170],"spatiotemporal":[136],"redundancy":[137],"via":[138],"attention-weighted":[139],"diversity":[140],"selection.":[141],"Inside":[142],"it":[145,169,200],"progressively":[146],"prunes":[147],"blocks":[150],"dynamically":[152],"allocates":[153],"retention":[155],"budget":[156],"from":[157],"temporal":[158],"windows":[159],"modalities":[161],"using":[162],"query":[163],"relevance":[164],"scores.":[165],"In":[166],"late":[167],"layers,":[168],"all":[171],"remaining":[172],"once":[175],"fusion":[177],"is":[178],"complete.":[179],"Experiments":[180],"Qwen2.5-Omni":[182],"Qwen3-Omni":[184],"demonstrate":[185],"effectively":[188],"improves":[189],"inference":[190],"efficiency.":[191],"Retaining":[192],"10%":[194],"tokens,":[199],"achieves":[201],"9.3x":[203],"FLOPs":[204],"reduction":[205],"4.8x":[208],"prefill":[209],"speedup":[210],"while":[211],"preserving":[212],"96.3%":[213],"original":[216],"performance.":[217]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-21T00:00:00"}
