{"id":"https://openalex.org/W7118244052","doi":"https://doi.org/10.48550/arxiv.2601.01322","title":"LinMU: Multimodal Understanding Made Linear","display_name":"LinMU: Multimodal Understanding Made Linear","publication_year":2026,"publication_date":"2026-01-04","ids":{"openalex":"https://openalex.org/W7118244052","doi":"https://doi.org/10.48550/arxiv.2601.01322"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2601.01322","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.01322","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2601.01322","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5122038572","display_name":"Hongjie Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Wang, Hongjie","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5086131079","display_name":"Niraj K. Jha","orcid":"https://orcid.org/0000-0002-1539-0369"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jha, Niraj K.","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5122038572"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9480999708175659,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9480999708175659,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.010300000198185444,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.009499999694526196,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.6381000280380249},{"id":"https://openalex.org/keywords/quadratic-equation","display_name":"Quadratic equation","score":0.5364999771118164},{"id":"https://openalex.org/keywords/enhanced-data-rates-for-gsm-evolution","display_name":"Enhanced Data Rates for GSM Evolution","score":0.5182999968528748},{"id":"https://openalex.org/keywords/throughput","display_name":"Throughput","score":0.508899986743927},{"id":"https://openalex.org/keywords/computational-complexity-theory","display_name":"Computational complexity theory","score":0.3917999863624573},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.3700000047683716},{"id":"https://openalex.org/keywords/software-deployment","display_name":"Software deployment","score":0.36800000071525574},{"id":"https://openalex.org/keywords/offset","display_name":"Offset (computer science)","score":0.33149999380111694}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.776199996471405},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.6381000280380249},{"id":"https://openalex.org/C129844170","wikidata":"https://www.wikidata.org/wiki/Q41299","display_name":"Quadratic equation","level":2,"score":0.5364999771118164},{"id":"https://openalex.org/C162307627","wikidata":"https://www.wikidata.org/wiki/Q204833","display_name":"Enhanced Data Rates for GSM Evolution","level":2,"score":0.5182999968528748},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.508899986743927},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.49549999833106995},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.43160000443458557},{"id":"https://openalex.org/C179799912","wikidata":"https://www.wikidata.org/wiki/Q205084","display_name":"Computational complexity theory","level":2,"score":0.3917999863624573},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.3783999979496002},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.3700000047683716},{"id":"https://openalex.org/C105339364","wikidata":"https://www.wikidata.org/wiki/Q2297740","display_name":"Software deployment","level":2,"score":0.36800000071525574},{"id":"https://openalex.org/C175291020","wikidata":"https://www.wikidata.org/wiki/Q1156822","display_name":"Offset (computer science)","level":2,"score":0.33149999380111694},{"id":"https://openalex.org/C190839683","wikidata":"https://www.wikidata.org/wiki/Q2448197","display_name":"Train","level":2,"score":0.32829999923706055},{"id":"https://openalex.org/C138236772","wikidata":"https://www.wikidata.org/wiki/Q25098575","display_name":"Edge device","level":3,"score":0.32409998774528503},{"id":"https://openalex.org/C2779227376","wikidata":"https://www.wikidata.org/wiki/Q6505497","display_name":"Layer (electronics)","level":2,"score":0.3052000105381012},{"id":"https://openalex.org/C13662910","wikidata":"https://www.wikidata.org/wiki/Q193139","display_name":"Trajectory","level":2,"score":0.30320000648498535},{"id":"https://openalex.org/C2778751112","wikidata":"https://www.wikidata.org/wiki/Q835016","display_name":"Window (computing)","level":2,"score":0.2994000017642975},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.29589998722076416},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.28519999980926514},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.2847000062465668},{"id":"https://openalex.org/C196083921","wikidata":"https://www.wikidata.org/wiki/Q7915758","display_name":"Variance (accounting)","level":2,"score":0.2623000144958496}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2601.01322","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.01322","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2601.01322","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.01322","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Quality Education","score":0.5827216506004333,"id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Modern":[0],"Vision-Language":[1],"Models":[2],"(VLMs)":[3],"achieve":[4],"impressive":[5],"performance":[6,60,178],"but":[7],"are":[8],"limited":[9],"by":[10,186,194],"the":[11,59,70,73,106,125,131,139,145,163,177,203,210,213,217],"quadratic":[12,232],"complexity":[13,51],"of":[14,27,61,162,179,205,212,216],"self-attention,":[15],"which":[16],"prevents":[17],"their":[18,25],"deployment":[19],"on":[20,156,198],"edge":[21],"devices":[22],"and":[23,30,123,134,142,148,159,172,190,209,247],"makes":[24],"understanding":[26],"high-resolution":[28,245],"images":[29,246],"long-context":[31,239],"videos":[32],"prohibitively":[33],"expensive.":[34],"To":[35,100],"address":[36],"this":[37],"challenge,":[38],"we":[39,109],"introduce":[40],"LinMU":[41,64,107,175],"(Linear-complexity":[42],"Multimodal":[43],"Understanding),":[44],"a":[45,76,81,102,111],"VLM":[46,71,104,165],"design":[47],"that":[48,79,115,224,241],"achieves":[49],"linear":[50],"without":[52,231],"using":[53,151],"any":[54],"quadratic-complexity":[55],"modules":[56],"while":[57,154],"maintaining":[58],"global-attention-based":[62],"VLMs.":[63],"replaces":[65],"every":[66],"self-attention":[67,121],"layer":[68],"in":[69],"with":[72,90,120,138,244],"M-MATE":[74,218],"block:":[75],"dual-branch":[77],"module":[78],"combines":[80],"bidirectional":[82],"state-space":[83],"model":[84],"for":[85,97,238],"global":[86],"context":[87],"(Flex-MA":[88],"branch)":[89,96],"localized":[91],"Swin-style":[92],"window":[93],"attention":[94],"(Local-Swin":[95],"adjacent":[98],"correlations.":[99],"transform":[101],"pre-trained":[103],"into":[105],"architecture,":[108],"propose":[110],"three-stage":[112],"distillation":[113,207],"framework":[114,222],"(i)":[116],"initializes":[117],"both":[118],"branches":[119,215],"weights":[122],"trains":[124],"Flex-MA":[126,140],"branch":[127,133],"alone,":[128],"(ii)":[129],"unfreezes":[130,144],"Local-Swin":[132],"fine-tunes":[135,149],"it":[136],"jointly":[137],"branch,":[141],"(iii)":[143],"remaining":[146],"blocks":[147],"them":[150],"LoRA":[152],"adapters,":[153],"regressing":[155],"hidden":[157],"states":[158],"token-level":[160],"logits":[161],"frozen":[164],"teacher.":[166],"On":[167],"MMMU,":[168],"TextVQA,":[169],"LongVideoBench,":[170],"Video-MME,":[171],"other":[173],"benchmarks,":[174],"matches":[176],"teacher":[180],"models,":[181],"yet":[182],"reduces":[183],"Time-To-First-Token":[184],"(TTFT)":[185],"up":[187,195,236],"to":[188,196],"2.7$\\times$":[189],"improves":[191],"token":[192],"throughput":[193],"9.0$\\times$":[197],"minute-length":[199],"videos.":[200,249],"Ablations":[201],"confirm":[202],"importance":[204],"each":[206],"stage":[208],"necessity":[211],"two":[214],"block.":[219],"The":[220],"proposed":[221],"demonstrates":[223],"state-of-the-art":[225],"multimodal":[226],"reasoning":[227],"can":[228,242],"be":[229],"achieved":[230],"attention,":[233],"thus":[234],"opening":[235],"avenues":[237],"VLMs":[240],"deal":[243],"long":[248]},"counts_by_year":[],"updated_date":"2026-01-08T20:10:11.968330","created_date":"2026-01-08T00:00:00"}
