{"id":"https://openalex.org/W7135236051","doi":"https://doi.org/10.48550/arxiv.2603.11220","title":"Frequency-Modulated Visual Restoration for Matryoshka Large Multimodal Models","display_name":"Frequency-Modulated Visual Restoration for Matryoshka Large Multimodal Models","publication_year":2026,"publication_date":"2026-03-11","ids":{"openalex":"https://openalex.org/W7135236051","doi":"https://doi.org/10.48550/arxiv.2603.11220"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.11220","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.11220","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.11220","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5129094652","display_name":"Qingtao Pan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pan, Qingtao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123475562","display_name":"Zhihao Dou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dou, Zhihao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5129062631","display_name":"Shuo Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Shuo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.38929998874664307,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.38929998874664307,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.24050000309944153,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11605","display_name":"Visual Attention and Saliency Detection","score":0.1451999992132187,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.6008999943733215},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.5636000037193298},{"id":"https://openalex.org/keywords/filter","display_name":"Filter (signal processing)","score":0.5393000245094299},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.47450000047683716},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.45570001006126404},{"id":"https://openalex.org/keywords/visual-perception","display_name":"Visual perception","score":0.4375},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.4052000045776367}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.729200005531311},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6026999950408936},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.6008999943733215},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.5636000037193298},{"id":"https://openalex.org/C106131492","wikidata":"https://www.wikidata.org/wiki/Q3072260","display_name":"Filter (signal processing)","level":2,"score":0.5393000245094299},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.47450000047683716},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.45750001072883606},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.45570001006126404},{"id":"https://openalex.org/C178253425","wikidata":"https://www.wikidata.org/wiki/Q162668","display_name":"Visual perception","level":3,"score":0.4375},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.4052000045776367},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.39629998803138733},{"id":"https://openalex.org/C178278151","wikidata":"https://www.wikidata.org/wiki/Q7936607","display_name":"Visual memory","level":3,"score":0.3693999946117401},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.3677000105381012},{"id":"https://openalex.org/C160086991","wikidata":"https://www.wikidata.org/wiki/Q5939193","display_name":"Human visual system model","level":3,"score":0.34130001068115234},{"id":"https://openalex.org/C106430172","wikidata":"https://www.wikidata.org/wiki/Q6002272","display_name":"Image restoration","level":4,"score":0.2685000002384186},{"id":"https://openalex.org/C158495155","wikidata":"https://www.wikidata.org/wiki/Q2369151","display_name":"Visual search","level":2,"score":0.25999999046325684},{"id":"https://openalex.org/C2779321571","wikidata":"https://www.wikidata.org/wiki/Q7936605","display_name":"Visual learning","level":2,"score":0.2565999925136566}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.11220","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.11220","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.11220","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.11220","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Sustainable cities and communities","score":0.5500841736793518,"id":"https://metadata.un.org/sdg/11"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"Multimodal":[1],"Models":[2],"(LMMs)":[3],"struggle":[4],"to":[5,11,18,56,105,120,152,160],"adapt":[6],"varying":[7],"computational":[8],"budgets":[9],"due":[10],"numerous":[12],"visual":[13,23,38,64,71,75,108,123,130,135,142,155,166],"tokens.":[14],"Previous":[15],"methods":[16],"attempted":[17],"reduce":[19,186],"the":[20,35,58,70,111,127,138,163,187,198],"number":[21,164],"of":[22,37,61,73,129,140,165,189,197],"tokens":[24,76,136,167],"before":[25],"or":[26],"within":[27],"LLMs.":[28],"However,":[29],"these":[30,42],"strategies":[31],"inevitably":[32],"result":[33],"in":[34],"loss":[36],"semantic.":[39],"To":[40],"address":[41],"issues,":[43],"we":[44,145],"introduce":[45],"FMVR,":[46],"a":[47,102],"plug-and-play":[48],"and":[49,79,84,137,178],"extremely":[50],"simple":[51],"Frequency-Modulated":[52],"Visual":[53],"Restoration":[54],"strategy":[55],"boost":[57],"reasoning":[59],"ability":[60],"LMMs":[62],"under":[63],"token":[65,156],"reduction.":[66],"Specifically,":[67],"FMVR":[68,147],"disentangles":[69],"representation":[72],"fewer":[74],"into":[77,148],"low-":[78],"high-frequency":[80,97],"components":[81],"through":[82],"AvgPool":[83,99],"MaxPool.":[85],"The":[86,96,201],"derived":[87],"frequencies":[88],"are":[89],"subsequently":[90],"modulated":[91],"using":[92],"lightweight":[93],"learnable":[94],"parameters.":[95],"from":[98,113],"acts":[100,115],"as":[101,116],"saliency":[103,107],"filter":[104,119],"enhance":[106],"semantics,":[109],"while":[110,170,193],"low-frequency":[112],"MaxPool":[114],"an":[117],"anti-saliency":[118],"strengthen":[121],"weak":[122],"semantics.":[124,143],"It":[125],"enables":[126],"preservation":[128],"semantics":[131],"dominated":[132],"by":[133,191],"few":[134],"restoration":[139],"diluted":[141],"Additionally,":[144],"inject":[146],"Matryoshka":[149],"Representation":[150],"Learning":[151],"learn":[153],"coarse-to-fine":[154],"sets,":[157],"thus":[158],"enabling":[159],"elastically":[161],"adjust":[162],"during":[168],"inference":[169],"maintaining":[171,194],"comparable":[172],"performance.":[173],"Experiments":[174],"across":[175],"10":[176],"image-based":[177],"4":[179],"video-based":[180],"bench":[181],"marks":[182],"demonstrate":[183],"that":[184],"FMVR-LLaVA":[185],"FLOPs":[188],"LLaVA-1.5-7B":[190],"89%,":[192],"almost":[195],"100%":[196],"original":[199],"accuracy.":[200],"code":[202],"will":[203],"be":[204],"open.":[205]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-03-14T00:00:00"}
