{"id":"https://openalex.org/W7133331446","doi":"https://doi.org/10.48550/arxiv.2603.00171","title":"SvfEye: A Semantic-Visual Fusion Framework with Multi-Scale Visual Context for Multimodal Reasoning","display_name":"SvfEye: A Semantic-Visual Fusion Framework with Multi-Scale Visual Context for Multimodal Reasoning","publication_year":2026,"publication_date":"2026-02-26","ids":{"openalex":"https://openalex.org/W7133331446","doi":"https://doi.org/10.48550/arxiv.2603.00171"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.00171","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.00171","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.00171","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5025435900","display_name":"Yuxiang Shen","orcid":"https://orcid.org/0000-0001-8372-399X"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Shen, Yuxiang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5037131765","display_name":"Hailong Huang","orcid":"https://orcid.org/0000-0003-2667-6423"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Hailong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5026461494","display_name":"Gao Zhenkun","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gao, Zhenkun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5121802396","display_name":"Xueheng Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Xueheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Zhou, Man","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Man","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5013112875","display_name":"Chengjun Xie","orcid":"https://orcid.org/0000-0002-0629-2038"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xie, Chengjun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5084761959","display_name":"Haoxuan Che","orcid":"https://orcid.org/0000-0002-5844-1285"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Che, Haoxuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129454769","display_name":"Xuanhua He","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"He, Xuanhua","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5129498452","display_name":"Jie Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Jie","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5025435900"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9447000026702881,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9447000026702881,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.018799999728798866,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.01119999960064888,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.8675000071525574},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.5756999850273132},{"id":"https://openalex.org/keywords/redundancy","display_name":"Redundancy (engineering)","score":0.566100001335144},{"id":"https://openalex.org/keywords/visual-reasoning","display_name":"Visual reasoning","score":0.48100000619888306},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.4251999855041504},{"id":"https://openalex.org/keywords/overhead","display_name":"Overhead (engineering)","score":0.42149999737739563},{"id":"https://openalex.org/keywords/visual-objects","display_name":"Visual Objects","score":0.3880000114440918}],"concepts":[{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.8675000071525574},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7781000137329102},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6279000043869019},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.5756999850273132},{"id":"https://openalex.org/C152124472","wikidata":"https://www.wikidata.org/wiki/Q1204361","display_name":"Redundancy (engineering)","level":2,"score":0.566100001335144},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.48100000619888306},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.4251999855041504},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.42149999737739563},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.4194999933242798},{"id":"https://openalex.org/C2780103172","wikidata":"https://www.wikidata.org/wiki/Q1309721","display_name":"Visual Objects","level":3,"score":0.3880000114440918},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.3718999922275543},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3427000045776367},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.32440000772476196},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.319599986076355},{"id":"https://openalex.org/C193221554","wikidata":"https://www.wikidata.org/wiki/Q5153664","display_name":"Commonsense reasoning","level":2,"score":0.31290000677108765},{"id":"https://openalex.org/C179518139","wikidata":"https://www.wikidata.org/wiki/Q5140297","display_name":"Coding (social sciences)","level":2,"score":0.29739999771118164},{"id":"https://openalex.org/C178253425","wikidata":"https://www.wikidata.org/wiki/Q162668","display_name":"Visual perception","level":3,"score":0.2757999897003174},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.2619999945163727}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.00171","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.00171","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.00171","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.00171","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.5197779536247253,"id":"https://metadata.un.org/sdg/10","display_name":"Reduced inequalities"},{"score":0.402921587228775,"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Multimodal":[0],"Large":[1],"Language":[2],"Models":[3],"(MLLMs)":[4],"often":[5],"struggle":[6],"to":[7,54,104,182,196],"accurately":[8],"perceive":[9],"fine-grained":[10,36],"visual":[11,41,57,107,128,150,187],"details,":[12],"especially":[13],"when":[14],"targets":[15],"are":[16],"tiny":[17],"or":[18,61],"visually":[19],"subtle.":[20],"This":[21],"challenge":[22],"can":[23],"be":[24],"addressed":[25],"through":[26],"semantic-visual":[27],"information":[28,188],"fusion,":[29],"which":[30],"integrates":[31],"global":[32,71],"image":[33,63],"context":[34,72],"with":[35,48,70,177],"local":[37,68,106,127,186,199],"evidence":[38,58,108],"for":[39,130,168],"multi-scale":[40],"understanding.":[42],"Recently,":[43],"a":[44,101,165,174,178,192],"paradigm":[45],"termed":[46],"\"Thinking":[47],"Images\"":[49],"has":[50],"emerged,":[51],"enabling":[52],"models":[53],"acquire":[55],"high-resolution":[56],"by":[59],"zooming":[60],"cropping":[62],"regions":[64,129],"and":[65,89,125,139,149,191],"fusing":[66],"these":[67,160],"details":[69],"during":[73,109],"reasoning.":[74],"Although":[75],"training-based":[76],"approaches":[77,114],"demonstrate":[78],"the":[79,217],"effectiveness":[80],"of":[81,134,155],"this":[82],"capability,":[83],"they":[84,122,143],"require":[85],"extensive":[86],"computational":[87,137],"resources":[88],"large-scale":[90],"task-specific":[91],"data.":[92],"Consequently,":[93],"lightweight":[94],"training-free":[95,113,166],"methods":[96],"have":[97],"been":[98],"proposed":[99],"as":[100],"practical":[102],"alternative":[103],"incorporate":[105],"inference.":[110],"However,":[111],"existing":[112],"still":[115],"suffer":[116],"from":[117],"two":[118],"key":[119],"limitations.":[120],"First,":[121],"indiscriminately":[123],"extract":[124],"fuse":[126],"all":[131],"inputs":[132],"regardless":[133],"necessity,":[135],"introducing":[136],"redundancy":[138],"perceptual":[140],"noise.":[141],"Second,":[142],"exhibit":[144],"drift":[145],"between":[146],"semantic":[147],"intent":[148],"attention,":[151],"preventing":[152],"accurate":[153],"localization":[154],"user-focused":[156],"regions.":[157,200],"To":[158],"address":[159],"challenges,":[161],"we":[162],"propose":[163],"SvfEye,":[164],"framework":[167],"adaptive":[169],"visual-semantic":[170],"fusion.":[171],"SvfEye":[172,204],"follows":[173],"two-stage":[175],"pipeline":[176],"confidence-based":[179],"decision":[180],"module":[181,195],"determine":[183],"whether":[184],"additional":[185],"is":[189],"needed,":[190],"semantic-attention":[193],"fusion":[194],"identify":[197],"informative":[198],"Experiments":[201],"show":[202],"that":[203],"achieves":[205],"substantial":[206],"performance":[207],"gains":[208],"while":[209],"obtaining":[210],"an":[211],"approximately":[212],"4.0x":[213],"inference":[214],"speedup":[215],"over":[216],"state-of-the-art":[218],"method":[219],"ZoomEye.":[220]},"counts_by_year":[],"updated_date":"2026-03-17T06:59:57.516163","created_date":"2026-03-04T00:00:00"}
