{"id":"https://openalex.org/W7128435804","doi":"https://doi.org/10.1109/tr.2026.3661996","title":"TriVLLo: Tri-View Dynamic Architecture and Unified Cross-Modal Representation for Efficient Fine-Grained Vision\u2013Language Understanding","display_name":"TriVLLo: Tri-View Dynamic Architecture and Unified Cross-Modal Representation for Efficient Fine-Grained Vision\u2013Language Understanding","publication_year":2026,"publication_date":"2026-01-01","ids":{"openalex":"https://openalex.org/W7128435804","doi":"https://doi.org/10.1109/tr.2026.3661996"},"language":null,"primary_location":{"id":"doi:10.1109/tr.2026.3661996","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tr.2026.3661996","pdf_url":null,"source":{"id":"https://openalex.org/S87725633","display_name":"IEEE Transactions on Reliability","issn_l":"0018-9529","issn":["0018-9529","1558-1721"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Reliability","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5011263795","display_name":"Liang Kou","orcid":"https://orcid.org/0000-0002-4625-4271"},"institutions":[{"id":"https://openalex.org/I50760025","display_name":"Hangzhou Dianzi University","ror":"https://ror.org/0576gt767","country_code":"CN","type":"education","lineage":["https://openalex.org/I50760025"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Liang Kou","raw_affiliation_strings":["College of Cyberspace, Hangzhou Dianzi University, Hang Zhou, China"],"affiliations":[{"raw_affiliation_string":"College of Cyberspace, Hangzhou Dianzi University, Hang Zhou, China","institution_ids":["https://openalex.org/I50760025"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5002524160","display_name":"Wenlong Fan","orcid":null},"institutions":[{"id":"https://openalex.org/I50760025","display_name":"Hangzhou Dianzi University","ror":"https://ror.org/0576gt767","country_code":"CN","type":"education","lineage":["https://openalex.org/I50760025"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wenlong Fan","raw_affiliation_strings":["College of Cyberspace, Hangzhou Dianzi University, Hang Zhou, China"],"affiliations":[{"raw_affiliation_string":"College of Cyberspace, Hangzhou Dianzi University, Hang Zhou, China","institution_ids":["https://openalex.org/I50760025"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125438769","display_name":"Xingru Huang","orcid":null},"institutions":[{"id":"https://openalex.org/I50760025","display_name":"Hangzhou Dianzi University","ror":"https://ror.org/0576gt767","country_code":"CN","type":"education","lineage":["https://openalex.org/I50760025"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xingru Huang","raw_affiliation_strings":["College of Communication Engineering, Hangzhou Dianzi University, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"College of Communication Engineering, Hangzhou Dianzi University, Hangzhou, China","institution_ids":["https://openalex.org/I50760025"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125464549","display_name":"Bai Lin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bai Lin","raw_affiliation_strings":["Systems Engineering Institute, AMS, PLA, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Systems Engineering Institute, AMS, PLA, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125460858","display_name":"Bo Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bo Yang","raw_affiliation_strings":["Systems Engineering Institute, AMS, PLA, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Systems Engineering Institute, AMS, PLA, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100624323","display_name":"Jilin Zhang","orcid":"https://orcid.org/0000-0003-0241-0727"},"institutions":[{"id":"https://openalex.org/I50760025","display_name":"Hangzhou Dianzi University","ror":"https://ror.org/0576gt767","country_code":"CN","type":"education","lineage":["https://openalex.org/I50760025"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jilin Zhang","raw_affiliation_strings":["College of Cyberspace, Hangzhou Dianzi University, Hang Zhou, China"],"affiliations":[{"raw_affiliation_string":"College of Cyberspace, Hangzhou Dianzi University, Hang Zhou, China","institution_ids":["https://openalex.org/I50760025"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125457381","display_name":"Yun Lin","orcid":null},"institutions":[{"id":"https://openalex.org/I151727225","display_name":"Harbin Engineering University","ror":"https://ror.org/03x80pn82","country_code":"CN","type":"education","lineage":["https://openalex.org/I151727225"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yun Lin","raw_affiliation_strings":["College of Information and Communication Engineering, Harbin Engineering University, Harbin, China"],"affiliations":[{"raw_affiliation_string":"College of Information and Communication Engineering, Harbin Engineering University, Harbin, China","institution_ids":["https://openalex.org/I151727225"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5076103896","display_name":"Meiyu Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I50760025","display_name":"Hangzhou Dianzi University","ror":"https://ror.org/0576gt767","country_code":"CN","type":"education","lineage":["https://openalex.org/I50760025"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Meiyu Wang","raw_affiliation_strings":["College of Communication Engineering, Hangzhou Dianzi University, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"College of Communication Engineering, Hangzhou Dianzi University, Hangzhou, China","institution_ids":["https://openalex.org/I50760025"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5011263795"],"corresponding_institution_ids":["https://openalex.org/I50760025"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.37825891,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"75","issue":null,"first_page":"1281","last_page":"1290"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9764999747276306,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9764999747276306,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.006000000052154064,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.0031999999191612005,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/modular-design","display_name":"Modular design","score":0.649399995803833},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.6340000033378601},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.6126000285148621},{"id":"https://openalex.org/keywords/feature-extraction","display_name":"Feature extraction","score":0.5195000171661377},{"id":"https://openalex.org/keywords/feature-learning","display_name":"Feature learning","score":0.4569999873638153},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.4392000138759613},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.4390999972820282},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.42179998755455017},{"id":"https://openalex.org/keywords/computational-complexity-theory","display_name":"Computational complexity theory","score":0.4156000018119812}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8393999934196472},{"id":"https://openalex.org/C101468663","wikidata":"https://www.wikidata.org/wiki/Q1620158","display_name":"Modular design","level":2,"score":0.649399995803833},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.6340000033378601},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.6126000285148621},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6116999983787537},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.5195000171661377},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.4569999873638153},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.4392000138759613},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.4390999972820282},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.42179998755455017},{"id":"https://openalex.org/C179799912","wikidata":"https://www.wikidata.org/wiki/Q205084","display_name":"Computational complexity theory","level":2,"score":0.4156000018119812},{"id":"https://openalex.org/C43214815","wikidata":"https://www.wikidata.org/wiki/Q7310987","display_name":"Reliability (semiconductor)","level":3,"score":0.37139999866485596},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.3499000072479248},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.3357999920845032},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.33009999990463257},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.31470000743865967},{"id":"https://openalex.org/C190502265","wikidata":"https://www.wikidata.org/wiki/Q17069496","display_name":"MNIST database","level":3,"score":0.30230000615119934},{"id":"https://openalex.org/C9417928","wikidata":"https://www.wikidata.org/wiki/Q1070689","display_name":"Image processing","level":3,"score":0.2955000102519989},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.29260000586509705},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.289000004529953},{"id":"https://openalex.org/C1667742","wikidata":"https://www.wikidata.org/wiki/Q10927554","display_name":"Image retrieval","level":3,"score":0.2822999954223633},{"id":"https://openalex.org/C123657996","wikidata":"https://www.wikidata.org/wiki/Q12271","display_name":"Architecture","level":2,"score":0.27959999442100525},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.27559998631477356},{"id":"https://openalex.org/C75165309","wikidata":"https://www.wikidata.org/wiki/Q2258979","display_name":"Search engine indexing","level":2,"score":0.27379998564720154},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.26750001311302185},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.26100000739097595},{"id":"https://openalex.org/C198082294","wikidata":"https://www.wikidata.org/wiki/Q3399648","display_name":"Position (finance)","level":2,"score":0.257099986076355},{"id":"https://openalex.org/C64876066","wikidata":"https://www.wikidata.org/wiki/Q5141226","display_name":"Cognitive neuroscience of visual object recognition","level":3,"score":0.25679999589920044},{"id":"https://openalex.org/C27511587","wikidata":"https://www.wikidata.org/wiki/Q2178623","display_name":"Spatial relation","level":2,"score":0.25589999556541443},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.25279998779296875},{"id":"https://openalex.org/C124504099","wikidata":"https://www.wikidata.org/wiki/Q56933","display_name":"Image segmentation","level":3,"score":0.25209999084472656}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tr.2026.3661996","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tr.2026.3661996","pdf_url":null,"source":{"id":"https://openalex.org/S87725633","display_name":"IEEE Transactions on Reliability","issn_l":"0018-9529","issn":["0018-9529","1558-1721"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Reliability","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":14,"referenced_works":["https://openalex.org/W1861492603","https://openalex.org/W2185175083","https://openalex.org/W2412782625","https://openalex.org/W2560730294","https://openalex.org/W2886641317","https://openalex.org/W2964051877","https://openalex.org/W3098755434","https://openalex.org/W4402727536","https://openalex.org/W4404783352","https://openalex.org/W4413147805","https://openalex.org/W4415798090","https://openalex.org/W7133193597","https://openalex.org/W7133196460","https://openalex.org/W7133220561"],"related_works":[],"abstract_inverted_index":{"This":[0,43,83,125],"study":[1],"tackles":[2],"computational":[3,65],"bottlenecks,":[4],"training":[5,69],"instability,":[6],"and":[7,37,47,67,79,88,134],"insufficient":[8],"cross-modal":[9],"semantic":[10,132],"alignment":[11,104],"in":[12,129,158],"high-resolution":[13],"multimodal":[14],"image":[15,86],"processing.":[16],"We":[17],"propose":[18],"TriVLLo,":[19],"an":[20],"innovative":[21],"multi-scale":[22,76],"vision-language":[23,106],"modeling":[24],"framework.":[25],"Our":[26],"main":[27],"contributions":[28],"are:":[29],"First,":[30],"we":[31,72,114],"introduce":[32],"factorized":[33],"2D":[34],"positional":[35],"encoding":[36],"a":[38,74,109,116,152],"dynamically":[39],"configurable":[40],"modular":[41],"architecture.":[42],"approach":[44],"decouples":[45],"height":[46],"width":[48],"position":[49],"information.":[50],"It":[51,62,92,100,150,162],"improves":[52],"spatial":[53,159],"localization":[54],"reliability":[55],"for":[56,122],"images":[57],"with":[58,119],"extreme":[59],"aspect":[60],"ratios.":[61],"also":[63,101],"reduces":[64],"parameters":[66],"alleviates":[68],"instability.":[70],"Second,":[71],"design":[73],"unified":[75],"feature":[77,90],"extraction":[78],"modality":[80],"interaction":[81],"mechanism.":[82],"uses":[84],"adaptive":[85],"processing":[87],"multi-perspective":[89],"pyramids.":[91],"enhances":[93],"robustness":[94],"to":[95],"inputs":[96],"of":[97,105,143],"any":[98],"resolution.":[99],"achieves":[102,141],"fine-grained":[103,123],"features":[107],"through":[108],"shared":[110],"embedding":[111],"space.":[112],"Third,":[113],"build":[115],"high-quality":[117],"dataset":[118,126],"11K":[120],"samples":[121],"reasoning.":[124,136],"supports":[127],"improvements":[128],"visual":[130],"ranking,":[131],"alignment,":[133],"narrative":[135],"Experiments":[137],"show":[138],"that":[139],"TriVLLo":[140],"87.4%":[142],"GPT-4V's":[144],"performance":[145],"on":[146,166],"the":[147],"MM-Vet":[148],"benchmark.":[149],"demonstrates":[151],"93.0":[153],"percentage-point":[154],"improvement":[155],"over":[156],"Emu2":[157],"cognition":[160],"tasks.":[161,169],"attains":[163],"89.2%":[164],"accuracy":[165],"knowledge":[167],"generation":[168],"These":[170],"results":[171],"significantly":[172],"outperform":[173],"state-of-the-art":[174],"methods.":[175]},"counts_by_year":[],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2026-02-10T00:00:00"}
