{"id":"https://openalex.org/W4415539680","doi":"https://doi.org/10.1145/3746027.3758234","title":"Towards High Robust Vision-Language Large Models: Benchmark and Method","display_name":"Towards High Robust Vision-Language Large Models: Benchmark and Method","publication_year":2025,"publication_date":"2025-10-25","ids":{"openalex":"https://openalex.org/W4415539680","doi":"https://doi.org/10.1145/3746027.3758234"},"language":null,"primary_location":{"id":"doi:10.1145/3746027.3758234","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3758234","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5045420812","display_name":"Minyi Zhao","orcid":"https://orcid.org/0000-0001-7720-806X"},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Minyi Zhao","raw_affiliation_strings":["College of CS and AI, Fudan University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"College of CS and AI, Fudan University, Shanghai, China","institution_ids":["https://openalex.org/I24943067"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5114026324","display_name":"Yi Liu","orcid":"https://orcid.org/0009-0008-2227-0024"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yi Liu","raw_affiliation_strings":["ByteDance Inc., Beijing, China"],"affiliations":[{"raw_affiliation_string":"ByteDance Inc., Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5109316445","display_name":"Wantao He","orcid":"https://orcid.org/0009-0000-9713-6008"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wensong He","raw_affiliation_strings":["ByteDance Inc., Beijing, China"],"affiliations":[{"raw_affiliation_string":"ByteDance Inc., Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5064265879","display_name":"Bingzhe Yu","orcid":null},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Bingzhe Yu","raw_affiliation_strings":["College of CS and AI, Fudan University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"College of CS and AI, Fudan University, Shanghai, China","institution_ids":["https://openalex.org/I24943067"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5035201632","display_name":"Yuxi Mi","orcid":"https://orcid.org/0000-0002-1006-6041"},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuxi Mi","raw_affiliation_strings":["College of CS and AI, Fudan University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"College of CS and AI, Fudan University, Shanghai, China","institution_ids":["https://openalex.org/I24943067"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5017862559","display_name":"Shuigeng Zhou","orcid":"https://orcid.org/0000-0002-1949-2768"},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shuigeng Zhou","raw_affiliation_strings":["College of CS and AI, Fudan University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"College of CS and AI, Fudan University, Shanghai, China","institution_ids":["https://openalex.org/I24943067"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5045420812"],"corresponding_institution_ids":["https://openalex.org/I24943067"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.30641902,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"12897","last_page":"12904"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.9983000159263611,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9980999827384949,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.8723000288009644},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.3865000009536743},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.3797999918460846},{"id":"https://openalex.org/keywords/robustness-testing","display_name":"Robustness testing","score":0.35199999809265137},{"id":"https://openalex.org/keywords/data-modeling","display_name":"Data modeling","score":0.2985999882221222}],"concepts":[{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.8723000288009644},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7394999861717224},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4896000027656555},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.45660001039505005},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4546999931335449},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.3865000009536743},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.3797999918460846},{"id":"https://openalex.org/C137726913","wikidata":"https://www.wikidata.org/wiki/Q7353550","display_name":"Robustness testing","level":3,"score":0.35199999809265137},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.2985999882221222},{"id":"https://openalex.org/C95713431","wikidata":"https://www.wikidata.org/wiki/Q631425","display_name":"Vulnerability (computing)","level":2,"score":0.2766000032424927},{"id":"https://openalex.org/C160920958","wikidata":"https://www.wikidata.org/wiki/Q7662746","display_name":"Synthetic data","level":2,"score":0.27459999918937683}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3746027.3758234","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3758234","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":12,"referenced_works":["https://openalex.org/W2560730294","https://openalex.org/W2753709519","https://openalex.org/W2963518342","https://openalex.org/W2963622213","https://openalex.org/W2979382951","https://openalex.org/W3204971388","https://openalex.org/W4225832925","https://openalex.org/W4285247752","https://openalex.org/W4312651322","https://openalex.org/W4322766882","https://openalex.org/W4402726948","https://openalex.org/W4402727764"],"related_works":[],"abstract_inverted_index":{"Recently,":[0],"numerous":[1],"benchmarks":[2],"have":[3,23],"been":[4],"constructed":[5],"to":[6,67,73,89],"evaluate":[7],"various":[8],"general":[9],"capabilities":[10],"(e.g.,":[11],"perception":[12],"and":[13,35,48,63,71,116],"reasoning)":[14],"of":[15,28,94,109,119],"Vision-Language":[16],"Large":[17],"Models":[18],"(VLLMs).":[19],"However,":[20],"few":[21],"studies":[22],"focused":[24],"on":[25],"the":[26,69,92,107,117],"robustness":[27,93],"VLLMs":[29,96,111],"when":[30,112],"dealing":[31],"with":[32,60],"altered":[33,114],"prompts":[34,62],"images.":[36],"To":[37],"fill":[38],"this":[39,41],"gap,":[40],"paper":[42],"first":[43],"constructs":[44],"a":[45,80],"real-world,":[46],"high-quality,":[47],"challenging":[49],"benchmark,":[50],"namely":[51],"RBench":[52,57,127],"(i.e.,":[53],"Robust":[54],"Bench).":[55],"Specifically,":[56],"is":[58,128],"human-annotated,":[59],"both":[61],"images":[64],"being":[65],"modified":[66],"enrich":[68],"difficulty,":[70],"cross-validation":[72],"ensure":[74],"data":[75],"quality.":[76],"Then,":[77],"we":[78],"propose":[79],"new":[81],"method,":[82],"called":[83],"Robustness":[84],"Booster":[85],"(RBoost":[86],"in":[87,123],"short),":[88],"effectively":[90],"enhance":[91],"existing":[95,110],"by":[97],"automatically":[98],"generating":[99],"high-value":[100],"instruction-tuning":[101],"training":[102],"data.":[103],"Extensive":[104],"experiments":[105],"demonstrate":[106],"vulnerability":[108],"handling":[113],"inputs,":[115],"superiority":[118],"our":[120],"RBoost":[121],"method":[122],"improving":[124],"model":[125],"robustness.":[126],"available":[129],"at":[130],"https://github.com/zhaominyiz/RBench.":[131]},"counts_by_year":[],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-25T00:00:00"}
