{"id":"https://openalex.org/W4415539627","doi":"https://doi.org/10.1145/3746027.3758235","title":"RSVLM-QA: A Benchmark Dataset for Remote Sensing Vision Language Model-based Question Answering","display_name":"RSVLM-QA: A Benchmark Dataset for Remote Sensing Vision Language Model-based Question Answering","publication_year":2025,"publication_date":"2025-10-25","ids":{"openalex":"https://openalex.org/W4415539627","doi":"https://doi.org/10.1145/3746027.3758235"},"language":null,"primary_location":{"id":"doi:10.1145/3746027.3758235","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3746027.3758235","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3746027.3758235","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5010907570","display_name":"Xing Zi","orcid":"https://orcid.org/0009-0001-4265-2205"},"institutions":[{"id":"https://openalex.org/I114017466","display_name":"University of Technology Sydney","ror":"https://ror.org/03f0f6041","country_code":"AU","type":"education","lineage":["https://openalex.org/I114017466"]}],"countries":["AU"],"is_corresponding":true,"raw_author_name":"Xing Zi","raw_affiliation_strings":["School of Computer Science, The University of Technology Sydney, Sydney, New South Wales, Australia"],"raw_orcid":"https://orcid.org/0009-0001-4265-2205","affiliations":[{"raw_affiliation_string":"School of Computer Science, The University of Technology Sydney, Sydney, New South Wales, Australia","institution_ids":["https://openalex.org/I114017466"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Jinghao Xiao","orcid":"https://orcid.org/0009-0000-0837-9068"},"institutions":[{"id":"https://openalex.org/I114017466","display_name":"University of Technology Sydney","ror":"https://ror.org/03f0f6041","country_code":"AU","type":"education","lineage":["https://openalex.org/I114017466"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Jinghao Xiao","raw_affiliation_strings":["School of Computer Science, The University of Technology Sydney, Sydney, New South Wales, Australia"],"raw_orcid":"https://orcid.org/0009-0000-0837-9068","affiliations":[{"raw_affiliation_string":"School of Computer Science, The University of Technology Sydney, Sydney, New South Wales, Australia","institution_ids":["https://openalex.org/I114017466"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5016524358","display_name":"Yunxiao Shi","orcid":null},"institutions":[{"id":"https://openalex.org/I114017466","display_name":"University of Technology Sydney","ror":"https://ror.org/03f0f6041","country_code":"AU","type":"education","lineage":["https://openalex.org/I114017466"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Yunxiao Shi","raw_affiliation_strings":["SEDE, University of Technology Sydney, Sydney, New South Wales, Australia"],"raw_orcid":"https://orcid.org/0000-0002-1516-015X","affiliations":[{"raw_affiliation_string":"SEDE, University of Technology Sydney, Sydney, New South Wales, Australia","institution_ids":["https://openalex.org/I114017466"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5082921939","display_name":"Xian Tao","orcid":"https://orcid.org/0000-0001-5834-5181"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"government","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210094879","display_name":"Shandong Institute of Automation","ror":"https://ror.org/00qdtba35","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210094879","https://openalex.org/I4210142748"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xian Tao","raw_affiliation_strings":["Institute of Automation, Chinese Academy of Sciences, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0001-5834-5181","affiliations":[{"raw_affiliation_string":"Institute of Automation, Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210094879","https://openalex.org/I19820366"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100635867","display_name":"Jun Li","orcid":"https://orcid.org/0000-0002-1336-2241"},"institutions":[{"id":"https://openalex.org/I114017466","display_name":"University of Technology Sydney","ror":"https://ror.org/03f0f6041","country_code":"AU","type":"education","lineage":["https://openalex.org/I114017466"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Jun Li","raw_affiliation_strings":["School of Computer Science, University of Technology Sydney, Sydney, New South Wales, Australia"],"raw_orcid":"https://orcid.org/0000-0002-1336-2241","affiliations":[{"raw_affiliation_string":"School of Computer Science, University of Technology Sydney, Sydney, New South Wales, Australia","institution_ids":["https://openalex.org/I114017466"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5023107987","display_name":"Ali Braytee","orcid":"https://orcid.org/0000-0003-2561-6496"},"institutions":[{"id":"https://openalex.org/I114017466","display_name":"University of Technology Sydney","ror":"https://ror.org/03f0f6041","country_code":"AU","type":"education","lineage":["https://openalex.org/I114017466"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Ali Braytee","raw_affiliation_strings":["School of Computer Science, University of Technology Sydney, Sydney, New South Wales, Australia"],"raw_orcid":"https://orcid.org/0000-0003-2561-6496","affiliations":[{"raw_affiliation_string":"School of Computer Science, University of Technology Sydney, Sydney, New South Wales, Australia","institution_ids":["https://openalex.org/I114017466"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5006355592","display_name":"Mukesh Prasad","orcid":"https://orcid.org/0000-0002-7745-9667"},"institutions":[{"id":"https://openalex.org/I114017466","display_name":"University of Technology Sydney","ror":"https://ror.org/03f0f6041","country_code":"AU","type":"education","lineage":["https://openalex.org/I114017466"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Mukesh Prasad","raw_affiliation_strings":["School of Computer Science, University of Technology Sydney, Sydney, New South Wales, Australia"],"raw_orcid":"https://orcid.org/0000-0002-7745-9667","affiliations":[{"raw_affiliation_string":"School of Computer Science, University of Technology Sydney, Sydney, New South Wales, Australia","institution_ids":["https://openalex.org/I114017466"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5010907570"],"corresponding_institution_ids":["https://openalex.org/I114017466"],"apc_list":null,"apc_paid":null,"fwci":2.2665,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.90286726,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":98,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"12905","last_page":"12911"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9973999857902527,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9973999857902527,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.995199978351593,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10757","display_name":"Geographic Information Systems Studies","score":0.9546999931335449,"subfield":{"id":"https://openalex.org/subfields/3305","display_name":"Geography, Planning and Development"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/question-answering","display_name":"Question answering","score":0.8284000158309937},{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.6306999921798706},{"id":"https://openalex.org/keywords/annotation","display_name":"Annotation","score":0.6200000047683716},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.5665000081062317},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.46970000863075256},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.46880000829696655},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.4472000002861023},{"id":"https://openalex.org/keywords/object-detection","display_name":"Object detection","score":0.41190001368522644}],"concepts":[{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.8284000158309937},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8162000179290771},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.6306999921798706},{"id":"https://openalex.org/C2776321320","wikidata":"https://www.wikidata.org/wiki/Q857525","display_name":"Annotation","level":2,"score":0.6200000047683716},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.5665000081062317},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5537999868392944},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.47519999742507935},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.46970000863075256},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.46880000829696655},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4684000015258789},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.4472000002861023},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.41190001368522644},{"id":"https://openalex.org/C79581498","wikidata":"https://www.wikidata.org/wiki/Q1367530","display_name":"Suite","level":2,"score":0.40959998965263367},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.3955000042915344},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.38499999046325684},{"id":"https://openalex.org/C206345919","wikidata":"https://www.wikidata.org/wiki/Q20380951","display_name":"Resource (disambiguation)","level":2,"score":0.3652999997138977},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.3402000069618225},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.3278000056743622},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.32499998807907104},{"id":"https://openalex.org/C195807954","wikidata":"https://www.wikidata.org/wiki/Q1662562","display_name":"Information extraction","level":2,"score":0.2985999882221222},{"id":"https://openalex.org/C184898388","wikidata":"https://www.wikidata.org/wiki/Q1435712","display_name":"Pairwise comparison","level":2,"score":0.27649998664855957},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.2687000036239624},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.2671000063419342},{"id":"https://openalex.org/C64876066","wikidata":"https://www.wikidata.org/wiki/Q5141226","display_name":"Cognitive neuroscience of visual object recognition","level":3,"score":0.26460000872612},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.2547999918460846},{"id":"https://openalex.org/C120567893","wikidata":"https://www.wikidata.org/wiki/Q1582085","display_name":"Knowledge extraction","level":2,"score":0.25119999051094055}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3746027.3758235","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3746027.3758235","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3746027.3758235","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3746027.3758235","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":11,"referenced_works":["https://openalex.org/W2510520237","https://openalex.org/W2609402060","https://openalex.org/W2779054585","https://openalex.org/W2908320224","https://openalex.org/W3011916860","https://openalex.org/W3012111773","https://openalex.org/W4386472879","https://openalex.org/W4392172801","https://openalex.org/W4402473945","https://openalex.org/W4406114612","https://openalex.org/W4409383105"],"related_works":[],"abstract_inverted_index":{"Visual":[0],"Question":[1,44],"Answering":[2,45],"(VQA)":[3],"in":[4,24,128,237,262],"remote":[5],"sensing":[6],"(RS)":[7],"is":[8,59],"pivotal":[9,248],"for":[10,54,250],"interpreting":[11],"Earth":[12],"observation":[13],"data.":[14],"However,":[15],"existing":[16,197],"RS":[17,56,67,129,198,239,252],"VQA":[18,52,117,175,199,253],"datasets":[19],"are":[20,158,272],"constrained":[21],"by":[22,61],"limitations":[23],"annotation":[25,82],"richness,":[26],"question":[27,162,182],"diversity,":[28],"and":[29,69,75,111,173,180,193,205,227,231,254,269],"the":[30,55,122,144,191,202,229,238,251,263],"assessment":[31],"of":[32,103,125,190,207,234],"specific":[33],"reasoning":[34,232],"capabilities.":[35],"This":[36],"paper":[37],"introduces":[38],"Remote":[39],"Sensing":[40],"Vision":[41,218],"Language":[42,89,219],"Model":[43],"(RSVLM-QA)":[46],"dataset,":[47,266],"a":[48,101,134,186,194,247],"new":[49],"large-scale,":[50],"content-rich":[51],"dataset":[53,192],"domain.":[57,240],"RSVLM-QA":[58,169,224,243],"constructed":[60],"integrating":[62],"data":[63],"from":[64,143,154],"several":[65],"prominent":[66],"segmentation":[68,146],"detection":[70],"datasets:":[71],"WHU,":[72],"LoveDA,":[73],"INRIA,":[74],"iSAID.":[76],"We":[77,184,241],"employ":[78],"an":[79],"innovative":[80],"dual-track":[81],"generation":[83,267],"pipeline.":[84],"Firstly,":[85],"we":[86,131,211],"leverage":[87],"Large":[88],"Models":[90,220],"(LLMs),":[91],"specifically":[92],"GPT-4.1,":[93],"with":[94,160,196],"meticulously":[95],"designed":[96],"prompts":[97],"to":[98,120,164,259],"automatically":[99],"generate":[100],"suite":[102],"detailed":[104,187],"annotations":[105,179],"including":[106],"image":[107],"captions,":[108],"spatial":[109],"relations,":[110],"semantic":[112],"tags,":[113],"alongside":[114],"complex":[115],"caption-based":[116],"pairs.":[118,168],"Secondly,":[119],"address":[121],"challenging":[123],"task":[124],"object":[126,140],"counting":[127,166],"imagery,":[130],"have":[132],"developed":[133],"specialized":[135],"automated":[136],"process":[137],"that":[138,223],"extracts":[139],"counts":[141],"directly":[142],"original":[145],"data;":[147],"GPT-4.1":[148],"then":[149],"formulates":[150],"natural":[151],"language":[152],"answers":[153],"these":[155],"counts,":[156],"which":[157],"paired":[159],"preset":[161],"templates":[163],"create":[165],"QA":[167],"comprises":[170],"13,820":[171],"images":[172],"162,373":[174],"pairs,":[176],"featuring":[177],"extensive":[178],"diverse":[181],"types.":[183],"provide":[185],"statistical":[188],"analysis":[189],"comparison":[195],"benchmarks,":[200],"highlighting":[201],"superior":[203],"depth":[204],"breadth":[206],"RSVLM-QA's":[208],"annotations.":[209],"Furthermore,":[210],"conduct":[212],"benchmark":[213,270],"experiments":[214],"on":[215],"Six":[216],"mainstream":[217],"(VLMs),":[221],"demonstrating":[222],"effectively":[225],"evaluates":[226],"challenges":[228],"understanding":[230],"abilities":[233],"current":[235],"VLMs":[236],"believe":[242],"will":[244],"serve":[245],"as":[246],"resource":[249],"VLM":[255],"research":[256],"communities,":[257],"poised":[258],"catalyze":[260],"advancements":[261],"field.":[264],"The":[265],"code,":[268],"models":[271],"publicly":[273],"available":[274],"at":[275],"https://github.com/StarZi0213/RSVLM-QA.":[276]},"counts_by_year":[{"year":2026,"cited_by_count":2}],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-25T00:00:00"}
