{"id":"https://openalex.org/W7131884976","doi":"https://doi.org/10.48550/arxiv.2602.22923","title":"WaterVideoQA: ASV-Centric Perception and Rule-Compliant Reasoning via Multi-Modal Agents","display_name":"WaterVideoQA: ASV-Centric Perception and Rule-Compliant Reasoning via Multi-Modal Agents","publication_year":2026,"publication_date":"2026-02-26","ids":{"openalex":"https://openalex.org/W7131884976","doi":"https://doi.org/10.48550/arxiv.2602.22923"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.22923","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5079695819","display_name":"Runwei Guan","orcid":"https://orcid.org/0000-0003-4013-2107"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Guan, Runwei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127166692","display_name":"Shaofeng Liang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liang, Shaofeng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5104264340","display_name":"Ningwei Ouyang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ouyang, Ningwei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113164789","display_name":"Weichen Fei","orcid":"https://orcid.org/0009-0002-0993-6270"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fei, Weichen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127207628","display_name":"Shanliang Yao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yao, Shanliang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127134902","display_name":"Wei Dai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dai, Wei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127026063","display_name":"Chenhao Ge","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ge, Chenhao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5114594168","display_name":"Penglei Sun","orcid":"https://orcid.org/0009-0005-2290-0944"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sun, Penglei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127076858","display_name":"Xiaohui Zhu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhu, Xiaohui","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127261356","display_name":"Tao Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Tao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127309928","display_name":"Ryan Wen Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Ryan Wen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5127188729","display_name":"Hui Xiong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiong, Hui","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":12,"corresponding_author_ids":["https://openalex.org/A5079695819"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9556000232696533,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9556000232696533,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.004699999932199717,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11622","display_name":"Maritime Navigation and Safety","score":0.00279999990016222,"subfield":{"id":"https://openalex.org/subfields/2212","display_name":"Ocean Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.6294999718666077},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.613099992275238},{"id":"https://openalex.org/keywords/bridge","display_name":"Bridge (graph theory)","score":0.491100013256073},{"id":"https://openalex.org/keywords/domain","display_name":"Domain (mathematical analysis)","score":0.4880000054836273},{"id":"https://openalex.org/keywords/visual-reasoning","display_name":"Visual reasoning","score":0.4569999873638153},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.41940000653266907},{"id":"https://openalex.org/keywords/cognition","display_name":"Cognition","score":0.4009999930858612},{"id":"https://openalex.org/keywords/grasp","display_name":"GRASP","score":0.3682999908924103}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6553999781608582},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.6294999718666077},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.613099992275238},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5328999757766724},{"id":"https://openalex.org/C100776233","wikidata":"https://www.wikidata.org/wiki/Q2532492","display_name":"Bridge (graph theory)","level":2,"score":0.491100013256073},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.4880000054836273},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.4569999873638153},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.41940000653266907},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.41780000925064087},{"id":"https://openalex.org/C169900460","wikidata":"https://www.wikidata.org/wiki/Q2200417","display_name":"Cognition","level":2,"score":0.4009999930858612},{"id":"https://openalex.org/C171268870","wikidata":"https://www.wikidata.org/wiki/Q1486676","display_name":"GRASP","level":2,"score":0.3682999908924103},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.3650999963283539},{"id":"https://openalex.org/C178253425","wikidata":"https://www.wikidata.org/wiki/Q162668","display_name":"Visual perception","level":3,"score":0.3264999985694885},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.3172999918460846},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.31369999051094055},{"id":"https://openalex.org/C20854674","wikidata":"https://www.wikidata.org/wiki/Q4386060","display_name":"Cognitive architecture","level":3,"score":0.30559998750686646},{"id":"https://openalex.org/C170494330","wikidata":"https://www.wikidata.org/wiki/Q1778434","display_name":"Cognitive map","level":3,"score":0.3034999966621399},{"id":"https://openalex.org/C28063669","wikidata":"https://www.wikidata.org/wiki/Q7167042","display_name":"Perceptual system","level":3,"score":0.2957000136375427},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.2867000102996826},{"id":"https://openalex.org/C123657996","wikidata":"https://www.wikidata.org/wiki/Q12271","display_name":"Architecture","level":2,"score":0.2840000092983246},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2782999873161316}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.22923","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.22923","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.22923","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.22923","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/14","display_name":"Life below water","score":0.6183779835700989}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"While":[0],"autonomous":[1],"navigation":[2],"has":[3],"achieved":[4],"remarkable":[5],"success":[6],"in":[7,22,173],"passive":[8],"perception":[9,43],"(e.g.,":[10],"object":[11],"detection":[12],"and":[13,44,64,104,140],"segmentation),":[14],"it":[15],"remains":[16],"fundamentally":[17],"constrained":[18],"by":[19],"a":[20,54,113,122,166],"void":[21],"knowledge-driven,":[23],"interactive":[24],"environmental":[25],"cognition.":[26],"In":[27],"the":[28,34,38,73],"high-stakes":[29],"domain":[30],"of":[31],"maritime":[32,130,175],"navigation,":[33],"ability":[35],"to":[36,61,107,151],"bridge":[37],"gap":[39],"between":[40],"raw":[41],"visual":[42],"complex":[45],"cognitive":[46,116],"reasoning":[47],"is":[48],"not":[49],"merely":[50],"an":[51],"enhancement":[52],"but":[53],"critical":[55],"prerequisite":[56],"for":[57,83,128,169],"Autonomous":[58,141],"Surface":[59],"Vessels":[60],"execute":[62],"safe":[63],"precise":[65],"maneuvers.":[66],"To":[67],"this":[68],"end,":[69],"we":[70,119],"present":[71],"WaterVideoQA,":[72],"first":[74],"large-scale,":[75],"comprehensive":[76],"Video":[77],"Question":[78],"Answering":[79],"benchmark":[80,87],"specifically":[81],"engineered":[82],"all-waterway":[84],"environments.":[85,176],"This":[86],"encompasses":[88],"3,029":[89],"video":[90],"clips":[91],"across":[92,112],"six":[93],"distinct":[94],"waterway":[95],"categories,":[96],"integrating":[97],"multifaceted":[98],"variables":[99],"such":[100],"as":[101],"volatile":[102],"lighting":[103],"dynamic":[105,174],"weather":[106],"rigorously":[108],"stress-test":[109],"ASV":[110],"capabilities":[111],"five-tier":[114],"hierarchical":[115],"framework.":[117],"Furthermore,":[118],"introduce":[120],"NaviMind,":[121],"pioneering":[123],"multi-agent":[124],"neuro-symbolic":[125],"system":[126],"designed":[127],"open-ended":[129],"reasoning.":[131],"By":[132],"synergizing":[133],"Adaptive":[134],"Semantic":[135],"Routing,":[136],"Situation-Aware":[137],"Hierarchical":[138],"Reasoning,":[139],"Self-Reflective":[142],"Verification,":[143],"NaviMind":[144],"transitions":[145],"ASVs":[146],"from":[147],"superficial":[148],"pattern":[149],"matching":[150],"regulation-compliant,":[152],"interpretable":[153],"decision-making.":[154],"Experimental":[155],"results":[156],"demonstrate":[157],"that":[158],"our":[159],"framework":[160],"significantly":[161],"transcends":[162],"existing":[163],"baselines,":[164],"establishing":[165],"new":[167],"paradigm":[168],"intelligent,":[170],"trustworthy":[171],"interaction":[172]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-02-28T00:00:00"}
