{"id":"https://openalex.org/W7133325740","doi":"https://doi.org/10.48550/arxiv.2603.01431","title":"SeaVIS: Sound-Enhanced Association for Online Audio-Visual Instance Segmentation","display_name":"SeaVIS: Sound-Enhanced Association for Online Audio-Visual Instance Segmentation","publication_year":2026,"publication_date":"2026-03-02","ids":{"openalex":"https://openalex.org/W7133325740","doi":"https://doi.org/10.48550/arxiv.2603.01431"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.01431","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.01431","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.01431","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5103720145","display_name":"Yingjian Zhu","orcid":"https://orcid.org/0009-0003-5538-0645"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhu, Yingjian","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127965172","display_name":"Ying Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Ying","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5122762722","display_name":"yuyang hong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hong, Yuyang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5088611866","display_name":"Ruohao Guo","orcid":"https://orcid.org/0000-0002-1091-272X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guo, Ruohao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5122843026","display_name":"Kun Ding","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ding, Kun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127930370","display_name":"Xin Gu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gu, Xin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127969591","display_name":"Bin Fan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fan, Bin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5127873919","display_name":"Shiming Xiang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiang, Shiming","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.38600000739097595,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.38600000739097595,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.33090001344680786,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.0723000019788742,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.6848000288009644},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.609000027179718},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5440000295639038},{"id":"https://openalex.org/keywords/association","display_name":"Association (psychology)","score":0.46869999170303345},{"id":"https://openalex.org/keywords/frame","display_name":"Frame (networking)","score":0.45570001006126404},{"id":"https://openalex.org/keywords/encode","display_name":"ENCODE","score":0.42590001225471497}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.79339998960495},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.6848000288009644},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6406000256538391},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.609000027179718},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5440000295639038},{"id":"https://openalex.org/C142853389","wikidata":"https://www.wikidata.org/wiki/Q744778","display_name":"Association (psychology)","level":2,"score":0.46869999170303345},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4575999975204468},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.45570001006126404},{"id":"https://openalex.org/C66746571","wikidata":"https://www.wikidata.org/wiki/Q1134833","display_name":"ENCODE","level":3,"score":0.42590001225471497},{"id":"https://openalex.org/C193524817","wikidata":"https://www.wikidata.org/wiki/Q386780","display_name":"Association rule learning","level":2,"score":0.3783000111579895},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.3303999900817871},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.32899999618530273},{"id":"https://openalex.org/C2777851325","wikidata":"https://www.wikidata.org/wiki/Q7094102","display_name":"Online model","level":2,"score":0.29109999537467957},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.2800999879837036},{"id":"https://openalex.org/C124504099","wikidata":"https://www.wikidata.org/wiki/Q56933","display_name":"Image segmentation","level":3,"score":0.2768999934196472},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.25220000743865967}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.01431","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.01431","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.01431","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.01431","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Recently,":[0],"an":[1,111,130],"audio-visual":[2,61],"instance":[3,62,105,138,168],"segmentation":[4,121],"(AVIS)":[5],"task":[6],"has":[7],"been":[8],"introduced,":[9],"aiming":[10],"to":[11,73,108,136],"identify,":[12],"segment":[13],"and":[14,114],"track":[15],"individual":[16],"sounding":[17,113,148],"instances":[18,33,153],"in":[19,118],"videos.":[20],"However,":[21],"prevailing":[22],"methods":[23,101],"primarily":[24],"adopt":[25],"the":[26,55,66,83,87,119,174,183],"offline":[27],"paradigm,":[28],"that":[29,43,103,140,158,187],"cannot":[30],"associate":[31],"detected":[32],"across":[34,193],"consecutive":[35],"clips,":[36],"making":[37],"them":[38],"unsuitable":[39],"for":[40,60,98,204],"real-world":[41],"scenarios":[42],"involve":[44],"continuous":[45],"video":[46],"streams.":[47],"To":[48,125],"address":[49],"this":[50,151],"limitation,":[51],"we":[52,128],"introduce":[53],"SeaVIS,":[54],"first":[56],"online":[57,76],"framework":[58],"designed":[59],"segmentation.":[63],"SeaVIS":[64,188],"leverages":[65],"Causal":[67],"Cross":[68],"Attention":[69],"Fusion":[70],"(CCAF)":[71],"module":[72],"enable":[74],"efficient":[75],"processing,":[77],"which":[78],"integrates":[79],"visual":[80,144],"features":[81],"from":[82],"current":[84],"frame":[85],"with":[86],"entire":[88],"audio":[89],"history":[90],"under":[91],"strict":[92],"causal":[93],"constraints.":[94],"A":[95],"major":[96],"challenge":[97],"conventional":[99],"VIS":[100],"is":[102],"appearance-based":[104],"association":[106,169],"fails":[107],"distinguish":[109],"between":[110],"object's":[112],"silent":[115,123],"states,":[116],"resulting":[117],"incorrect":[120],"of":[122,177],"objects.":[124],"tackle":[126],"this,":[127],"employ":[129],"Audio-Guided":[131],"Contrastive":[132],"Learning":[133],"(AGCL)":[134],"strategy":[135],"generate":[137],"prototypes":[139],"encode":[141],"not":[142,160],"only":[143],"appearance":[145],"but":[146],"also":[147],"activity.":[149],"In":[150],"way,":[152],"preserved":[154],"during":[155,167],"per-frame":[156],"prediction":[157],"do":[159],"emit":[161],"sound":[162],"can":[163],"be":[164],"effectively":[165],"suppressed":[166],"process,":[170],"thereby":[171],"significantly":[172],"enhancing":[173],"audio-following":[175],"capability":[176],"SeaVIS.":[178],"Extensive":[179],"experiments":[180],"conducted":[181],"on":[182],"AVISeg":[184],"dataset":[185],"demonstrate":[186],"surpasses":[189],"existing":[190],"state-of-the-art":[191],"models":[192],"multiple":[194],"evaluation":[195],"metrics":[196],"while":[197],"maintaining":[198],"a":[199],"competitive":[200],"inference":[201],"speed":[202],"suitable":[203],"real-time":[205],"processing.":[206]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-03-04T00:00:00"}
