{"id":"https://openalex.org/W4412081439","doi":"https://doi.org/10.1109/lsp.2025.3586552","title":"X-STA: Cross-Modal Spatial-Temporal Alignment Network for Unified Audio-Visual Segmentation","display_name":"X-STA: Cross-Modal Spatial-Temporal Alignment Network for Unified Audio-Visual Segmentation","publication_year":2025,"publication_date":"2025-01-01","ids":{"openalex":"https://openalex.org/W4412081439","doi":"https://doi.org/10.1109/lsp.2025.3586552"},"language":"en","primary_location":{"id":"doi:10.1109/lsp.2025.3586552","is_oa":false,"landing_page_url":"https://doi.org/10.1109/lsp.2025.3586552","pdf_url":null,"source":{"id":"https://openalex.org/S120629676","display_name":"IEEE Signal Processing Letters","issn_l":"1070-9908","issn":["1070-9908","1558-2361"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Signal Processing Letters","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5011876855","display_name":"Hanyu Xuan","orcid":"https://orcid.org/0000-0002-4633-2794"},"institutions":[{"id":"https://openalex.org/I143868143","display_name":"Anhui University","ror":"https://ror.org/05th6yx34","country_code":"CN","type":"education","lineage":["https://openalex.org/I143868143"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Hanyu Xuan","raw_affiliation_strings":["School of Big Data and Statistics, Anhui University, Hefei, China"],"affiliations":[{"raw_affiliation_string":"School of Big Data and Statistics, Anhui University, Hefei, China","institution_ids":["https://openalex.org/I143868143"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5081564168","display_name":"Tongxing Liu","orcid":"https://orcid.org/0009-0009-6978-4943"},"institutions":[{"id":"https://openalex.org/I143868143","display_name":"Anhui University","ror":"https://ror.org/05th6yx34","country_code":"CN","type":"education","lineage":["https://openalex.org/I143868143"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Tongxing Liu","raw_affiliation_strings":["School of Big Data and Statistics, Anhui University, Hefei, China"],"affiliations":[{"raw_affiliation_string":"School of Big Data and Statistics, Anhui University, Hefei, China","institution_ids":["https://openalex.org/I143868143"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5043669751","display_name":"Wenxiang Dong","orcid":"https://orcid.org/0000-0002-0272-1069"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wenxiang Dong","raw_affiliation_strings":["Institute of Dataspace, Hefei Comprehensive National Science Center, Hefei, China"],"affiliations":[{"raw_affiliation_string":"Institute of Dataspace, Hefei Comprehensive National Science Center, Hefei, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5054072550","display_name":"Zhongheng Li","orcid":"https://orcid.org/0000-0001-7091-9600"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhongheng Li","raw_affiliation_strings":["School of Cyberspace Security, University of Science and Technology of China, Hefei, Anhui, China"],"affiliations":[{"raw_affiliation_string":"School of Cyberspace Security, University of Science and Technology of China, Hefei, Anhui, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5078901325","display_name":"Shuo Chen","orcid":"https://orcid.org/0000-0001-8140-0409"},"institutions":[{"id":"https://openalex.org/I308837","display_name":"Suzhou University of Science and Technology","ror":"https://ror.org/04en8wb91","country_code":"CN","type":"education","lineage":["https://openalex.org/I308837"]},{"id":"https://openalex.org/I881766915","display_name":"Nanjing University","ror":"https://ror.org/01rxvg760","country_code":"CN","type":"education","lineage":["https://openalex.org/I881766915"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shuo Chen","raw_affiliation_strings":["School of Intelligence Science and Technology, Nanjing University, Suzhou, Jiangsu, China"],"affiliations":[{"raw_affiliation_string":"School of Intelligence Science and Technology, Nanjing University, Suzhou, Jiangsu, China","institution_ids":["https://openalex.org/I308837","https://openalex.org/I881766915"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5011876855"],"corresponding_institution_ids":["https://openalex.org/I143868143"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.23281119,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"32","issue":null,"first_page":"2883","last_page":"2887"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9969000220298767,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.987500011920929,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7720999717712402},{"id":"https://openalex.org/keywords/audio-visual","display_name":"Audio visual","score":0.7091145515441895},{"id":"https://openalex.org/keywords/modal","display_name":"Modal","score":0.7051838636398315},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.5996843576431274},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5094216465950012},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.49173906445503235},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.4769924283027649},{"id":"https://openalex.org/keywords/audio-signal-processing","display_name":"Audio signal processing","score":0.4251485764980316},{"id":"https://openalex.org/keywords/audio-signal","display_name":"Audio signal","score":0.21828508377075195},{"id":"https://openalex.org/keywords/speech-coding","display_name":"Speech coding","score":0.20097926259040833},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.15611806511878967}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7720999717712402},{"id":"https://openalex.org/C3017588708","wikidata":"https://www.wikidata.org/wiki/Q758901","display_name":"Audio visual","level":2,"score":0.7091145515441895},{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.7051838636398315},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.5996843576431274},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5094216465950012},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.49173906445503235},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.4769924283027649},{"id":"https://openalex.org/C127220857","wikidata":"https://www.wikidata.org/wiki/Q2719318","display_name":"Audio signal processing","level":4,"score":0.4251485764980316},{"id":"https://openalex.org/C64922751","wikidata":"https://www.wikidata.org/wiki/Q4650799","display_name":"Audio signal","level":3,"score":0.21828508377075195},{"id":"https://openalex.org/C13895895","wikidata":"https://www.wikidata.org/wiki/Q3270773","display_name":"Speech coding","level":2,"score":0.20097926259040833},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.15611806511878967},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C188027245","wikidata":"https://www.wikidata.org/wiki/Q750446","display_name":"Polymer chemistry","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/lsp.2025.3586552","is_oa":false,"landing_page_url":"https://doi.org/10.1109/lsp.2025.3586552","pdf_url":null,"source":{"id":"https://openalex.org/S120629676","display_name":"IEEE Signal Processing Letters","issn_l":"1070-9908","issn":["1070-9908","1558-2361"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Signal Processing Letters","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G4302041946","display_name":null,"funder_award_id":"62302006","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G8579109809","display_name":null,"funder_award_id":"2308085QF221","funder_id":"https://openalex.org/F4320334897","funder_display_name":"Natural Science Foundation of Anhui Province"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320334897","display_name":"Natural Science Foundation of Anhui Province","ror":null}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":44,"referenced_works":["https://openalex.org/W2526050071","https://openalex.org/W2605670418","https://openalex.org/W2964109005","https://openalex.org/W2990113535","https://openalex.org/W2997909293","https://openalex.org/W3080686309","https://openalex.org/W3105352633","https://openalex.org/W3108367559","https://openalex.org/W3110606395","https://openalex.org/W3118120400","https://openalex.org/W3153906112","https://openalex.org/W3170088426","https://openalex.org/W3170630188","https://openalex.org/W3198371949","https://openalex.org/W3212022073","https://openalex.org/W3214311327","https://openalex.org/W4226206782","https://openalex.org/W4226314236","https://openalex.org/W4236362309","https://openalex.org/W4285606400","https://openalex.org/W4312415534","https://openalex.org/W4368754857","https://openalex.org/W4386066541","https://openalex.org/W4386072368","https://openalex.org/W4386083110","https://openalex.org/W4390190100","https://openalex.org/W4390872203","https://openalex.org/W4390874575","https://openalex.org/W4391594029","https://openalex.org/W4393159092","https://openalex.org/W4393159433","https://openalex.org/W4393160420","https://openalex.org/W4393178524","https://openalex.org/W4394593115","https://openalex.org/W4394625876","https://openalex.org/W4406657543","https://openalex.org/W6757817989","https://openalex.org/W6765831684","https://openalex.org/W6783539077","https://openalex.org/W6793746569","https://openalex.org/W6803674551","https://openalex.org/W6804185262","https://openalex.org/W6840058269","https://openalex.org/W6852820501"],"related_works":["https://openalex.org/W2271369634","https://openalex.org/W3147472394","https://openalex.org/W2047100085","https://openalex.org/W2350550760","https://openalex.org/W578794879","https://openalex.org/W2625296515","https://openalex.org/W3137890128","https://openalex.org/W1984634519","https://openalex.org/W4245955731","https://openalex.org/W2393726419"],"abstract_inverted_index":{"Audio-Visual":[0],"Segmentation":[1],"(<italic":[2,82,105,123,140],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[3,83,106,124,141],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">AVS</i>)":[4],"aims":[5],"to":[6,61,69,86],"segment":[7],"sound":[8,24,165],"sources":[9,25],"from":[10,111],"video":[11],"frames":[12,27],"using":[13],"synchronized":[14],"audio":[15],"cues.":[16,174],"This":[17],"task":[18,71],"requires":[19],"not":[20],"only":[21],"localizing":[22],"the":[23,164,169,192],"within":[26],"but":[28],"also":[29],"accurately":[30],"delineating":[31],"their":[32,59],"shapes.":[33],"Existing":[34],"AVS":[35,131,185],"methods":[36],"often":[37],"rely":[38],"on":[39,183],"assumptions":[40],"of":[41,163,171,194],"spatial-temporal":[42,88,173],"consistency":[43],"between":[44],"audio-visual":[45],"content":[46],"and":[47,66,67,90,136,148,160],"are":[48],"typically":[49],"designed":[50],"for":[51,130],"specific":[52],"learning":[53],"paradigms.":[54],"However,":[55],"this":[56,74],"specialization":[57],"limits":[58],"ability":[60],"handle":[62],"multi-granularity":[63],"supervision":[64],"signals":[65],"adapt":[68],"diverse":[70],"requirements.":[72],"For":[73],"purpose,":[75],"we":[76],"propose":[77],"a":[78,100,112,137],"Cross-modal":[79,103,121,138],"Spatial-Temporal":[80],"Alignment":[81],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">X-STA</i>)":[84],"network":[85],"alleviate":[87],"inconsistency":[89],"overcome":[91],"paradigm-specific":[92],"constraint.":[93],"Our":[94,175],"X-STA":[95],"introduces":[96],"three":[97],"key":[98],"components:":[99],"novel":[101],"multi-stage":[102],"Adapter":[104],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">xAdapter</i>)":[107],"that":[108,126,144],"transfers":[109],"knowledge":[110],"pre-trained":[113],"SAM":[114],"through":[115,132],"multi-grained":[116],"representation":[117],"adaptation,":[118],"an":[119],"innovative":[120],"Prompter":[122],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">xPrompter</i>)":[125],"provides":[127],"geometry-aware":[128],"constraints":[129],"dynamic":[133],"prompting":[134],"strategies,":[135],"Self-supervised":[139],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">xSelf</i>)":[142],"mechanism":[143],"refines":[145],"temporal":[146],"alignment":[147,170],"enables":[149],"self-supervised":[150],"AVS.":[151,195],"These":[152],"components":[153],"collectively":[154],"facilitate":[155],"explicit":[156],"reasoning":[157],"about":[158],"location":[159],"geometric":[161],"shape":[162],"source":[166],"by":[167],"refining":[168],"cross-modal":[172],"method":[176],"achieves":[177],"competitive":[178],"performance":[179],"across":[180],"several":[181],"baselines":[182],"widely-used":[184],"datasets,":[186],"demonstrating":[187],"its":[188],"effectiveness":[189],"in":[190],"addressing":[191],"complexities":[193]},"counts_by_year":[],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
