{"id":"https://openalex.org/W7164856362","doi":"https://doi.org/10.1145/3805622.3810818","title":"RoATR: A Systematic Study of Audio-Text Retrieval Robustness Against Realistic Perturbations","display_name":"RoATR: A Systematic Study of Audio-Text Retrieval Robustness Against Realistic Perturbations","publication_year":2026,"publication_date":"2026-06-15","ids":{"openalex":"https://openalex.org/W7164856362","doi":"https://doi.org/10.1145/3805622.3810818"},"language":null,"primary_location":{"id":"doi:10.1145/3805622.3810818","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810818","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3805622.3810818","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100671939","display_name":"Honglei Zhang","orcid":"https://orcid.org/0000-0002-8229-852X"},"institutions":[{"id":"https://openalex.org/I881766915","display_name":"Nanjing University","ror":"https://ror.org/01rxvg760","country_code":"CN","type":"education","lineage":["https://openalex.org/I881766915"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Honglei Zhang","raw_affiliation_strings":["School of Software, Nanjing University, Nanjing, China"],"raw_orcid":"https://orcid.org/0009-0000-1711-3018","affiliations":[{"raw_affiliation_string":"School of Software, Nanjing University, Nanjing, China","institution_ids":["https://openalex.org/I881766915"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5025111079","display_name":"Pengfei Zhou","orcid":"https://orcid.org/0000-0003-1836-1122"},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Pengfei Zhou","raw_affiliation_strings":["School of Software, Northwestern Polytechnical University, Xi'an, China"],"raw_orcid":"https://orcid.org/0009-0004-9028-674X","affiliations":[{"raw_affiliation_string":"School of Software, Northwestern Polytechnical University, Xi'an, China","institution_ids":["https://openalex.org/I17145004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5053275235","display_name":"Ruohan Wang","orcid":"https://orcid.org/0009-0000-9152-582X"},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ruohan Wang","raw_affiliation_strings":["Northwestern Polytechnical University, School of Software, Xi'an, China"],"raw_orcid":"https://orcid.org/0009-0003-1957-3677","affiliations":[{"raw_affiliation_string":"Northwestern Polytechnical University, School of Software, Xi'an, China","institution_ids":["https://openalex.org/I17145004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5072995948","display_name":"Siyue Zhang","orcid":"https://orcid.org/0000-0001-9406-6745"},"institutions":[{"id":"https://openalex.org/I172675005","display_name":"Nanyang Technological University","ror":"https://ror.org/02e7b5302","country_code":"SG","type":"education","lineage":["https://openalex.org/I172675005"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Siyue Zhang","raw_affiliation_strings":["College of Computing and Data Science, Nanyang Technological University, Singapore, Singapore"],"raw_orcid":"https://orcid.org/0000-0001-9406-6745","affiliations":[{"raw_affiliation_string":"College of Computing and Data Science, Nanyang Technological University, Singapore, Singapore","institution_ids":["https://openalex.org/I172675005"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5061007704","display_name":"Yilei Shi","orcid":"https://orcid.org/0000-0001-7386-0026"},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yilei Shi","raw_affiliation_strings":["School of Software, Northwestern Polytechnical University, Xi'an, China"],"raw_orcid":"https://orcid.org/0000-0001-7386-0026","affiliations":[{"raw_affiliation_string":"School of Software, Northwestern Polytechnical University, Xi'an, China","institution_ids":["https://openalex.org/I17145004"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.9506583,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"167","last_page":"176"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.484499990940094,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.484499990940094,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.3743000030517578,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.06319999694824219,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.6826000213623047},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.4837999939918518},{"id":"https://openalex.org/keywords/perturbation","display_name":"Perturbation (astronomy)","score":0.33320000767707825},{"id":"https://openalex.org/keywords/probabilistic-logic","display_name":"Probabilistic logic","score":0.30559998750686646},{"id":"https://openalex.org/keywords/modalities","display_name":"Modalities","score":0.28769999742507935}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7059999704360962},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.6826000213623047},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.4837999939918518},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.38769999146461487},{"id":"https://openalex.org/C177918212","wikidata":"https://www.wikidata.org/wiki/Q803623","display_name":"Perturbation (astronomy)","level":2,"score":0.33320000767707825},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.30720001459121704},{"id":"https://openalex.org/C49937458","wikidata":"https://www.wikidata.org/wiki/Q2599292","display_name":"Probabilistic logic","level":2,"score":0.30559998750686646},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.28769999742507935},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2630999982357025},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.2442999929189682}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3805622.3810818","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810818","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3805622.3810818","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810818","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.4726071059703827}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":29,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W1984076147","https://openalex.org/W1989314204","https://openalex.org/W2023217403","https://openalex.org/W2052666245","https://openalex.org/W2146334809","https://openalex.org/W2612601884","https://openalex.org/W2889787757","https://openalex.org/W2912924812","https://openalex.org/W2963957489","https://openalex.org/W2963961878","https://openalex.org/W2972541922","https://openalex.org/W3015591594","https://openalex.org/W3020336359","https://openalex.org/W3160475509","https://openalex.org/W3163573274","https://openalex.org/W3176445421","https://openalex.org/W3209984917","https://openalex.org/W3211424380","https://openalex.org/W4221156109","https://openalex.org/W4284898017","https://openalex.org/W4386488973","https://openalex.org/W4400033239","https://openalex.org/W4402670856","https://openalex.org/W4411119258","https://openalex.org/W4412945327","https://openalex.org/W4415432847","https://openalex.org/W4415539635","https://openalex.org/W4416036849"],"related_works":[],"abstract_inverted_index":{"Multimodal":[0],"retrieval":[1,41],"enables":[2],"effective":[3],"access":[4],"to":[5,143],"information":[6],"across":[7,135],"diverse":[8],"modalities":[9],"such":[10],"as":[11,177],"text,":[12],"images,":[13],"audio,":[14],"and":[15,39,51,120,131],"video.":[16],"Audio\u2013Text":[17],"Retrieval":[18],"(ATR),":[19],"which":[20,179],"retrieves":[21],"relevant":[22],"text":[23],"given":[24],"an":[25,181],"audio":[26,50],"query,":[27],"is":[28],"a":[29,55,85,144],"key":[30],"component":[31],"in":[32,54,100,148,188],"real-world":[33],"applications":[34],"including":[35,115],"voice-enabled":[36],"AI":[37],"assistants":[38],"audio-integrated":[40],"systems.":[42],"Existing":[43],"ATR":[44,93,127,161],"methods":[45],"primarily":[46],"focus":[47],"on":[48,125,169],"aligning":[49],"textual":[52],"semantics":[53],"shared":[56],"embedding":[57],"space":[58],"through":[59],"contrastive":[60],"learning":[61],"or":[62],"multimodal":[63],"large":[64],"language":[65],"model":[66],"backbones,":[67],"while":[68,150],"their":[69],"robustness":[70,98,159],"under":[71,153],"realistic":[72,112],"acoustic":[73],"conditions":[74],"remains":[75],"largely":[76],"unexplored.":[77],"To":[78,156],"bridge":[79],"this":[80],"gap,":[81],"we":[82,163],"introduce":[83],"RoATR,":[84,178],"diagnostic":[86],"benchmark":[87],"for":[88],"evaluating":[89],"the":[90,158,174],"Robustness":[91],"of":[92,160,184],"systems,":[94],"inspired":[95],"by":[96],"prior":[97],"studies":[99],"automatic":[101],"speech":[102],"recognition.":[103],"RoATR":[104],"defines":[105],"five":[106],"perturbation":[107,113,136],"categories":[108],"that":[109],"cover":[110],"eleven":[111],"types,":[114],"background":[116],"noise,":[117],"speaker":[118],"variability,":[119],"device":[121],"distortion.":[122],"Extensive":[123],"experiments":[124],"state-of-the-art":[126],"models":[128],"reveal":[129],"substantial":[130],"heterogeneous":[132],"performance":[133],"degradation":[134,152],"types.":[137],"For":[138],"example,":[139],"LCO-Embedding":[140],"suffers":[141],"up":[142],"28.0%":[145],"absolute":[146],"drop":[147],"Recall@1":[149],"no":[151],"city":[154],"noise.":[155],"improve":[157],"models,":[162],"further":[164],"propose":[165],"perturbation-augmented":[166],"training":[167],"based":[168],"perturbed":[170],"data":[171],"curated":[172],"following":[173],"same":[175],"pipeline":[176],"achieves":[180],"average":[182],"improvement":[183],"16.0":[185],"percentage":[186],"points":[187],"Recall@1.":[189]},"counts_by_year":[],"updated_date":"2026-06-16T07:37:23.134862","created_date":"2026-06-16T00:00:00"}
