{"id":"https://openalex.org/W7157313371","doi":"https://doi.org/10.48550/arxiv.2604.23323","title":"Robust Audio-Text Retrieval via Cross-Modal Attention and Hybrid Loss","display_name":"Robust Audio-Text Retrieval via Cross-Modal Attention and Hybrid Loss","publication_year":2026,"publication_date":"2026-04-25","ids":{"openalex":"https://openalex.org/W7157313371","doi":"https://doi.org/10.48550/arxiv.2604.23323"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.23323","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.23323","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.23323","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5016087081","display_name":"Meizhu Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Liu, Meizhu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134814264","display_name":"Matthew Rowe","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rowe, Matthew","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134820273","display_name":"Amit Agarwal","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Agarwal, Amit","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134765530","display_name":"Michael Avendi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Avendi, Michael","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134764661","display_name":"Yassi Abbasi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Abbasi, Yassi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134755916","display_name":"Hitesh Laxmichand Patel","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Patel, Hitesh Laxmichand","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134796675","display_name":"Paul Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Paul","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5106859201","display_name":"Kyu J. Han","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Han, Kyu J.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134788831","display_name":"Tao Sheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sheng, Tao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134811812","display_name":"Sujith Ravi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ravi, Sujith","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5134799241","display_name":"Dan Roth","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Roth, Dan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":11,"corresponding_author_ids":["https://openalex.org/A5016087081"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.8677999973297119,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.8677999973297119,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.05209999904036522,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.052000001072883606,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/chunking","display_name":"Chunking (psychology)","score":0.7099000215530396},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.6119999885559082},{"id":"https://openalex.org/keywords/cosine-similarity","display_name":"Cosine similarity","score":0.45840001106262207},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.4551999866962433},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.383899986743927},{"id":"https://openalex.org/keywords/discrete-cosine-transform","display_name":"Discrete cosine transform","score":0.36239999532699585},{"id":"https://openalex.org/keywords/function","display_name":"Function (biology)","score":0.34310001134872437},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.3249000012874603}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8212000131607056},{"id":"https://openalex.org/C203357204","wikidata":"https://www.wikidata.org/wiki/Q1089605","display_name":"Chunking (psychology)","level":2,"score":0.7099000215530396},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.6119999885559082},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5597000122070312},{"id":"https://openalex.org/C2780762811","wikidata":"https://www.wikidata.org/wiki/Q1784941","display_name":"Cosine similarity","level":3,"score":0.45840001106262207},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.4551999866962433},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4235999882221222},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.384799987077713},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.383899986743927},{"id":"https://openalex.org/C2221639","wikidata":"https://www.wikidata.org/wiki/Q2877","display_name":"Discrete cosine transform","level":3,"score":0.36239999532699585},{"id":"https://openalex.org/C14036430","wikidata":"https://www.wikidata.org/wiki/Q3736076","display_name":"Function (biology)","level":2,"score":0.34310001134872437},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.3249000012874603},{"id":"https://openalex.org/C178009071","wikidata":"https://www.wikidata.org/wiki/Q93344","display_name":"Trigonometric functions","level":2,"score":0.32429999113082886},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.29429998993873596},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.29409998655319214},{"id":"https://openalex.org/C3073032","wikidata":"https://www.wikidata.org/wiki/Q15912075","display_name":"Information hiding","level":3,"score":0.2863999903202057},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.2858999967575073},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.27900001406669617},{"id":"https://openalex.org/C2777462759","wikidata":"https://www.wikidata.org/wiki/Q18395344","display_name":"Word embedding","level":3,"score":0.27810001373291016},{"id":"https://openalex.org/C2983174267","wikidata":"https://www.wikidata.org/wiki/Q3775098","display_name":"Video retrieval","level":2,"score":0.2667999863624573},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.25450000166893005},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.2531000077724457}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.23323","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.23323","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.23323","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.23323","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Quality Education","score":0.7279878258705139,"id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Audio-text":[0],"retrieval":[1,47],"enables":[2],"semantic":[3],"alignment":[4],"between":[5],"audio":[6,31,51,100],"content":[7],"and":[8,18,28,39,52,66,83,98,108],"natural":[9],"language":[10],"queries,":[11],"supporting":[12],"applications":[13],"in":[14],"multimedia":[15],"search,":[16],"accessibility,":[17],"surveillance.":[19],"However,":[20],"current":[21],"state-of-the-art":[22],"approaches":[23],"struggle":[24],"with":[25],"long,":[26],"noisy,":[27],"weakly":[29],"labeled":[30],"due":[32],"to":[33,103],"their":[34],"reliance":[35],"on":[36,112],"contrastive":[37,84],"learning":[38],"large-batch":[40],"training.":[41],"We":[42],"propose":[43],"a":[44,56,75],"novel":[45],"multimodal":[46],"framework":[48],"that":[49],"refines":[50],"text":[53],"embeddings":[54],"using":[55],"cross-modal":[57],"embedding":[58],"refinement":[59],"module":[60],"combining":[61],"transformer-based":[62],"projection,":[63],"linear":[64],"mapping,":[65],"bidirectional":[67],"attention.":[68],"To":[69],"further":[70],"improve":[71],"robustness,":[72],"we":[73],"introduce":[74],"hybrid":[76],"loss":[77],"function":[78],"blending":[79],"cosine":[80],"similarity,":[81],"$\\mathcal{L}_{1}$,":[82],"objectives,":[85],"enabling":[86],"stable":[87],"training":[88],"even":[89],"under":[90],"small-batch":[91],"constraints.":[92],"Our":[93],"approach":[94],"efficiently":[95],"handles":[96],"long-form":[97],"noisy":[99],"(SNR":[101],"5":[102],"15)":[104],"via":[105],"silence-aware":[106],"chunking":[107],"attention-based":[109],"pooling.":[110],"Experiments":[111],"benchmark":[113],"datasets":[114],"demonstrate":[115],"improvements":[116],"over":[117],"prior":[118],"methods.":[119]},"counts_by_year":[],"updated_date":"2026-05-05T08:41:31.759640","created_date":"2026-04-29T00:00:00"}
