{"id":"https://openalex.org/W7134823371","doi":"https://doi.org/10.48550/arxiv.2603.08224","title":"SAVE: Speech-Aware Video Representation Learning for Video-Text Retrieval","display_name":"SAVE: Speech-Aware Video Representation Learning for Video-Text Retrieval","publication_year":2026,"publication_date":"2026-03-09","ids":{"openalex":"https://openalex.org/W7134823371","doi":"https://doi.org/10.48550/arxiv.2603.08224"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.08224","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.08224","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"public-domain","license_id":"https://openalex.org/licenses/public-domain","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.08224","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5122219279","display_name":"Ruixiang Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Ruixiang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128638450","display_name":"Zhihao Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Zhihao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128634717","display_name":"Bangxiang Lan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lan, Bangxiang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113398435","display_name":"Zijie Xin","orcid":"https://orcid.org/0000-0002-9220-8735"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xin, Zijie","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128671646","display_name":"Jingyu Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Jingyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5060270456","display_name":"Xirong Li","orcid":"https://orcid.org/0000-0002-0220-8310"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Xirong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.40149998664855957,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.40149998664855957,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.35339999198913574,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.05429999902844429,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.6123999953269958},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.5989999771118164},{"id":"https://openalex.org/keywords/feature-learning","display_name":"Feature learning","score":0.44179999828338623},{"id":"https://openalex.org/keywords/face","display_name":"Face (sociological concept)","score":0.33559998869895935},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.31850001215934753},{"id":"https://openalex.org/keywords/voice-activity-detection","display_name":"Voice activity detection","score":0.29739999771118164},{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.27790001034736633}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8377000093460083},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6236000061035156},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.6123999953269958},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.5989999771118164},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5458999872207642},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.44179999828338623},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.362199991941452},{"id":"https://openalex.org/C2779304628","wikidata":"https://www.wikidata.org/wiki/Q3503480","display_name":"Face (sociological concept)","level":2,"score":0.33559998869895935},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.32820001244544983},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.31850001215934753},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.29739999771118164},{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.27790001034736633},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.2728999853134155},{"id":"https://openalex.org/C2992317946","wikidata":"https://www.wikidata.org/wiki/Q712144","display_name":"De facto","level":2,"score":0.2667999863624573},{"id":"https://openalex.org/C54953205","wikidata":"https://www.wikidata.org/wiki/Q4142201","display_name":"Speech analytics","level":4,"score":0.26660001277923584},{"id":"https://openalex.org/C2776182073","wikidata":"https://www.wikidata.org/wiki/Q7575395","display_name":"Speech enhancement","level":3,"score":0.2644999921321869},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.2542000114917755},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.2515000104904175}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.08224","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.08224","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"public-domain","license_id":"https://openalex.org/licenses/public-domain","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.08224","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.08224","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"public-domain","license_id":"https://openalex.org/licenses/public-domain","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.5422266721725464,"display_name":"Peace, Justice and strong institutions","id":"https://metadata.un.org/sdg/16"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"For":[0],"video-text":[1],"retrieval,":[2],"the":[3,32,130,154],"use":[4],"of":[5,35,68,153],"CLIP":[6,14],"has":[7,23],"been":[8,41],"a":[9,26,83,94,99],"de":[10],"facto":[11],"choice.":[12],"Since":[13],"provides":[15],"only":[16],"image":[17],"and":[18,53,71,147],"text":[19],"encoders,":[20],"this":[21],"consensus":[22],"led":[24],"to":[25,43],"biased":[27],"paradigm":[28],"that":[29,116,125],"entirely":[30],"ignores":[31],"sound":[33],"track":[34],"videos.":[36],"While":[37],"several":[38],"attempts":[39],"have":[40],"made":[42],"reintroduce":[44],"audio":[45,51],"--":[46,60],"typically":[47],"by":[48,134],"incorporating":[49],"an":[50],"encoder":[52],"fusing":[54],"its":[55],"output":[56],"with":[57,98],"visual":[58],"features":[59],"these":[61,77],"methods":[62],"face":[63],"two":[64],"challenges:":[65],"ineffective":[66],"representation":[67],"speech":[69,101,106],"content":[70],"suboptimal":[72],"vision-audio":[73,114],"fusion.":[74,118],"To":[75],"address":[76],"issues":[78],"jointly,":[79],"we":[80,109],"propose":[81],"SAVE,":[82],"Speech":[84],"Aware":[85],"Video":[86],"rEpresentation":[87],"learning":[88],"method.":[89],"SAVE":[90,126],"improves":[91],"upon":[92],"AVIGATE,":[93],"SOTA":[95],"audiovisual":[96],"method,":[97],"dedicated":[100],"branch":[102],"for":[103,112],"more":[104],"effective":[105],"embedding.":[107],"Furthermore,":[108],"introduce":[110],"soft-ALBEF":[111],"early":[113],"alignment":[115],"facilitates":[117],"Extensive":[119],"experiments":[120],"on":[121,136,139,142,145,149],"five":[122],"benchmarks":[123],"show":[124],"compares":[127],"favorably":[128],"against":[129],"SOTA,":[131],"outperforming":[132],"AVIGATE":[133],"+4.1%":[135],"MSRVTT-9k,":[137],"+1.9%":[138],"MSRVTT-7k,":[140],"+2.5%":[141],"VATEX,":[143],"+9.8%":[144],"Charades,":[146],"+2.1%":[148],"LSMDC,":[150],"in":[151],"light":[152],"SumR":[155],"metric.":[156]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-03-11T00:00:00"}
