{"id":"https://openalex.org/W7108228557","doi":"https://doi.org/10.1109/lsp.2025.3639371","title":"Listening for \u201cYou\u201d: Enhancing Speech Image Retrieval via Target Speaker Extraction","display_name":"Listening for \u201cYou\u201d: Enhancing Speech Image Retrieval via Target Speaker Extraction","publication_year":2025,"publication_date":"2025-12-02","ids":{"openalex":"https://openalex.org/W7108228557","doi":"https://doi.org/10.1109/lsp.2025.3639371"},"language":null,"primary_location":{"id":"doi:10.1109/lsp.2025.3639371","is_oa":false,"landing_page_url":"https://doi.org/10.1109/lsp.2025.3639371","pdf_url":null,"source":{"id":"https://openalex.org/S120629676","display_name":"IEEE Signal Processing Letters","issn_l":"1070-9908","issn":["1070-9908","1558-2361"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Signal Processing Letters","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Wenhao Yang","orcid":"https://orcid.org/0000-0003-1019-4446"},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Wenhao Yang","raw_affiliation_strings":["College of Intelligence and Computing, Tianjin University, Tianjin, China"],"raw_orcid":"https://orcid.org/0000-0003-1019-4446","affiliations":[{"raw_affiliation_string":"College of Intelligence and Computing, Tianjin University, Tianjin, China","institution_ids":["https://openalex.org/I162868743"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Jianguo Wei","orcid":"https://orcid.org/0000-0002-8964-9759"},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jianguo Wei","raw_affiliation_strings":["College of Intelligence and Computing, Tianjin University, Tianjin, China"],"raw_orcid":"https://orcid.org/0000-0002-8964-9759","affiliations":[{"raw_affiliation_string":"College of Intelligence and Computing, Tianjin University, Tianjin, China","institution_ids":["https://openalex.org/I162868743"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Wenhuan Lu","orcid":"https://orcid.org/0000-0002-7951-8907"},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wenhuan Lu","raw_affiliation_strings":["College of Intelligence and Computing, Tianjin University, Tianjin, China"],"raw_orcid":"https://orcid.org/0000-0002-7951-8907","affiliations":[{"raw_affiliation_string":"College of Intelligence and Computing, Tianjin University, Tianjin, China","institution_ids":["https://openalex.org/I162868743"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Xinyue Song","orcid":null},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xinyue Song","raw_affiliation_strings":["College of Intelligence and Computing, Tianjin University, Tianjin, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"College of Intelligence and Computing, Tianjin University, Tianjin, China","institution_ids":["https://openalex.org/I162868743"]}]},{"author_position":"last","author":{"id":null,"display_name":"Xianghu Yue","orcid":"https://orcid.org/0000-0003-3527-6034"},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xianghu Yue","raw_affiliation_strings":["College of Intelligence and Computing, Tianjin University, Tianjin, China"],"raw_orcid":"https://orcid.org/0000-0003-3527-6034","affiliations":[{"raw_affiliation_string":"College of Intelligence and Computing, Tianjin University, Tianjin, China","institution_ids":["https://openalex.org/I162868743"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I162868743"],"apc_list":null,"apc_paid":null,"fwci":0.9874,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.83259799,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":95,"max":98},"biblio":{"volume":"33","issue":null,"first_page":"201","last_page":"205"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.3522000014781952,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.3522000014781952,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.2134000062942505,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.1826000064611435,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5184000134468079},{"id":"https://openalex.org/keywords/feature-extraction","display_name":"Feature extraction","score":0.48249998688697815},{"id":"https://openalex.org/keywords/image-retrieval","display_name":"Image retrieval","score":0.43309998512268066},{"id":"https://openalex.org/keywords/extractor","display_name":"Extractor","score":0.399399995803833},{"id":"https://openalex.org/keywords/active-listening","display_name":"Active listening","score":0.3939000070095062},{"id":"https://openalex.org/keywords/task-analysis","display_name":"Task analysis","score":0.3686999976634979},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.3659999966621399},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.3504999876022339}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8589000105857849},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6603999733924866},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5893999934196472},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5184000134468079},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.48249998688697815},{"id":"https://openalex.org/C1667742","wikidata":"https://www.wikidata.org/wiki/Q10927554","display_name":"Image retrieval","level":3,"score":0.43309998512268066},{"id":"https://openalex.org/C117978034","wikidata":"https://www.wikidata.org/wiki/Q5422192","display_name":"Extractor","level":2,"score":0.399399995803833},{"id":"https://openalex.org/C177291462","wikidata":"https://www.wikidata.org/wiki/Q423038","display_name":"Active listening","level":2,"score":0.3939000070095062},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.3686999976634979},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.3659999966621399},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.3504999876022339},{"id":"https://openalex.org/C45273575","wikidata":"https://www.wikidata.org/wiki/Q578970","display_name":"Spectrogram","level":2,"score":0.33329999446868896},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.33160001039505005},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.3098999857902527},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.30709999799728394},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3012999892234802},{"id":"https://openalex.org/C34413123","wikidata":"https://www.wikidata.org/wiki/Q170978","display_name":"Robotics","level":3,"score":0.288100004196167},{"id":"https://openalex.org/C90509273","wikidata":"https://www.wikidata.org/wiki/Q11012","display_name":"Robot","level":2,"score":0.2842999994754791},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.28299999237060547},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.2782000005245209},{"id":"https://openalex.org/C2221639","wikidata":"https://www.wikidata.org/wiki/Q2877","display_name":"Discrete cosine transform","level":3,"score":0.2671000063419342},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.2612000107765198}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/lsp.2025.3639371","is_oa":false,"landing_page_url":"https://doi.org/10.1109/lsp.2025.3639371","pdf_url":null,"source":{"id":"https://openalex.org/S120629676","display_name":"IEEE Signal Processing Letters","issn_l":"1070-9908","issn":["1070-9908","1558-2361"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Signal Processing Letters","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.41469287872314453,"display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":29,"referenced_works":["https://openalex.org/W2143169494","https://openalex.org/W2481240925","https://openalex.org/W2760103357","https://openalex.org/W2890964092","https://openalex.org/W2952218014","https://openalex.org/W2962862718","https://openalex.org/W2971709506","https://openalex.org/W2972541922","https://openalex.org/W2973049979","https://openalex.org/W2973062255","https://openalex.org/W3024869864","https://openalex.org/W3174311593","https://openalex.org/W3176445421","https://openalex.org/W3196698946","https://openalex.org/W3209059054","https://openalex.org/W3209984917","https://openalex.org/W4319862477","https://openalex.org/W4372342485","https://openalex.org/W4385823321","https://openalex.org/W4385823492","https://openalex.org/W4392903977","https://openalex.org/W4401597893","https://openalex.org/W4402112006","https://openalex.org/W4404787841","https://openalex.org/W4405022263","https://openalex.org/W4408346050","https://openalex.org/W4408352294","https://openalex.org/W4410737562","https://openalex.org/W4412536220"],"related_works":[],"abstract_inverted_index":{"Image":[0],"retrieval":[1],"using":[2],"spoken":[3],"language":[4],"cues":[5],"has":[6],"emerged":[7],"as":[8],"a":[9,25,33,49,69],"promising":[10],"direction":[11],"in":[12,18,45,116,136],"multimodal":[13,140],"perception,":[14],"yet":[15],"leveraging":[16],"speech":[17,43,88],"multi-speaker":[19,42],"scenarios":[20],"remains":[21],"challenging.":[22],"We":[23],"propose":[24],"novel":[26],"Target":[27,70],"Speaker":[28,71],"Speech-Image":[29],"Retrieval":[30,72],"task":[31],"and":[32,41,89,102,113,118,126,139],"framework":[34],"that":[35,105],"learns":[36],"the":[37,46,79,85,95],"relationship":[38],"between":[39],"images":[40,93],"signals":[44],"presence":[47],"of":[48,81],"target":[50,63,86],"speaker.":[51],"Our":[52,129],"method":[53,77],"integrates":[54],"pre-trained":[55],"self-supervised":[56],"audio":[57],"encoders":[58],"with":[59,92],"vision":[60],"models":[61],"via":[62],"speaker-aware":[64],"contrastive":[65],"learning,":[66],"conditioned":[67],"on":[68,100],"Extractor":[73],"(TSRE)":[74],"module.":[75],"This":[76],"enables":[78],"extraction":[80],"semantic":[82,97],"content":[83],"from":[84],"speaker's":[87],"aligns":[90],"it":[91],"representing":[94],"corresponding":[96],"meaning.":[98],"Experiments":[99],"SpokenCOCO2Mix":[101],"SpokenCOCO3Mix":[103],"show":[104],"TSRE":[106],"significantly":[107],"outperforms":[108],"existing":[109],"methods,":[110],"achieving":[111],"36.3%":[112],"29.9%":[114],"Recall@1":[115],"2-":[117],"3-speaker":[119],"scenarios,":[120],"respectively-substantial":[121],"improvements":[122],"over":[123],"single-speaker":[124],"baselines":[125],"state-of-the-art":[127],"models.":[128],"approach":[130],"demonstrates":[131],"potential":[132],"for":[133],"real-world":[134],"deployment":[135],"assistive":[137],"robotics":[138],"interaction":[141],"systems.":[142]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-05-28T09:10:13.091523","created_date":"2025-12-03T00:00:00"}
