{"id":"https://openalex.org/W4293202987","doi":"https://doi.org/10.1109/tmm.2022.3171090","title":"A Reconstruction-Based Visual-Acoustic-Semantic Embedding Method for Speech-Image Retrieval","display_name":"A Reconstruction-Based Visual-Acoustic-Semantic Embedding Method for Speech-Image Retrieval","publication_year":2022,"publication_date":"2022-04-28","ids":{"openalex":"https://openalex.org/W4293202987","doi":"https://doi.org/10.1109/tmm.2022.3171090"},"language":"en","primary_location":{"id":"doi:10.1109/tmm.2022.3171090","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tmm.2022.3171090","pdf_url":null,"source":{"id":"https://openalex.org/S137030581","display_name":"IEEE Transactions on Multimedia","issn_l":"1520-9210","issn":["1520-9210","1941-0077"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Multimedia","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101985185","display_name":"Wenlong Cheng","orcid":"https://orcid.org/0000-0001-9619-4935"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"government","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210112150","display_name":"Institute of Automation","ror":"https://ror.org/022c3hy66","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210112150"]},{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Wenlong Cheng","raw_affiliation_strings":["Center for Research on Intelligent Perception and Computing (CRIPAC), National Laboratory of Pattern Recognition (NLPR), Institute of Automation, Chinese Academy of Sciences (CASIA), Beijing, China","School of Artificial Intelligence, University of Chinese Academy of Sciences (UCAS), Beijing, China"],"affiliations":[{"raw_affiliation_string":"Center for Research on Intelligent Perception and Computing (CRIPAC), National Laboratory of Pattern Recognition (NLPR), Institute of Automation, Chinese Academy of Sciences (CASIA), Beijing, China","institution_ids":["https://openalex.org/I4210112150","https://openalex.org/I19820366"]},{"raw_affiliation_string":"School of Artificial Intelligence, University of Chinese Academy of Sciences (UCAS), Beijing, China","institution_ids":["https://openalex.org/I4210165038"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5055653308","display_name":"Wei Tang","orcid":null},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"government","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210112150","display_name":"Institute of Automation","ror":"https://ror.org/022c3hy66","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210112150"]},{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wei Tang","raw_affiliation_strings":["Center for Research on Intelligent Perception and Computing (CRIPAC), National Laboratory of Pattern Recognition (NLPR), Institute of Automation, Chinese Academy of Sciences (CASIA), Beijing, China","School of Artificial Intelligence, University of Chinese Academy of Sciences (UCAS), Beijing, China"],"affiliations":[{"raw_affiliation_string":"Center for Research on Intelligent Perception and Computing (CRIPAC), National Laboratory of Pattern Recognition (NLPR), Institute of Automation, Chinese Academy of Sciences (CASIA), Beijing, China","institution_ids":["https://openalex.org/I4210112150","https://openalex.org/I19820366"]},{"raw_affiliation_string":"School of Artificial Intelligence, University of Chinese Academy of Sciences (UCAS), Beijing, China","institution_ids":["https://openalex.org/I4210165038"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101550628","display_name":"Yan Huang","orcid":"https://orcid.org/0000-0002-8239-7229"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"government","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210112150","display_name":"Institute of Automation","ror":"https://ror.org/022c3hy66","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210112150"]},{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yan Huang","raw_affiliation_strings":["Center for Research on Intelligent Perception and Computing (CRIPAC), National Laboratory of Pattern Recognition (NLPR), Institute of Automation, Chinese Academy of Sciences (CASIA), Beijing, China","School of Artificial Intelligence, University of Chinese Academy of Sciences (UCAS), Beijing, China"],"affiliations":[{"raw_affiliation_string":"Center for Research on Intelligent Perception and Computing (CRIPAC), National Laboratory of Pattern Recognition (NLPR), Institute of Automation, Chinese Academy of Sciences (CASIA), Beijing, China","institution_ids":["https://openalex.org/I4210112150","https://openalex.org/I19820366"]},{"raw_affiliation_string":"School of Artificial Intelligence, University of Chinese Academy of Sciences (UCAS), Beijing, China","institution_ids":["https://openalex.org/I4210165038"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5110713758","display_name":"Yiwen Luo","orcid":null},"institutions":[{"id":"https://openalex.org/I87445476","display_name":"Xi'an Jiaotong University","ror":"https://ror.org/017zhmm22","country_code":"CN","type":"education","lineage":["https://openalex.org/I87445476"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yiwen Luo","raw_affiliation_strings":["Institute of Artificial Intelligence and Robotics (IAIR), Xi&#x2019;an Jiaotong University (XJTU), Xi&#x2019;an, China"],"affiliations":[{"raw_affiliation_string":"Institute of Artificial Intelligence and Robotics (IAIR), Xi&#x2019;an Jiaotong University (XJTU), Xi&#x2019;an, China","institution_ids":["https://openalex.org/I87445476"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5115602506","display_name":"Liang Wang","orcid":"https://orcid.org/0000-0001-5224-8647"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"government","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210097554","display_name":"Center for Excellence in Brain Science and Intelligence Technology","ror":"https://ror.org/00vpwhm04","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210097554"]},{"id":"https://openalex.org/I4210112150","display_name":"Institute of Automation","ror":"https://ror.org/022c3hy66","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210112150"]},{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Liang Wang","raw_affiliation_strings":["Center for Research on Intelligent Perception and Computing (CRIPAC), National Laboratory of Pattern Recognition (NLPR), Institute of Automation, Chinese Academy of Sciences (CASIA), Beijing, China","School of Artificial Intelligence, University of Chinese Academy of Sciences (UCAS), Beijing, China","CAS Center for Excellence in Brain Science and Intelligence Technology, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Center for Research on Intelligent Perception and Computing (CRIPAC), National Laboratory of Pattern Recognition (NLPR), Institute of Automation, Chinese Academy of Sciences (CASIA), Beijing, China","institution_ids":["https://openalex.org/I4210112150","https://openalex.org/I19820366"]},{"raw_affiliation_string":"School of Artificial Intelligence, University of Chinese Academy of Sciences (UCAS), Beijing, China","institution_ids":["https://openalex.org/I4210165038"]},{"raw_affiliation_string":"CAS Center for Excellence in Brain Science and Intelligence Technology, Beijing, China","institution_ids":["https://openalex.org/I4210097554"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5101985185"],"corresponding_institution_ids":["https://openalex.org/I19820366","https://openalex.org/I4210112150","https://openalex.org/I4210165038"],"apc_list":null,"apc_paid":null,"fwci":0.1021,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.38611745,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":"25","issue":null,"first_page":"4067","last_page":"4080"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.998199999332428,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9962999820709229,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8052829504013062},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.5595873594284058},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.49324315786361694},{"id":"https://openalex.org/keywords/relevance","display_name":"Relevance (law)","score":0.49075302481651306},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.47933992743492126},{"id":"https://openalex.org/keywords/ranking","display_name":"Ranking (information retrieval)","score":0.449605256319046},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.4215448498725891},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.36272361874580383},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.3483424186706543}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8052829504013062},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.5595873594284058},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.49324315786361694},{"id":"https://openalex.org/C158154518","wikidata":"https://www.wikidata.org/wiki/Q7310970","display_name":"Relevance (law)","level":2,"score":0.49075302481651306},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.47933992743492126},{"id":"https://openalex.org/C189430467","wikidata":"https://www.wikidata.org/wiki/Q7293293","display_name":"Ranking (information retrieval)","level":2,"score":0.449605256319046},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.4215448498725891},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.36272361874580383},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3483424186706543},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.0},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tmm.2022.3171090","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tmm.2022.3171090","pdf_url":null,"source":{"id":"https://openalex.org/S137030581","display_name":"IEEE Transactions on Multimedia","issn_l":"1520-9210","issn":["1520-9210","1941-0077"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Multimedia","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.5099999904632568,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[{"id":"https://openalex.org/G5273665290","display_name":null,"funder_award_id":"Z201100006820079","funder_id":"https://openalex.org/F4320334978","funder_display_name":"Beijing Nova Program"},{"id":"https://openalex.org/G6431871145","display_name":null,"funder_award_id":"61721004","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G7145271284","display_name":null,"funder_award_id":"61976132","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320334978","display_name":"Beijing Nova Program","ror":"https://ror.org/034k14f91"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":77,"referenced_works":["https://openalex.org/W68733909","https://openalex.org/W1522301498","https://openalex.org/W1527575280","https://openalex.org/W1686810756","https://openalex.org/W1772478219","https://openalex.org/W1895577753","https://openalex.org/W1900086069","https://openalex.org/W1933349210","https://openalex.org/W1965891547","https://openalex.org/W1981109290","https://openalex.org/W2053667957","https://openalex.org/W2064675550","https://openalex.org/W2097117768","https://openalex.org/W2102605133","https://openalex.org/W2112912048","https://openalex.org/W2123024445","https://openalex.org/W2124964692","https://openalex.org/W2134670479","https://openalex.org/W2149557440","https://openalex.org/W2156833313","https://openalex.org/W2157331557","https://openalex.org/W2165022036","https://openalex.org/W2167630669","https://openalex.org/W2183341477","https://openalex.org/W2194775991","https://openalex.org/W2211092169","https://openalex.org/W2296681920","https://openalex.org/W2474574787","https://openalex.org/W2546190447","https://openalex.org/W2556930864","https://openalex.org/W2745461083","https://openalex.org/W2774267535","https://openalex.org/W2796315435","https://openalex.org/W2883311563","https://openalex.org/W2896348597","https://openalex.org/W2897464551","https://openalex.org/W2898008292","https://openalex.org/W2942614241","https://openalex.org/W2949680570","https://openalex.org/W2962753610","https://openalex.org/W2962862718","https://openalex.org/W2962964995","https://openalex.org/W2962968152","https://openalex.org/W2963446712","https://openalex.org/W2963863119","https://openalex.org/W2963902314","https://openalex.org/W2964120214","https://openalex.org/W2964138343","https://openalex.org/W2979304729","https://openalex.org/W2987489329","https://openalex.org/W2988823324","https://openalex.org/W2997531716","https://openalex.org/W2997786074","https://openalex.org/W3003260174","https://openalex.org/W3005881764","https://openalex.org/W3035454331","https://openalex.org/W3035485997","https://openalex.org/W3035605030","https://openalex.org/W3110042533","https://openalex.org/W3116109061","https://openalex.org/W3118548710","https://openalex.org/W3119545279","https://openalex.org/W4243164704","https://openalex.org/W4294576248","https://openalex.org/W4320013936","https://openalex.org/W6631190155","https://openalex.org/W6631516269","https://openalex.org/W6637373629","https://openalex.org/W6676647902","https://openalex.org/W6678470764","https://openalex.org/W6678856934","https://openalex.org/W6679792166","https://openalex.org/W6729559666","https://openalex.org/W6729977899","https://openalex.org/W6738806211","https://openalex.org/W6747225742","https://openalex.org/W6787924464"],"related_works":["https://openalex.org/W2081900870","https://openalex.org/W2068608913","https://openalex.org/W3124914020","https://openalex.org/W2141033859","https://openalex.org/W2085384747","https://openalex.org/W2077542787","https://openalex.org/W2071701083","https://openalex.org/W2150136235","https://openalex.org/W2041353081","https://openalex.org/W2581240705"],"abstract_inverted_index":{"Speech-image":[0],"retrieval":[1,126],"aims":[2],"at":[3],"learning":[4],"the":[5,23,59,64,69,89,109,124,129],"relevance":[6],"between":[7,27,71,92],"image":[8,72],"and":[9,29,73,103,132,135],"speech.Prior":[10],"approaches":[11],"are":[12],"mainly":[13],"based":[14,81],"on":[15,82,123,128],"bi-modal":[16],"contrastive":[17],"learning,":[18],"which":[19,62],"can":[20,86],"not":[21],"alleviate":[22,88],"cross-modal":[24],"heterogeneous":[25,90],"issue":[26,91],"visual":[28],"acoustic":[30,60],"modalities":[31,95],"well.":[32],"To":[33],"address":[34],"this":[35],"issue,":[36],"we":[37,45,76],"propose":[38,46],"a":[39,47,78],"visual-acoustic-semantic":[40],"embedding":[41],"(VASE)":[42],"method.":[43,114],"First,":[44],"tri-modal":[48],"ranking":[49],"loss":[50,80],"by":[51],"taking":[52],"advantage":[53],"of":[54,111],"semantic":[55],"information":[56],"corresponding":[57],"to":[58,67],"data,":[61],"introduces":[63],"auxiliary":[65],"alignment":[66,70],"enhance":[68],"speech.":[74],"Second,":[75],"introduce":[77],"cycle-consistency":[79],"feature":[83],"reconstruction.":[84],"It":[85],"further":[87],"different":[93],"data":[94],"(":[96],"<italic":[97,138],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[98,139],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">e.g.</i>":[99],",":[100,142],"visual-acoustic,":[101],"visual-textual":[102],"acoustic-textual).":[104],"Extensive":[105],"experiments":[106],"have":[107],"demonstrated":[108],"effectiveness":[110],"our":[112,117],"proposed":[113],"In":[115],"addition,":[116],"VASE":[118],"model":[119],"achieves":[120],"state-of-the-art":[121],"performance":[122],"speech-image":[125],"task":[127],"Flickr8K":[130],"[Harwath":[131,137],"Glass,":[133],"2015]s":[134],"Places":[136],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">et":[140],"al.</i>":[141],"2018]":[143],"datasets.":[144]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2026-04-13T07:58:08.660418","created_date":"2025-10-10T00:00:00"}
