{"id":"https://openalex.org/W3015300171","doi":"https://doi.org/10.1109/icassp40776.2020.9053428","title":"Trilingual Semantic Embeddings of Visually Grounded Speech with Self-Attention Mechanisms","display_name":"Trilingual Semantic Embeddings of Visually Grounded Speech with Self-Attention Mechanisms","publication_year":2020,"publication_date":"2020-04-09","ids":{"openalex":"https://openalex.org/W3015300171","doi":"https://doi.org/10.1109/icassp40776.2020.9053428","mag":"3015300171"},"language":"en","primary_location":{"id":"doi:10.1109/icassp40776.2020.9053428","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp40776.2020.9053428","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5062509967","display_name":"Yasunori Ohishi","orcid":"https://orcid.org/0000-0002-7856-248X"},"institutions":[{"id":"https://openalex.org/I4210105847","display_name":"NTT Basic Research Laboratories","ror":"https://ror.org/01m2pas06","country_code":"JP","type":"facility","lineage":["https://openalex.org/I4210105847"]},{"id":"https://openalex.org/I2251713219","display_name":"NTT (Japan)","ror":"https://ror.org/00berct97","country_code":"JP","type":"company","lineage":["https://openalex.org/I2251713219"]}],"countries":["JP"],"is_corresponding":true,"raw_author_name":"Yasunori Ohishi","raw_affiliation_strings":["NTT Corporation,NTT Communication Science Laboratories,Atsugi,Kanagawa,Japan","NTT Communication Science Laboratories, NTT Corporation, Atsugi, Kanagawa, Japan"],"affiliations":[{"raw_affiliation_string":"NTT Corporation,NTT Communication Science Laboratories,Atsugi,Kanagawa,Japan","institution_ids":["https://openalex.org/I4210105847","https://openalex.org/I2251713219"]},{"raw_affiliation_string":"NTT Communication Science Laboratories, NTT Corporation, Atsugi, Kanagawa, Japan","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102267134","display_name":"Akisato Kimura","orcid":null},"institutions":[{"id":"https://openalex.org/I2251713219","display_name":"NTT (Japan)","ror":"https://ror.org/00berct97","country_code":"JP","type":"company","lineage":["https://openalex.org/I2251713219"]},{"id":"https://openalex.org/I4210105847","display_name":"NTT Basic Research Laboratories","ror":"https://ror.org/01m2pas06","country_code":"JP","type":"facility","lineage":["https://openalex.org/I4210105847"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Akisato Kimura","raw_affiliation_strings":["NTT Corporation,NTT Communication Science Laboratories,Atsugi,Kanagawa,Japan","NTT Communication Science Laboratories, NTT Corporation, Atsugi, Kanagawa, Japan"],"affiliations":[{"raw_affiliation_string":"NTT Corporation,NTT Communication Science Laboratories,Atsugi,Kanagawa,Japan","institution_ids":["https://openalex.org/I4210105847","https://openalex.org/I2251713219"]},{"raw_affiliation_string":"NTT Communication Science Laboratories, NTT Corporation, Atsugi, Kanagawa, Japan","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5029349765","display_name":"Takahito Kawanishi","orcid":null},"institutions":[{"id":"https://openalex.org/I4210105847","display_name":"NTT Basic Research Laboratories","ror":"https://ror.org/01m2pas06","country_code":"JP","type":"facility","lineage":["https://openalex.org/I4210105847"]},{"id":"https://openalex.org/I2251713219","display_name":"NTT (Japan)","ror":"https://ror.org/00berct97","country_code":"JP","type":"company","lineage":["https://openalex.org/I2251713219"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Takahito Kawanishi","raw_affiliation_strings":["NTT Corporation,NTT Communication Science Laboratories,Atsugi,Kanagawa,Japan","NTT Communication Science Laboratories, NTT Corporation, Atsugi, Kanagawa, Japan"],"affiliations":[{"raw_affiliation_string":"NTT Corporation,NTT Communication Science Laboratories,Atsugi,Kanagawa,Japan","institution_ids":["https://openalex.org/I4210105847","https://openalex.org/I2251713219"]},{"raw_affiliation_string":"NTT Communication Science Laboratories, NTT Corporation, Atsugi, Kanagawa, Japan","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5061465935","display_name":"Kunio Kashino","orcid":null},"institutions":[{"id":"https://openalex.org/I4210105847","display_name":"NTT Basic Research Laboratories","ror":"https://ror.org/01m2pas06","country_code":"JP","type":"facility","lineage":["https://openalex.org/I4210105847"]},{"id":"https://openalex.org/I2251713219","display_name":"NTT (Japan)","ror":"https://ror.org/00berct97","country_code":"JP","type":"company","lineage":["https://openalex.org/I2251713219"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Kunio Kashino","raw_affiliation_strings":["NTT Corporation,NTT Communication Science Laboratories,Atsugi,Kanagawa,Japan","NTT Communication Science Laboratories, NTT Corporation, Atsugi, Kanagawa, Japan"],"affiliations":[{"raw_affiliation_string":"NTT Corporation,NTT Communication Science Laboratories,Atsugi,Kanagawa,Japan","institution_ids":["https://openalex.org/I4210105847","https://openalex.org/I2251713219"]},{"raw_affiliation_string":"NTT Communication Science Laboratories, NTT Corporation, Atsugi, Kanagawa, Japan","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5004717608","display_name":"David Harwath","orcid":"https://orcid.org/0000-0003-0206-0253"},"institutions":[{"id":"https://openalex.org/I4210164862","display_name":"Artificial Intelligence in Medicine (Canada)","ror":"https://ror.org/05p590m36","country_code":"CA","type":"company","lineage":["https://openalex.org/I4210164862"]}],"countries":["CA"],"is_corresponding":false,"raw_author_name":"David Harwath","raw_affiliation_strings":["MIT Computer Science and Artificial Intelligence Laboratory,Cambridge,MA,USA","MIT Computer Science and Artificial Intelligence Laboratory, Cambridge, MA, USA"],"affiliations":[{"raw_affiliation_string":"MIT Computer Science and Artificial Intelligence Laboratory,Cambridge,MA,USA","institution_ids":["https://openalex.org/I4210164862"]},{"raw_affiliation_string":"MIT Computer Science and Artificial Intelligence Laboratory, Cambridge, MA, USA","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5112758056","display_name":"James Glass","orcid":"https://orcid.org/0000-0002-3097-360X"},"institutions":[{"id":"https://openalex.org/I4210164862","display_name":"Artificial Intelligence in Medicine (Canada)","ror":"https://ror.org/05p590m36","country_code":"CA","type":"company","lineage":["https://openalex.org/I4210164862"]}],"countries":["CA"],"is_corresponding":false,"raw_author_name":"James Glass","raw_affiliation_strings":["MIT Computer Science and Artificial Intelligence Laboratory,Cambridge,MA,USA","MIT Computer Science and Artificial Intelligence Laboratory, Cambridge, MA, USA"],"affiliations":[{"raw_affiliation_string":"MIT Computer Science and Artificial Intelligence Laboratory,Cambridge,MA,USA","institution_ids":["https://openalex.org/I4210164862"]},{"raw_affiliation_string":"MIT Computer Science and Artificial Intelligence Laboratory, Cambridge, MA, USA","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5062509967"],"corresponding_institution_ids":["https://openalex.org/I2251713219","https://openalex.org/I4210105847"],"apc_list":null,"apc_paid":null,"fwci":2.0517,"has_fulltext":false,"cited_by_count":22,"citation_normalized_percentile":{"value":0.8889225,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"4352","last_page":"4356"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9988999962806702,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9955999851226807,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8261035084724426},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.7251431345939636},{"id":"https://openalex.org/keywords/hindi","display_name":"Hindi","score":0.710393488407135},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.6607551574707031},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.5434364080429077},{"id":"https://openalex.org/keywords/mechanism","display_name":"Mechanism (biology)","score":0.49639660120010376},{"id":"https://openalex.org/keywords/word","display_name":"Word (group theory)","score":0.4542262852191925},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.4469816982746124},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.27129846811294556}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8261035084724426},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.7251431345939636},{"id":"https://openalex.org/C519982507","wikidata":"https://www.wikidata.org/wiki/Q1568","display_name":"Hindi","level":2,"score":0.710393488407135},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6607551574707031},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.5434364080429077},{"id":"https://openalex.org/C89611455","wikidata":"https://www.wikidata.org/wiki/Q6804646","display_name":"Mechanism (biology)","level":2,"score":0.49639660120010376},{"id":"https://openalex.org/C90805587","wikidata":"https://www.wikidata.org/wiki/Q10944557","display_name":"Word (group theory)","level":2,"score":0.4542262852191925},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4469816982746124},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.27129846811294556},{"id":"https://openalex.org/C111472728","wikidata":"https://www.wikidata.org/wiki/Q9471","display_name":"Epistemology","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp40776.2020.9053428","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp40776.2020.9053428","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.8100000023841858,"display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":51,"referenced_works":["https://openalex.org/W30845872","https://openalex.org/W1686810756","https://openalex.org/W1889081078","https://openalex.org/W2102605133","https://openalex.org/W2112912048","https://openalex.org/W2114347655","https://openalex.org/W2134670479","https://openalex.org/W2295297373","https://openalex.org/W2556930864","https://openalex.org/W2573834658","https://openalex.org/W2586148577","https://openalex.org/W2611029872","https://openalex.org/W2796315435","https://openalex.org/W2804078698","https://openalex.org/W2886641317","https://openalex.org/W2899463613","https://openalex.org/W2906407728","https://openalex.org/W2920166246","https://openalex.org/W2938991416","https://openalex.org/W2962732076","https://openalex.org/W2962753610","https://openalex.org/W2962756039","https://openalex.org/W2962835968","https://openalex.org/W2962862718","https://openalex.org/W2962960500","https://openalex.org/W2962978519","https://openalex.org/W2963115079","https://openalex.org/W2963330681","https://openalex.org/W2963525826","https://openalex.org/W2963778889","https://openalex.org/W2963902314","https://openalex.org/W2964001192","https://openalex.org/W2964115348","https://openalex.org/W2964249784","https://openalex.org/W2964350391","https://openalex.org/W2965538726","https://openalex.org/W2972345028","https://openalex.org/W2972892814","https://openalex.org/W2988907666","https://openalex.org/W4206865574","https://openalex.org/W4293665662","https://openalex.org/W6637373629","https://openalex.org/W6639432524","https://openalex.org/W6675026286","https://openalex.org/W6676647902","https://openalex.org/W6679792166","https://openalex.org/W6729831399","https://openalex.org/W6729977899","https://openalex.org/W6750651883","https://openalex.org/W6752378368","https://openalex.org/W6761029826"],"related_works":["https://openalex.org/W2096720918","https://openalex.org/W3169305685","https://openalex.org/W3152759877","https://openalex.org/W2949267551","https://openalex.org/W2788784374","https://openalex.org/W2565299779","https://openalex.org/W4297801177","https://openalex.org/W2998419962","https://openalex.org/W3154878020","https://openalex.org/W2963667932"],"abstract_inverted_index":{"We":[0,102],"propose":[1],"a":[2,55,81,123],"trilingual":[3],"semantic":[4],"embedding":[5,100],"model":[6,31,86,146],"that":[7,104,119,139],"associates":[8],"visual":[9],"objects":[10],"in":[11,22,130],"images":[12],"with":[13,94],"segments":[14],"of":[15,58,122,132],"speech":[16,60],"signals":[17],"corresponding":[18],"to":[19,87,144],"spoken":[20,63,91],"words":[21],"an":[23],"unsupervised":[24],"manner.":[25],"Unlike":[26],"the":[27,43,47,85,90,95,99,105,120,127,140,145],"existing":[28,48],"models,":[29],"our":[30],"incorporates":[32],"three":[33],"different":[34],"languages,":[35],"namely,":[36],"English,":[37],"Hindi,":[38],"and":[39,50,53,134,138],"Japanese.":[40],"To":[41],"build":[42],"model,":[44],"we":[45,79],"used":[46],"English":[49],"Hindi":[51],"datasets":[52],"collected":[54],"new":[56],"corpus":[57],"Japanese":[59],"captions.":[61],"These":[62],"captions":[64,92],"are":[65],"spontaneous":[66],"descriptions":[67],"by":[68],"individual":[69],"speakers,":[70],"rather":[71],"than":[72],"readings":[73],"based":[74],"on":[75],"prepared":[76],"transcripts.":[77],"Therefore,":[78],"introduce":[80],"self-attention":[82,106,141],"mechanism":[83,107,142],"into":[84,98],"better":[88],"map":[89],"associated":[93],"same":[96],"image":[97],"space.":[101],"hope":[103],"efficiently":[108],"captures":[109],"relationships":[110],"between":[111],"widely":[112],"separated":[113],"word-like":[114],"segments.":[115],"Experimental":[116],"results":[117],"show":[118],"introduction":[121],"third":[124],"language":[125],"improves":[126],"average":[128],"performance":[129],"terms":[131],"cross-modal":[133],"cross-lingual":[135],"retrieval":[136],"accuracy,":[137],"added":[143],"works":[147],"effectively.":[148]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2023,"cited_by_count":4},{"year":2022,"cited_by_count":5},{"year":2021,"cited_by_count":8},{"year":2020,"cited_by_count":4}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
