{"id":"https://openalex.org/W4403791377","doi":"https://doi.org/10.1145/3664647.3681665","title":"CIEASR:Contextual Image-Enhanced Automatic Speech Recognition for Improved Homophone Discrimination","display_name":"CIEASR:Contextual Image-Enhanced Automatic Speech Recognition for Improved Homophone Discrimination","publication_year":2024,"publication_date":"2024-10-26","ids":{"openalex":"https://openalex.org/W4403791377","doi":"https://doi.org/10.1145/3664647.3681665"},"language":"en","primary_location":{"id":"doi:10.1145/3664647.3681665","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3664647.3681665","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 32nd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3664647.3681665","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Ziyi Wang","orcid":"https://orcid.org/0009-0006-5863-303X"},"institutions":[{"id":"https://openalex.org/I4210112150","display_name":"Institute of Automation","ror":"https://ror.org/022c3hy66","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210112150"]},{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Ziyi Wang","raw_affiliation_strings":["Institute of Automation, Chinese Academy of Sciences &amp; School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0006-5863-303X","affiliations":[{"raw_affiliation_string":"Institute of Automation, Chinese Academy of Sciences &amp; School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210112150","https://openalex.org/I4210165038"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5108356774","display_name":"Yiming Rong","orcid":"https://orcid.org/0000-0003-4661-5601"},"institutions":[{"id":"https://openalex.org/I4210112150","display_name":"Institute of Automation","ror":"https://ror.org/022c3hy66","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210112150"]},{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yiming Rong","raw_affiliation_strings":["Institute of Automation, Chinese Academy of Sciences &amp; School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0003-4661-5601","affiliations":[{"raw_affiliation_string":"Institute of Automation, Chinese Academy of Sciences &amp; School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210112150","https://openalex.org/I4210165038"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102575213","display_name":"D. A. Jiang","orcid":null},"institutions":[{"id":"https://openalex.org/I4210112150","display_name":"Institute of Automation","ror":"https://ror.org/022c3hy66","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210112150"]},{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Deyang Jiang","raw_affiliation_strings":["Institute of Automation, Chinese Academy of Sciences &amp; School of Future Technology, University of Chinese Academy of Sciences, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0000-4471-6965","affiliations":[{"raw_affiliation_string":"Institute of Automation, Chinese Academy of Sciences &amp; School of Future Technology, University of Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210112150","https://openalex.org/I4210165038"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5015180455","display_name":"Haoran Wu","orcid":"https://orcid.org/0000-0003-0406-8881"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"government","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210094879","display_name":"Shandong Institute of Automation","ror":"https://ror.org/00qdtba35","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210094879","https://openalex.org/I4210142748"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Haoran Wu","raw_affiliation_strings":["Institute of Automation, Chinese Academy of Sciences, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0003-0406-8881","affiliations":[{"raw_affiliation_string":"Institute of Automation, Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210094879","https://openalex.org/I19820366"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101400153","display_name":"Shiyu Zhou","orcid":"https://orcid.org/0000-0002-6889-0316"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"government","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210094879","display_name":"Shandong Institute of Automation","ror":"https://ror.org/00qdtba35","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210094879","https://openalex.org/I4210142748"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shiyu Zhou","raw_affiliation_strings":["Institute of Automation, Chinese Academy of Sciences, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0002-6889-0316","affiliations":[{"raw_affiliation_string":"Institute of Automation, Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210094879","https://openalex.org/I19820366"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5108642431","display_name":"Bo Xu","orcid":"https://orcid.org/0000-0002-1111-1529"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"government","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210094879","display_name":"Shandong Institute of Automation","ror":"https://ror.org/00qdtba35","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210094879","https://openalex.org/I4210142748"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Bo Xu","raw_affiliation_strings":["Institute of Automation, Chinese Academy of Sciences, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0002-1111-1529","affiliations":[{"raw_affiliation_string":"Institute of Automation, Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210094879","https://openalex.org/I19820366"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I4210112150","https://openalex.org/I4210165038"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.16330196,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"915","last_page":"924"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7649648189544678},{"id":"https://openalex.org/keywords/homophone","display_name":"Homophone","score":0.7476147413253784},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.7195371985435486},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5071194171905518},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.4269675612449646},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.07644763588905334}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7649648189544678},{"id":"https://openalex.org/C160253069","wikidata":"https://www.wikidata.org/wiki/Q221079","display_name":"Homophone","level":2,"score":0.7476147413253784},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.7195371985435486},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5071194171905518},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.4269675612449646},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.07644763588905334},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3664647.3681665","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3664647.3681665","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 32nd ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3664647.3681665","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3664647.3681665","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 32nd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"sustainable_development_goals":[{"display_name":"Reduced inequalities","id":"https://metadata.un.org/sdg/10","score":0.6000000238418579}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":23,"referenced_works":["https://openalex.org/W1974379374","https://openalex.org/W2002591263","https://openalex.org/W2586850765","https://openalex.org/W2593463961","https://openalex.org/W2886945201","https://openalex.org/W2962858109","https://openalex.org/W3004413500","https://openalex.org/W3015678833","https://openalex.org/W3016011581","https://openalex.org/W3035042697","https://openalex.org/W3081492798","https://openalex.org/W3101631197","https://openalex.org/W3148906368","https://openalex.org/W3160413945","https://openalex.org/W3162293946","https://openalex.org/W3185341429","https://openalex.org/W3209059054","https://openalex.org/W4225832925","https://openalex.org/W4226033575","https://openalex.org/W4233749039","https://openalex.org/W4307286264","https://openalex.org/W4312651322","https://openalex.org/W4382457407"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2358775429","https://openalex.org/W1567440624","https://openalex.org/W2364211260","https://openalex.org/W1977345729","https://openalex.org/W2105439214","https://openalex.org/W2079548008","https://openalex.org/W2030399736"],"abstract_inverted_index":{"Automatic":[0,91],"Speech":[1,92],"Recognition":[2],"(ASR)":[3],"models":[4,22],"pre-trained":[5,20,80],"on":[6,152,163],"large-scale":[7],"speech":[8,81,97],"datasets":[9],"have":[10,29,36],"achieved":[11],"significant":[12],"breakthroughs":[13],"compared":[14],"with":[15,74],"traditional":[16],"methods.":[17],"However,":[18],"mainstream":[19],"ASR":[21],"encounter":[23],"challenges":[24],"in":[25,54,70],"distinguishing":[26],"homophones,":[27],"which":[28],"close":[30],"or":[31],"identical":[32],"pronunciations.":[33],"Previous":[34],"studies":[35],"introduced":[37],"visual":[38],"auxiliary":[39],"cues":[40],"to":[41,78,113,136,167],"address":[42],"this":[43,84],"challenge,":[44],"yet":[45],"the":[46,59,62,79,125,137,158],"sophisticated":[47],"use":[48],"of":[49,66,140],"lip":[50],"movements":[51],"falls":[52],"short":[53],"correcting":[55],"homophone":[56,115],"errors.":[57,116],"On":[58],"other":[60],"hand,":[61],"fusion":[63,105],"and":[64,123,143,154],"utilization":[65],"scene":[67,108,133],"images":[68,109,134],"remain":[69],"an":[71],"exploratory":[72],"stage,":[73],"performance":[75],"still":[76],"inferior":[77],"model.":[82],"In":[83],"paper,":[85],"we":[86,121],"introduce":[87],"CIEASR":[88,148],"(Contextual":[89],"Image-Enhanced":[90],"Recognition),":[93],"a":[94,102],"novel":[95],"multimodal":[96],"recognition":[98,139],"model":[99],"that":[100,132],"incorporates":[101],"new":[103],"cue":[104],"method,":[106],"using":[107],"as":[110],"soft":[111],"prompts":[112],"correct":[114],"To":[117],"mitigate":[118],"data":[119],"scarcity,":[120],"refine":[122],"expand":[124],"VSDial":[126,153,164],"dataset":[127],"for":[128],"extensive":[129],"experiments,":[130],"illustrating":[131],"contribute":[135],"accurate":[138],"entity":[141],"nouns":[142],"personal":[144],"pronouns.":[145],"Our":[146],"proposed":[147],"achieves":[149],"state-of-the-art":[150],"results":[151],"Flickr8K,":[155],"significantly":[156],"reducing":[157],"Character":[159],"Error":[160],"Rate":[161],"(CER)":[162],"from":[165],"3.61%":[166],"0.92%.":[168]},"counts_by_year":[],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-10T00:00:00"}
