{"id":"https://openalex.org/W4372260033","doi":"https://doi.org/10.1109/icassp49357.2023.10095539","title":"Visual Information Matters for ASR Error Correction","display_name":"Visual Information Matters for ASR Error Correction","publication_year":2023,"publication_date":"2023-05-05","ids":{"openalex":"https://openalex.org/W4372260033","doi":"https://doi.org/10.1109/icassp49357.2023.10095539"},"language":"en","primary_location":{"id":"doi:10.1109/icassp49357.2023.10095539","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49357.2023.10095539","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5090307001","display_name":"Vanya Bannihatti Kumar","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Vanya Bannihatti Kumar","raw_affiliation_strings":["ByteDance"],"affiliations":[{"raw_affiliation_string":"ByteDance","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5046413480","display_name":"Shanbo Cheng","orcid":"https://orcid.org/0000-0002-6115-9483"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shanbo Cheng","raw_affiliation_strings":["ByteDance"],"affiliations":[{"raw_affiliation_string":"ByteDance","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5040389488","display_name":"Ningxin Peng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ningxin Peng","raw_affiliation_strings":["ByteDance"],"affiliations":[{"raw_affiliation_string":"ByteDance","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100424305","display_name":"Yuchen Zhang","orcid":"https://orcid.org/0000-0001-8536-7175"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuchen Zhang","raw_affiliation_strings":["ByteDance"],"affiliations":[{"raw_affiliation_string":"ByteDance","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5090307001"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.3497,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.63036369,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":94,"max":96},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9984999895095825,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.998199999332428,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.838262677192688},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.6937402486801147},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.5932583808898926},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5265555381774902},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5033196806907654},{"id":"https://openalex.org/keywords/word","display_name":"Word (group theory)","score":0.4958373010158539},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.48256292939186096},{"id":"https://openalex.org/keywords/word-error-rate","display_name":"Word error rate","score":0.44597533345222473},{"id":"https://openalex.org/keywords/modalities","display_name":"Modalities","score":0.4240036606788635},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.4230212867259979},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3208279609680176}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.838262677192688},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.6937402486801147},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.5932583808898926},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5265555381774902},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5033196806907654},{"id":"https://openalex.org/C90805587","wikidata":"https://www.wikidata.org/wiki/Q10944557","display_name":"Word (group theory)","level":2,"score":0.4958373010158539},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.48256292939186096},{"id":"https://openalex.org/C40969351","wikidata":"https://www.wikidata.org/wiki/Q3516228","display_name":"Word error rate","level":2,"score":0.44597533345222473},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.4240036606788635},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4230212867259979},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3208279609680176},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.0},{"id":"https://openalex.org/C144024400","wikidata":"https://www.wikidata.org/wiki/Q21201","display_name":"Sociology","level":0,"score":0.0},{"id":"https://openalex.org/C120665830","wikidata":"https://www.wikidata.org/wiki/Q14620","display_name":"Optics","level":1,"score":0.0},{"id":"https://openalex.org/C36289849","wikidata":"https://www.wikidata.org/wiki/Q34749","display_name":"Social science","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0},{"id":"https://openalex.org/C13280743","wikidata":"https://www.wikidata.org/wiki/Q131089","display_name":"Geodesy","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp49357.2023.10095539","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49357.2023.10095539","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.5,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":26,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W2783776165","https://openalex.org/W2899274165","https://openalex.org/W2916997151","https://openalex.org/W2962824709","https://openalex.org/W3094502228","https://openalex.org/W3095918555","https://openalex.org/W3101648800","https://openalex.org/W3102017794","https://openalex.org/W3175809709","https://openalex.org/W3196473441","https://openalex.org/W3198694222","https://openalex.org/W4225166170","https://openalex.org/W4225268857","https://openalex.org/W4225323055","https://openalex.org/W4285241608","https://openalex.org/W4285284254","https://openalex.org/W4287173589","https://openalex.org/W4312638101","https://openalex.org/W4385245566","https://openalex.org/W6739901393","https://openalex.org/W6755559483","https://openalex.org/W6784333009","https://openalex.org/W6785749098","https://openalex.org/W6795952400","https://openalex.org/W6810334672"],"related_works":["https://openalex.org/W2378211422","https://openalex.org/W4321353415","https://openalex.org/W2745001401","https://openalex.org/W2185469136","https://openalex.org/W2130974462","https://openalex.org/W2028665553","https://openalex.org/W2086519370","https://openalex.org/W972276598","https://openalex.org/W4246352526","https://openalex.org/W2944691285"],"abstract_inverted_index":{"Aiming":[0],"to":[1,23,77,122,126,159],"improve":[2],"the":[3,44,91,102,139,150,183],"Automatic":[4],"Speech":[5],"Recognition":[6],"(ASR)":[7],"outputs":[8],"with":[9],"a":[10,94],"post-processing":[11],"step,":[12],"ASR":[13],"error":[14],"correction":[15],"(EC)":[16],"techniques":[17],"have":[18],"been":[19,85],"widely":[20],"developed":[21],"due":[22],"their":[24],"efficiency":[25],"in":[26,138,193,205],"using":[27,36,176],"parallel":[28],"text":[29,37,50,147],"data.":[30],"Previous":[31],"works":[32],"mainly":[33,68],"focus":[34],"on":[35],"or/":[38],"and":[39,51,117,146,149,186],"speech":[40,52],"data,":[41],"which":[42,197],"hinders":[43],"performance":[45],"gain":[46],"when":[47,167],"not":[48],"only":[49],"information,":[53,80,148],"but":[54],"other":[55,88],"modalities,":[56],"such":[57],"as":[58,120,178],"visual":[59,79,98,124,168,184,201],"information":[60,99,125,169,185,202],"are":[61,67,153],"critical":[62,204],"for":[63,101],"EC.":[64],"The":[65,87],"challenges":[66],"two":[69],"folds:":[70],"one":[71],"is":[72,89,170,203],"that":[73,90,161,175,200],"previous":[74],"work":[75],"fails":[76],"emphasize":[78],"thus":[81],"rare":[82],"exploration":[83],"has":[84],"studied.":[86],"community":[92],"lacks":[93],"high-quality":[95],"benchmark":[96,131],"where":[97,135],"matters":[100],"EC":[103],"models.":[104],"Therefore,":[105],"this":[106],"paper":[107],"provides":[108],"1)":[109],"simple":[110],"yet":[111],"effective":[112],"methods,":[113],"namely":[114,133],"gated":[115],"fusion":[116],"image":[118],"captions":[119,177],"prompts":[121,179],"incorporate":[123],"help":[127],"EC;":[128],"2)":[129],"large-scale":[130],"datasets,":[132],"Visual-ASR-EC,":[134],"each":[136],"item":[137],"training":[140],"data":[141,152],"consists":[142],"of":[143],"visual,":[144],"speech,":[145],"test":[151],"carefully":[154],"selected":[155],"by":[156,190],"human":[157],"annotators":[158],"ensure":[160],"even":[162],"humans":[163],"could":[164,180],"make":[165],"mistakes":[166],"missing.":[171],"Experimental":[172],"results":[173],"show":[174],"effectively":[181],"use":[182],"surpass":[187],"state-of-the-art":[188],"methods":[189],"upto":[191],"1.2%":[192],"Word":[194],"Error":[195],"Rate(WER),":[196],"also":[198],"indicates":[199],"our":[206],"proposed":[207],"Visual-ASR-EC":[208],"dataset.":[209]},"counts_by_year":[{"year":2024,"cited_by_count":2}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
