{"id":"https://openalex.org/W4385823193","doi":"https://doi.org/10.21437/interspeech.2023-780","title":"Incorporating Ultrasound Tongue Images for Audio-Visual Speech Enhancement through Knowledge Distillation","display_name":"Incorporating Ultrasound Tongue Images for Audio-Visual Speech Enhancement through Knowledge Distillation","publication_year":2023,"publication_date":"2023-08-14","ids":{"openalex":"https://openalex.org/W4385823193","doi":"https://doi.org/10.21437/interspeech.2023-780"},"language":"en","primary_location":{"id":"doi:10.21437/interspeech.2023-780","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2023-780","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"INTERSPEECH 2023","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5066498315","display_name":"Rui-Chen Zheng","orcid":"https://orcid.org/0009-0000-8074-9553"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Rui-Chen Zheng","raw_affiliation_strings":["National Engineering Research Center of Speech and Language Information Processing, University of Science and Technology of China, Hefei, P. R. China"],"affiliations":[{"raw_affiliation_string":"National Engineering Research Center of Speech and Language Information Processing, University of Science and Technology of China, Hefei, P. R. China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5014746276","display_name":"Yang Ai","orcid":"https://orcid.org/0009-0006-0157-4980"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yang Ai","raw_affiliation_strings":["National Engineering Research Center of Speech and Language Information Processing, University of Science and Technology of China, Hefei, P. R. China"],"affiliations":[{"raw_affiliation_string":"National Engineering Research Center of Speech and Language Information Processing, University of Science and Technology of China, Hefei, P. R. China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5059767940","display_name":"Zhen-Hua Ling","orcid":"https://orcid.org/0000-0001-7853-5273"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhen-Hua Ling","raw_affiliation_strings":["National Engineering Research Center of Speech and Language Information Processing, University of Science and Technology of China, Hefei, P. R. China"],"affiliations":[{"raw_affiliation_string":"National Engineering Research Center of Speech and Language Information Processing, University of Science and Technology of China, Hefei, P. R. China","institution_ids":["https://openalex.org/I126520041"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5066498315"],"corresponding_institution_ids":["https://openalex.org/I126520041"],"apc_list":null,"apc_paid":null,"fwci":1.2226,"has_fulltext":false,"cited_by_count":6,"citation_normalized_percentile":{"value":0.79805947,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":95,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"844","last_page":"848"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9778000116348267,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9778000116348267,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7773497104644775},{"id":"https://openalex.org/keywords/audio-visual","display_name":"Audio visual","score":0.6008216142654419},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5394943356513977},{"id":"https://openalex.org/keywords/tongue","display_name":"Tongue","score":0.49498695135116577},{"id":"https://openalex.org/keywords/distillation","display_name":"Distillation","score":0.45605039596557617},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.37024790048599243},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.33421650528907776}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7773497104644775},{"id":"https://openalex.org/C3017588708","wikidata":"https://www.wikidata.org/wiki/Q758901","display_name":"Audio visual","level":2,"score":0.6008216142654419},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5394943356513977},{"id":"https://openalex.org/C2779744641","wikidata":"https://www.wikidata.org/wiki/Q9614","display_name":"Tongue","level":2,"score":0.49498695135116577},{"id":"https://openalex.org/C204030448","wikidata":"https://www.wikidata.org/wiki/Q101017","display_name":"Distillation","level":2,"score":0.45605039596557617},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.37024790048599243},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.33421650528907776},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C178790620","wikidata":"https://www.wikidata.org/wiki/Q11351","display_name":"Organic chemistry","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.21437/interspeech.2023-780","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2023-780","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"INTERSPEECH 2023","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.46000000834465027,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":29,"referenced_works":["https://openalex.org/W1901129140","https://openalex.org/W1987337335","https://openalex.org/W1995440941","https://openalex.org/W2042407743","https://openalex.org/W2044893557","https://openalex.org/W2089240210","https://openalex.org/W2096779346","https://openalex.org/W2137400100","https://openalex.org/W2747874407","https://openalex.org/W2801554275","https://openalex.org/W2889413431","https://openalex.org/W2962780374","https://openalex.org/W2962946126","https://openalex.org/W2963082324","https://openalex.org/W2964171275","https://openalex.org/W2979074722","https://openalex.org/W2982242214","https://openalex.org/W3097945073","https://openalex.org/W3121016465","https://openalex.org/W3136499730","https://openalex.org/W3143787022","https://openalex.org/W3174954176","https://openalex.org/W3196749493","https://openalex.org/W4232282348","https://openalex.org/W4283782204","https://openalex.org/W4286856928","https://openalex.org/W4287392530","https://openalex.org/W4296068808","https://openalex.org/W4300860529"],"related_works":["https://openalex.org/W2355862304","https://openalex.org/W2356108042","https://openalex.org/W2030250808","https://openalex.org/W2376796979","https://openalex.org/W2379418341","https://openalex.org/W2380054981","https://openalex.org/W2393110101","https://openalex.org/W2379285345","https://openalex.org/W2331065455","https://openalex.org/W2372054075"],"abstract_inverted_index":{"Audio-visual":[0],"speech":[1,8,28,64,74,89,100,111],"enhancement":[2,65,75,101],"(AV-SE)":[3],"aims":[4],"to":[5,22,37,50,68,96],"enhance":[6],"degraded":[7],"along":[9],"with":[10],"extra":[11],"visual":[12],"information":[13],"such":[14],"as":[15],"lip":[16],"videos,":[17],"and":[18,85,117],"has":[19],"been":[20],"shown":[21],"be":[23],"more":[24],"effective":[25],"than":[26],"audio-only":[27],"enhancement.This":[29],"paper":[30],"proposes":[31],"further":[32],"incorporating":[33],"ultrasound":[34,56,126],"tongue":[35,57,127],"images":[36,58],"improve":[38],"lip-based":[39],"AV-SE":[40],"systems'":[41],"performance.Knowledge":[42],"distillation":[43],"is":[44],"employed":[45],"at":[46],"the":[47,52,83,88,92,97,123],"training":[48],"stage":[49],"address":[51],"challenge":[53],"of":[54,87,109,125],"acquiring":[55],"during":[59],"inference,":[60],"enabling":[61],"an":[62],"audio-lip":[63,99],"student":[66],"model":[67],"learn":[69],"from":[70,122],"a":[71],"pre-trained":[72],"audiolip-tongue":[73],"teacher":[76],"model.Experimental":[77],"results":[78],"demonstrate":[79],"significant":[80],"improvements":[81],"in":[82],"quality":[84],"intelligibility":[86],"enhanced":[90],"by":[91],"proposed":[93],"method":[94],"compared":[95],"traditional":[98],"baselines.Further":[102],"analysis":[103],"using":[104],"phone":[105],"error":[106],"rates":[107],"(PER)":[108],"automatic":[110],"recognition":[112],"(ASR)":[113],"shows":[114],"that":[115],"palatal":[116],"velar":[118],"consonants":[119],"benefit":[120],"most":[121],"introduction":[124],"images.":[128]},"counts_by_year":[{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":4}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
