{"id":"https://openalex.org/W4372347650","doi":"https://doi.org/10.1109/icassp49357.2023.10096699","title":"Leveraging Phone-Level Linguistic-Acoustic Similarity For Utterance-Level Pronunciation Scoring","display_name":"Leveraging Phone-Level Linguistic-Acoustic Similarity For Utterance-Level Pronunciation Scoring","publication_year":2023,"publication_date":"2023-05-05","ids":{"openalex":"https://openalex.org/W4372347650","doi":"https://doi.org/10.1109/icassp49357.2023.10096699"},"language":"en","primary_location":{"id":"doi:10.1109/icassp49357.2023.10096699","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49357.2023.10096699","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100763568","display_name":"Wei Liu","orcid":"https://orcid.org/0000-0001-6351-9019"},"institutions":[{"id":"https://openalex.org/I177725633","display_name":"Chinese University of Hong Kong","ror":"https://ror.org/00t33hh48","country_code":"CN","type":"education","lineage":["https://openalex.org/I177725633"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Wei Liu","raw_affiliation_strings":["The Chinese University of Hong Kong,Department of Electronic Engineering","Department of Electronic Engineering, The Chinese University of Hong Kong"],"affiliations":[{"raw_affiliation_string":"The Chinese University of Hong Kong,Department of Electronic Engineering","institution_ids":["https://openalex.org/I177725633"]},{"raw_affiliation_string":"Department of Electronic Engineering, The Chinese University of Hong Kong","institution_ids":["https://openalex.org/I177725633"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5074965408","display_name":"Kaiqi Fu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kaiqi Fu","raw_affiliation_strings":["ByteDance"],"affiliations":[{"raw_affiliation_string":"ByteDance","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100959470","display_name":"Xiaohai Tian","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiaohai Tian","raw_affiliation_strings":["ByteDance"],"affiliations":[{"raw_affiliation_string":"ByteDance","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5006589840","display_name":"Shuju Shi","orcid":"https://orcid.org/0000-0002-8349-1145"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shuju Shi","raw_affiliation_strings":["ByteDance"],"affiliations":[{"raw_affiliation_string":"ByteDance","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100318020","display_name":"Wei Li","orcid":"https://orcid.org/0000-0001-7824-4839"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wei Li","raw_affiliation_strings":["ByteDance"],"affiliations":[{"raw_affiliation_string":"ByteDance","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5110851569","display_name":"Zejun Ma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zejun Ma","raw_affiliation_strings":["ByteDance"],"affiliations":[{"raw_affiliation_string":"ByteDance","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5001795601","display_name":"Tan Lee","orcid":"https://orcid.org/0000-0002-7089-3436"},"institutions":[{"id":"https://openalex.org/I177725633","display_name":"Chinese University of Hong Kong","ror":"https://ror.org/00t33hh48","country_code":"CN","type":"education","lineage":["https://openalex.org/I177725633"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Tan Lee","raw_affiliation_strings":["The Chinese University of Hong Kong,Department of Electronic Engineering","Department of Electronic Engineering, The Chinese University of Hong Kong"],"affiliations":[{"raw_affiliation_string":"The Chinese University of Hong Kong,Department of Electronic Engineering","institution_ids":["https://openalex.org/I177725633"]},{"raw_affiliation_string":"Department of Electronic Engineering, The Chinese University of Hong Kong","institution_ids":["https://openalex.org/I177725633"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5100763568"],"corresponding_institution_ids":["https://openalex.org/I177725633"],"apc_list":null,"apc_paid":null,"fwci":1.0468,"has_fulltext":false,"cited_by_count":6,"citation_normalized_percentile":{"value":0.80775567,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":95,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9980999827384949,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9976999759674072,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/pronunciation","display_name":"Pronunciation","score":0.8873171210289001},{"id":"https://openalex.org/keywords/phone","display_name":"Phone","score":0.7895374298095703},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7722967863082886},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.622870683670044},{"id":"https://openalex.org/keywords/utterance","display_name":"Utterance","score":0.5522072315216064},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.5434825420379639},{"id":"https://openalex.org/keywords/cosine-similarity","display_name":"Cosine similarity","score":0.4873736500740051},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4605312645435333},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.4564644396305084},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.4392927587032318},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.2880982756614685},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.1866910755634308}],"concepts":[{"id":"https://openalex.org/C2780844864","wikidata":"https://www.wikidata.org/wiki/Q184377","display_name":"Pronunciation","level":2,"score":0.8873171210289001},{"id":"https://openalex.org/C2778707766","wikidata":"https://www.wikidata.org/wiki/Q202064","display_name":"Phone","level":2,"score":0.7895374298095703},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7722967863082886},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.622870683670044},{"id":"https://openalex.org/C2775852435","wikidata":"https://www.wikidata.org/wiki/Q258403","display_name":"Utterance","level":2,"score":0.5522072315216064},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.5434825420379639},{"id":"https://openalex.org/C2780762811","wikidata":"https://www.wikidata.org/wiki/Q1784941","display_name":"Cosine similarity","level":3,"score":0.4873736500740051},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4605312645435333},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4564644396305084},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.4392927587032318},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.2880982756614685},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.1866910755634308},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp49357.2023.10096699","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49357.2023.10096699","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.800000011920929,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":31,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W1522301498","https://openalex.org/W2067517679","https://openalex.org/W2091856355","https://openalex.org/W2134124280","https://openalex.org/W2139008940","https://openalex.org/W2295721380","https://openalex.org/W2397168380","https://openalex.org/W2419604081","https://openalex.org/W2575689684","https://openalex.org/W2745877834","https://openalex.org/W2768986418","https://openalex.org/W2770494505","https://openalex.org/W2963308316","https://openalex.org/W2972347929","https://openalex.org/W3096544436","https://openalex.org/W3096674206","https://openalex.org/W3123665557","https://openalex.org/W3196891430","https://openalex.org/W3197742413","https://openalex.org/W3197938691","https://openalex.org/W3198060681","https://openalex.org/W4221151578","https://openalex.org/W4224861836","https://openalex.org/W4224928163","https://openalex.org/W4299785905","https://openalex.org/W4312069033","https://openalex.org/W4385245566","https://openalex.org/W6640631609","https://openalex.org/W6739901393","https://openalex.org/W7038589486"],"related_works":["https://openalex.org/W2183593636","https://openalex.org/W2529301793","https://openalex.org/W2350724007","https://openalex.org/W2384121599","https://openalex.org/W2355751417","https://openalex.org/W2038083449","https://openalex.org/W2423284978","https://openalex.org/W2083922162","https://openalex.org/W2000075989","https://openalex.org/W4220683390"],"abstract_inverted_index":{"Recent":[0],"studies":[1],"on":[2,139],"pronunciation":[3,32,40,65,90,113,184],"scoring":[4],"have":[5],"explored":[6],"the":[7,34,38,55,68,74,105,133,140,145,150,153,168,173],"effect":[8],"of":[9,26,33,57,89,104,121,182],"introducing":[10],"phone":[11,28,36,79,122,156,169],"embeddings":[12,125,157,170],"as":[13,37,185],"reference":[14,27,63,78],"pronunciation,":[15],"but":[16],"mostly":[17],"in":[18,172],"an":[19],"implicit":[20],"manner,":[21],"i.e.,":[22],"addition":[23],"or":[24,161],"concatenation":[25],"embedding":[29,80],"and":[30,81,155],"actual":[31],"target":[35],"phone-level":[39,87],"quality":[41],"representation.":[42],"In":[43],"this":[44,98],"paper,":[45],"we":[46],"propose":[47],"to":[48,52,96,117,131,178],"use":[49],"linguistic-acoustic":[50,180],"similarity":[51,76,129],"explicitly":[53],"measure":[54],"deviation":[56,69],"non-native":[58,141],"production":[59],"from":[60],"its":[61],"native":[62,183],"for":[64,101],"assessment.":[66],"Specifically,":[67],"is":[70,94,115],"first":[71],"estimated":[72],"by":[73],"cosine":[75],"between":[77],"corresponding":[82],"acoustic":[83,124,154],"embedding.":[84],"Next,":[85],"a":[86,110,119],"Goodness":[88],"(GOP)":[91],"pre-training":[92],"stage":[93],"introduced":[95],"guide":[97],"similarity-based":[99],"learning":[100],"better":[102],"initialization":[103],"aforementioned":[106],"two":[107],"embeddings.":[108],"Finally,":[109],"transformer-based":[111],"hierarchical":[112],"scorer":[114],"used":[116],"map":[118],"sequence":[120],"embeddings,":[123],"along":[126],"with":[127],"their":[128],"measures":[130],"predict":[132],"final":[134],"utterance-level":[135],"score.":[136],"Experimental":[137],"results":[138],"databases":[142],"suggest":[143],"that":[144,167],"proposed":[146,174],"system":[147],"significantly":[148],"outperforms":[149],"baselines,":[151],"where":[152],"are":[158,176],"simply":[159],"added":[160],"concatenated.":[162],"A":[163],"further":[164],"examination":[165],"shows":[166],"learned":[171],"approach":[175],"able":[177],"capture":[179],"attributes":[181],"references.":[186]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":2},{"year":2023,"cited_by_count":3}],"updated_date":"2026-04-01T17:29:45.350535","created_date":"2025-10-10T00:00:00"}
