{"id":"https://openalex.org/W4416799443","doi":"https://doi.org/10.1109/apsipaasc65261.2025.11249303","title":"Speech Intelligibility Assessment with Uncertainty-Aware Whisper Embeddings and sLSTM","display_name":"Speech Intelligibility Assessment with Uncertainty-Aware Whisper Embeddings and sLSTM","publication_year":2025,"publication_date":"2025-10-22","ids":{"openalex":"https://openalex.org/W4416799443","doi":"https://doi.org/10.1109/apsipaasc65261.2025.11249303"},"language":null,"primary_location":{"id":"doi:10.1109/apsipaasc65261.2025.11249303","is_oa":false,"landing_page_url":"https://doi.org/10.1109/apsipaasc65261.2025.11249303","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 Asia Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5030281426","display_name":"Ryandhimas E. Zezario","orcid":"https://orcid.org/0000-0001-7319-8263"},"institutions":[{"id":"https://openalex.org/I4210086894","display_name":"Research Center for Information Technology Innovation, Academia Sinica","ror":"https://ror.org/000zgvm20","country_code":"TW","type":"facility","lineage":["https://openalex.org/I4210086894","https://openalex.org/I84653119"]}],"countries":["TW"],"is_corresponding":true,"raw_author_name":"Ryandhimas E. Zezario","raw_affiliation_strings":["Academia Sinica,Research Center for Information Technology Innovation,Taipei,Taiwan"],"affiliations":[{"raw_affiliation_string":"Academia Sinica,Research Center for Information Technology Innovation,Taipei,Taiwan","institution_ids":["https://openalex.org/I4210086894"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5093650148","display_name":"Dyah A. M. G. Wisnu","orcid":null},"institutions":[{"id":"https://openalex.org/I4210098366","display_name":"Institute of Information Science, Academia Sinica","ror":"https://ror.org/00z83z196","country_code":"TW","type":"facility","lineage":["https://openalex.org/I4210098366","https://openalex.org/I84653119"]}],"countries":["TW"],"is_corresponding":false,"raw_author_name":"Dyah A.M.G. Wisnu","raw_affiliation_strings":["Institute of Information Science, Academia Sinica,Social Network and Human Centered Computing,Taipei,Taiwan"],"affiliations":[{"raw_affiliation_string":"Institute of Information Science, Academia Sinica,Social Network and Human Centered Computing,Taipei,Taiwan","institution_ids":["https://openalex.org/I4210098366"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5071214181","display_name":"Hsin\u2010Min Wang","orcid":"https://orcid.org/0000-0003-3599-5071"},"institutions":[{"id":"https://openalex.org/I4210098366","display_name":"Institute of Information Science, Academia Sinica","ror":"https://ror.org/00z83z196","country_code":"TW","type":"facility","lineage":["https://openalex.org/I4210098366","https://openalex.org/I84653119"]}],"countries":["TW"],"is_corresponding":false,"raw_author_name":"Hsin-Min Wang","raw_affiliation_strings":["Institute of Information Science, Academia Sinica,Taipei,Taiwan"],"affiliations":[{"raw_affiliation_string":"Institute of Information Science, Academia Sinica,Taipei,Taiwan","institution_ids":["https://openalex.org/I4210098366"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5044008055","display_name":"Yu Tsao","orcid":"https://orcid.org/0000-0001-6956-0418"},"institutions":[{"id":"https://openalex.org/I4210086894","display_name":"Research Center for Information Technology Innovation, Academia Sinica","ror":"https://ror.org/000zgvm20","country_code":"TW","type":"facility","lineage":["https://openalex.org/I4210086894","https://openalex.org/I84653119"]}],"countries":["TW"],"is_corresponding":false,"raw_author_name":"Yu Tsao","raw_affiliation_strings":["Academia Sinica,Research Center for Information Technology Innovation,Taipei,Taiwan"],"affiliations":[{"raw_affiliation_string":"Academia Sinica,Research Center for Information Technology Innovation,Taipei,Taiwan","institution_ids":["https://openalex.org/I4210086894"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5030281426"],"corresponding_institution_ids":["https://openalex.org/I4210086894"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.48029182,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1057","last_page":"1061"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.45010000467300415,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.45010000467300415,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.2624000012874603,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10283","display_name":"Hearing Loss and Rehabilitation","score":0.14839999377727509,"subfield":{"id":"https://openalex.org/subfields/2805","display_name":"Cognitive Neuroscience"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/softmax-function","display_name":"Softmax function","score":0.8450999855995178},{"id":"https://openalex.org/keywords/intelligibility","display_name":"Intelligibility (philosophy)","score":0.6929000020027161},{"id":"https://openalex.org/keywords/convolutional-neural-network","display_name":"Convolutional neural network","score":0.5773000121116638},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.46470001339912415},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.43779999017715454},{"id":"https://openalex.org/keywords/entropy","display_name":"Entropy (arrow of time)","score":0.4348999857902527},{"id":"https://openalex.org/keywords/cross-entropy","display_name":"Cross entropy","score":0.3564000129699707},{"id":"https://openalex.org/keywords/classifier","display_name":"Classifier (UML)","score":0.3492000102996826}],"concepts":[{"id":"https://openalex.org/C188441871","wikidata":"https://www.wikidata.org/wiki/Q7554146","display_name":"Softmax function","level":3,"score":0.8450999855995178},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7264999747276306},{"id":"https://openalex.org/C60048801","wikidata":"https://www.wikidata.org/wiki/Q1433889","display_name":"Intelligibility (philosophy)","level":2,"score":0.6929000020027161},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.605400025844574},{"id":"https://openalex.org/C81363708","wikidata":"https://www.wikidata.org/wiki/Q17084460","display_name":"Convolutional neural network","level":2,"score":0.5773000121116638},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5184000134468079},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.46470001339912415},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.43779999017715454},{"id":"https://openalex.org/C106301342","wikidata":"https://www.wikidata.org/wiki/Q4117933","display_name":"Entropy (arrow of time)","level":2,"score":0.4348999857902527},{"id":"https://openalex.org/C167981619","wikidata":"https://www.wikidata.org/wiki/Q1685498","display_name":"Cross entropy","level":3,"score":0.3564000129699707},{"id":"https://openalex.org/C95623464","wikidata":"https://www.wikidata.org/wiki/Q1096149","display_name":"Classifier (UML)","level":2,"score":0.3492000102996826},{"id":"https://openalex.org/C114289077","wikidata":"https://www.wikidata.org/wiki/Q3284399","display_name":"Statistical model","level":2,"score":0.3370000123977661},{"id":"https://openalex.org/C33724603","wikidata":"https://www.wikidata.org/wiki/Q812540","display_name":"Bayesian network","level":2,"score":0.3330000042915344},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.33079999685287476},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.31439998745918274},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.31380000710487366},{"id":"https://openalex.org/C2777530160","wikidata":"https://www.wikidata.org/wiki/Q41796","display_name":"Sentence","level":2,"score":0.3018999993801117},{"id":"https://openalex.org/C2777462759","wikidata":"https://www.wikidata.org/wiki/Q18395344","display_name":"Word embedding","level":3,"score":0.30140000581741333},{"id":"https://openalex.org/C2984842247","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep neural networks","level":3,"score":0.29789999127388},{"id":"https://openalex.org/C107673813","wikidata":"https://www.wikidata.org/wiki/Q812534","display_name":"Bayesian probability","level":2,"score":0.2946000099182129},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.29339998960494995},{"id":"https://openalex.org/C147168706","wikidata":"https://www.wikidata.org/wiki/Q1457734","display_name":"Recurrent neural network","level":3,"score":0.2879999876022339},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.273499995470047},{"id":"https://openalex.org/C49937458","wikidata":"https://www.wikidata.org/wiki/Q2599292","display_name":"Probabilistic logic","level":2,"score":0.2547999918460846}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/apsipaasc65261.2025.11249303","is_oa":false,"landing_page_url":"https://doi.org/10.1109/apsipaasc65261.2025.11249303","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 Asia Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":26,"referenced_works":["https://openalex.org/W1582976041","https://openalex.org/W1985029311","https://openalex.org/W1987831012","https://openalex.org/W2078483536","https://openalex.org/W2121973264","https://openalex.org/W2141998673","https://openalex.org/W2809874909","https://openalex.org/W2963828919","https://openalex.org/W2964052309","https://openalex.org/W3016070314","https://openalex.org/W3016129867","https://openalex.org/W3032727804","https://openalex.org/W3209059054","https://openalex.org/W4221143458","https://openalex.org/W4223595754","https://openalex.org/W4226198268","https://openalex.org/W4245919820","https://openalex.org/W4297841451","https://openalex.org/W4297841568","https://openalex.org/W4311167834","https://openalex.org/W4387363548","https://openalex.org/W4391021560","https://openalex.org/W4392903985","https://openalex.org/W4392904968","https://openalex.org/W4402981815","https://openalex.org/W4404712190"],"related_works":[],"abstract_inverted_index":{"Non-intrusive":[0],"speech":[1],"intelligibility":[2,98,119],"prediction":[3,99],"remains":[4],"challenging":[5],"due":[6],"to":[7],"variability":[8],"in":[9,25],"speakers,":[10],"noise":[11],"conditions,":[12],"and":[13,34,64,107,121,130,152],"subjective":[14],"perception.":[15],"We":[16],"propose":[17,93],"an":[18,95],"uncertainty-aware":[19,150],"approach":[20],"that":[21,101,135],"leverages":[22],"Whisper":[23],"embeddings":[24],"combination":[26],"with":[27],"statistical":[28],"features\u2014specifically,":[29],"the":[30,38,48,62,69,138,146,153],"mean,":[31],"standard":[32,65],"deviation,":[33],"entropy":[35],"computed":[36,43],"across":[37,141],"embedding":[39],"dimensions.":[40],"The":[41],"entropy,":[42],"via":[44],"a":[45,53,76,111],"softmax":[46],"over":[47],"feature":[49],"dimension,":[50],"serves":[51],"as":[52],"proxy":[54],"for":[55],"uncertainty,":[56],"complementing":[57],"global":[58],"information":[59],"captured":[60],"by":[61],"mean":[63],"deviation.":[66],"To":[67],"model":[68],"sequential":[70],"structure":[71],"of":[72,148],"speech,":[73],"we":[74,92],"adopt":[75],"scalar":[77],"long":[78],"short-term":[79],"memory":[80],"(sLSTM)":[81],"network,":[82],"which":[83],"efficiently":[84],"captures":[85],"long-range":[86],"dependencies.":[87],"Building":[88],"on":[89],"this":[90],"foundation,":[91],"iMTI-Net,":[94],"improved":[96],"multi-target":[97],"network":[100,105],"integrates":[102],"convolutional":[103],"neural":[104],"(CNN)":[106],"sLSTM":[108],"components":[109],"within":[110],"multitask":[112],"learning":[113],"framework.":[114],"It":[115],"jointly":[116],"predicts":[117],"human":[118],"scores":[120],"machine-based":[122],"word":[123],"error":[124],"rates":[125],"(WER)":[126],"from":[127],"Google":[128],"ASR":[129],"Whisper.":[131],"Experimental":[132],"results":[133],"show":[134],"iMTI-Net":[136],"outperforms":[137],"original":[139],"MTI-Net":[140],"multiple":[142],"evaluation":[143],"metrics,":[144],"demonstrating":[145],"effectiveness":[147],"incorporating":[149],"features":[151],"CNN-sLSTM":[154],"architecture.":[155]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-11-28T00:00:00"}
