{"id":"https://openalex.org/W4390120219","doi":"https://doi.org/10.1109/jbhi.2023.3345897","title":"Improving Medical Speech-to-Text Accuracy using Vision-Language Pre-training Models","display_name":"Improving Medical Speech-to-Text Accuracy using Vision-Language Pre-training Models","publication_year":2023,"publication_date":"2023-12-22","ids":{"openalex":"https://openalex.org/W4390120219","doi":"https://doi.org/10.1109/jbhi.2023.3345897","pmid":"https://pubmed.ncbi.nlm.nih.gov/38133977"},"language":"en","primary_location":{"id":"doi:10.1109/jbhi.2023.3345897","is_oa":false,"landing_page_url":"https://doi.org/10.1109/jbhi.2023.3345897","pdf_url":null,"source":{"id":"https://openalex.org/S2495854775","display_name":"IEEE Journal of Biomedical and Health Informatics","issn_l":"2168-2194","issn":["2168-2194","2168-2208"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Journal of Biomedical and Health Informatics","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","pubmed"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5083221978","display_name":"Jaeyoung Huh","orcid":"https://orcid.org/0000-0002-2126-0763"},"institutions":[{"id":"https://openalex.org/I157485424","display_name":"Korea Advanced Institute of Science and Technology","ror":"https://ror.org/05apxxy63","country_code":"KR","type":"education","lineage":["https://openalex.org/I157485424"]},{"id":"https://openalex.org/I4210101891","display_name":"Korea Institute of Brain Science","ror":"https://ror.org/017stnw60","country_code":"KR","type":"facility","lineage":["https://openalex.org/I4210101891"]}],"countries":["KR"],"is_corresponding":true,"raw_author_name":"Jaeyoung Huh","raw_affiliation_strings":["Department of Bio and Brain Engineering, Korea Advanced Institute of Science and Technology, Daejeon, South Korea"],"affiliations":[{"raw_affiliation_string":"Department of Bio and Brain Engineering, Korea Advanced Institute of Science and Technology, Daejeon, South Korea","institution_ids":["https://openalex.org/I4210101891","https://openalex.org/I157485424"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5057920247","display_name":"Sang Joon Park","orcid":"https://orcid.org/0000-0002-9223-3172"},"institutions":[{"id":"https://openalex.org/I157485424","display_name":"Korea Advanced Institute of Science and Technology","ror":"https://ror.org/05apxxy63","country_code":"KR","type":"education","lineage":["https://openalex.org/I157485424"]},{"id":"https://openalex.org/I193775966","display_name":"Yonsei University","ror":"https://ror.org/01wjejq96","country_code":"KR","type":"education","lineage":["https://openalex.org/I193775966"]},{"id":"https://openalex.org/I4210101891","display_name":"Korea Institute of Brain Science","ror":"https://ror.org/017stnw60","country_code":"KR","type":"facility","lineage":["https://openalex.org/I4210101891"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Sangjoon Park","raw_affiliation_strings":["Department of Radiation Oncology, Yonsei College of Medicine, Seodaemun-gu, South Korea","Department of Bio and Brain Engineering, Korea Advanced Institute of Science and Technology, Daejeon, South Korea"],"affiliations":[{"raw_affiliation_string":"Department of Radiation Oncology, Yonsei College of Medicine, Seodaemun-gu, South Korea","institution_ids":["https://openalex.org/I193775966"]},{"raw_affiliation_string":"Department of Bio and Brain Engineering, Korea Advanced Institute of Science and Technology, Daejeon, South Korea","institution_ids":["https://openalex.org/I4210101891","https://openalex.org/I157485424"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100351241","display_name":"Jeong Eun Lee","orcid":"https://orcid.org/0000-0001-9566-7489"},"institutions":[{"id":"https://openalex.org/I4210143602","display_name":"Chungnam National University Hospital","ror":"https://ror.org/04353mq94","country_code":"KR","type":"healthcare","lineage":["https://openalex.org/I196345858","https://openalex.org/I4210143602"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Jeong Eun Lee","raw_affiliation_strings":["Department of Radiology, Chungnam National University Hospital, Chungnam National University College of Medicine, Daejeon, South Korea"],"affiliations":[{"raw_affiliation_string":"Department of Radiology, Chungnam National University Hospital, Chungnam National University College of Medicine, Daejeon, South Korea","institution_ids":["https://openalex.org/I4210143602"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5012644755","display_name":"Jong Chul Ye","orcid":"https://orcid.org/0000-0001-9763-9609"},"institutions":[{"id":"https://openalex.org/I157485424","display_name":"Korea Advanced Institute of Science and Technology","ror":"https://ror.org/05apxxy63","country_code":"KR","type":"education","lineage":["https://openalex.org/I157485424"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Jong Chul Ye","raw_affiliation_strings":["Department of Kim Jaechul Graduate School of AI, Korea Advanced Institute of Science and Technology, Daejeon, South Korea"],"affiliations":[{"raw_affiliation_string":"Department of Kim Jaechul Graduate School of AI, Korea Advanced Institute of Science and Technology, Daejeon, South Korea","institution_ids":["https://openalex.org/I157485424"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5083221978"],"corresponding_institution_ids":["https://openalex.org/I157485424","https://openalex.org/I4210101891"],"apc_list":null,"apc_paid":null,"fwci":1.5533,"has_fulltext":false,"cited_by_count":13,"citation_normalized_percentile":{"value":0.85668981,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":94,"max":99},"biblio":{"volume":"28","issue":"3","first_page":"1692","last_page":"1703"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9987000226974487,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9987000226974487,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.995199978351593,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9926000237464905,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8226600289344788},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5815609693527222},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.5746470093727112},{"id":"https://openalex.org/keywords/domain","display_name":"Domain (mathematical analysis)","score":0.5723589062690735},{"id":"https://openalex.org/keywords/workflow","display_name":"Workflow","score":0.5719041228294373},{"id":"https://openalex.org/keywords/field","display_name":"Field (mathematics)","score":0.569711446762085},{"id":"https://openalex.org/keywords/workload","display_name":"Workload","score":0.5569084286689758},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5302585959434509},{"id":"https://openalex.org/keywords/modal","display_name":"Modal","score":0.4272882044315338},{"id":"https://openalex.org/keywords/text-recognition","display_name":"Text recognition","score":0.41452106833457947},{"id":"https://openalex.org/keywords/text-simplification","display_name":"Text simplification","score":0.4100217819213867},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.2964435815811157}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8226600289344788},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5815609693527222},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5746470093727112},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.5723589062690735},{"id":"https://openalex.org/C177212765","wikidata":"https://www.wikidata.org/wiki/Q627335","display_name":"Workflow","level":2,"score":0.5719041228294373},{"id":"https://openalex.org/C9652623","wikidata":"https://www.wikidata.org/wiki/Q190109","display_name":"Field (mathematics)","level":2,"score":0.569711446762085},{"id":"https://openalex.org/C2778476105","wikidata":"https://www.wikidata.org/wiki/Q628539","display_name":"Workload","level":2,"score":0.5569084286689758},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5302585959434509},{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.4272882044315338},{"id":"https://openalex.org/C2983812711","wikidata":"https://www.wikidata.org/wiki/Q167555","display_name":"Text recognition","level":3,"score":0.41452106833457947},{"id":"https://openalex.org/C59415355","wikidata":"https://www.wikidata.org/wiki/Q3484781","display_name":"Text simplification","level":3,"score":0.4100217819213867},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.2964435815811157},{"id":"https://openalex.org/C202444582","wikidata":"https://www.wikidata.org/wiki/Q837863","display_name":"Pure mathematics","level":1,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.0},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0},{"id":"https://openalex.org/C2777530160","wikidata":"https://www.wikidata.org/wiki/Q41796","display_name":"Sentence","level":2,"score":0.0},{"id":"https://openalex.org/C134306372","wikidata":"https://www.wikidata.org/wiki/Q7754","display_name":"Mathematical analysis","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C188027245","wikidata":"https://www.wikidata.org/wiki/Q750446","display_name":"Polymer chemistry","level":1,"score":0.0}],"mesh":[{"descriptor_ui":"D006801","descriptor_name":"Humans","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D006801","descriptor_name":"Humans","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D006801","descriptor_name":"Humans","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D006801","descriptor_name":"Humans","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D007802","descriptor_name":"Language","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D007802","descriptor_name":"Language","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D007802","descriptor_name":"Language","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D007802","descriptor_name":"Language","qualifier_ui":null,"qualifier_name":null,"is_major_topic":false},{"descriptor_ui":"D013060","descriptor_name":"Speech","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D013060","descriptor_name":"Speech","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D013060","descriptor_name":"Speech","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D013060","descriptor_name":"Speech","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D014831","descriptor_name":"Voice","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D014831","descriptor_name":"Voice","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D014831","descriptor_name":"Voice","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true},{"descriptor_ui":"D014831","descriptor_name":"Voice","qualifier_ui":null,"qualifier_name":null,"is_major_topic":true}],"locations_count":2,"locations":[{"id":"doi:10.1109/jbhi.2023.3345897","is_oa":false,"landing_page_url":"https://doi.org/10.1109/jbhi.2023.3345897","pdf_url":null,"source":{"id":"https://openalex.org/S2495854775","display_name":"IEEE Journal of Biomedical and Health Informatics","issn_l":"2168-2194","issn":["2168-2194","2168-2208"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Journal of Biomedical and Health Informatics","raw_type":"journal-article"},{"id":"pmid:38133977","is_oa":false,"landing_page_url":"https://pubmed.ncbi.nlm.nih.gov/38133977","pdf_url":null,"source":{"id":"https://openalex.org/S4306525036","display_name":"PubMed","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE journal of biomedical and health informatics","raw_type":null}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.800000011920929,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[{"id":"https://openalex.org/G3444300927","display_name":null,"funder_award_id":"RS-2023-00262527","funder_id":"https://openalex.org/F4320322120","funder_display_name":"National Research Foundation of Korea"},{"id":"https://openalex.org/G7865882150","display_name":null,"funder_award_id":"NRF-2020R1A2B5B03001980","funder_id":"https://openalex.org/F4320322120","funder_display_name":"National Research Foundation of Korea"}],"funders":[{"id":"https://openalex.org/F4320322120","display_name":"National Research Foundation of Korea","ror":"https://ror.org/013aysd81"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":57,"referenced_works":["https://openalex.org/W1517386993","https://openalex.org/W1922655562","https://openalex.org/W1956340063","https://openalex.org/W1995562189","https://openalex.org/W2003333103","https://openalex.org/W2125838338","https://openalex.org/W2142416747","https://openalex.org/W2143612262","https://openalex.org/W2147768505","https://openalex.org/W2160815625","https://openalex.org/W2394932179","https://openalex.org/W2520160253","https://openalex.org/W2778310824","https://openalex.org/W2886180730","https://openalex.org/W2892009249","https://openalex.org/W2904818793","https://openalex.org/W2953190524","https://openalex.org/W2962719052","https://openalex.org/W2963466845","https://openalex.org/W2968124245","https://openalex.org/W2973049979","https://openalex.org/W2973215447","https://openalex.org/W2995225687","https://openalex.org/W2998356391","https://openalex.org/W3046177490","https://openalex.org/W3090449556","https://openalex.org/W3095173472","https://openalex.org/W3097075707","https://openalex.org/W3097777922","https://openalex.org/W3164654615","https://openalex.org/W4206341317","https://openalex.org/W4283821415","https://openalex.org/W4285159554","https://openalex.org/W4287813795","https://openalex.org/W4300672471","https://openalex.org/W4385546024","https://openalex.org/W4388451090","https://openalex.org/W6631362777","https://openalex.org/W6640090968","https://openalex.org/W6678262379","https://openalex.org/W6682631176","https://openalex.org/W6687566353","https://openalex.org/W6726497184","https://openalex.org/W6727336983","https://openalex.org/W6747741389","https://openalex.org/W6748221168","https://openalex.org/W6757424787","https://openalex.org/W6761205521","https://openalex.org/W6766904570","https://openalex.org/W6767211374","https://openalex.org/W6767279747","https://openalex.org/W6775843921","https://openalex.org/W6780218876","https://openalex.org/W6791353385","https://openalex.org/W6798001135","https://openalex.org/W6798805250","https://openalex.org/W6898505805"],"related_works":["https://openalex.org/W2000785801","https://openalex.org/W986318368","https://openalex.org/W2384410913","https://openalex.org/W2352878646","https://openalex.org/W2004734601","https://openalex.org/W2130149817","https://openalex.org/W2990194547","https://openalex.org/W1480123525","https://openalex.org/W2620865396","https://openalex.org/W2152349655"],"abstract_inverted_index":{"Automatic":[0],"Speech":[1],"Recognition":[2],"(ASR)":[3],"is":[4,27,74],"a":[5,92,103],"technology":[6],"that":[7,97,131,151],"converts":[8],"spoken":[9,37],"words":[10,38],"into":[11,39],"text,":[12],"facilitating":[13],"interaction":[14],"between":[15],"humans":[16],"and":[17,83,117,137,156],"machines.":[18],"One":[19],"of":[20,25,54,80,102,154],"the":[21,42,47,52,71,78,99,108,132,145],"most":[22],"common":[23],"applications":[24],"ASR":[26],"Speech-To-Text":[28],"(STT)":[29],"technology,":[30],"which":[31],"simplifies":[32],"user":[33],"workflows":[34],"by":[35],"transcribing":[36],"text.":[40],"In":[41],"medical":[43,72,146],"field,":[44],"STT":[45,68,105,142],"has":[46],"potential":[48],"to":[49,60,77,120],"significantly":[50],"reduce":[51],"workload":[53],"clinicians":[55],"who":[56],"rely":[57],"on":[58,124],"typists":[59],"transcribe":[61],"their":[62],"voice":[63],"recordings.":[64],"However,":[65],"developing":[66],"an":[67],"model":[69],"for":[70],"domain":[73],"challenging":[75],"due":[76],"lack":[79],"sufficient":[81],"speech":[82],"text":[84,94,101,122,157,164],"datasets.":[85],"To":[86],"address":[87],"this":[88],"issue,":[89],"we":[90],"propose":[91],"medical-domain":[93],"correction":[95],"method":[96,134],"modifies":[98],"output":[100],"general":[104],"system":[106],"using":[107,162],"Vision":[109],"Language":[110],"Pre-training":[111],"(VLP)":[112],"method.":[113],"VLP":[114],"combines":[115],"textual":[116],"visual":[118],"information":[119,158],"correct":[121],"based":[123],"image":[125,155],"knowledge.":[126],"Our":[127],"extensive":[128],"experiments":[129],"demonstrate":[130],"proposed":[133],"offers":[135],"quantitatively":[136],"clinically":[138],"significant":[139],"improvements":[140],"in":[141,144],"performance":[143],"field.":[147],"We":[148],"further":[149],"show":[150],"multi-modal":[152],"understanding":[153,161],"outperforms":[159],"single-modal":[160],"only":[163],"information.":[165]},"counts_by_year":[{"year":2026,"cited_by_count":2},{"year":2025,"cited_by_count":7},{"year":2024,"cited_by_count":2},{"year":2023,"cited_by_count":2}],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-10-10T00:00:00"}
