{"id":"https://openalex.org/W7161733663","doi":"https://doi.org/10.48550/arxiv.2605.16545","title":"Symphony for Speech-to-Text: Supporting Real-Time Medical Voice Interfaces","display_name":"Symphony for Speech-to-Text: Supporting Real-Time Medical Voice Interfaces","publication_year":2026,"publication_date":"2026-05-15","ids":{"openalex":"https://openalex.org/W7161733663","doi":"https://doi.org/10.48550/arxiv.2605.16545"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.16545","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.16545","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.16545","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5133331217","display_name":"Arne Nix","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Nix, Arne","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136468756","display_name":"Robert James","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"James, Robert","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5018340002","display_name":"Lasse Borgholt","orcid":"https://orcid.org/0000-0002-3562-8442"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Borgholt, Lasse","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136470668","display_name":"Anna B. Ekner","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ekner, Anna B.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136480405","display_name":"Lana Krumm","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Krumm, Lana","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5009964346","display_name":"Julius Severin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Severin, Julius","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5029719157","display_name":"Dan Engel","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Engel, Dan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5007556794","display_name":"Lars Maal\u00f8e","orcid":"https://orcid.org/0000-0003-2030-520X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Maal\u00f8e, Lars","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5134208508","display_name":"Jakob Havtorn","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Havtorn, Jakob","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11894","display_name":"Radiology practices and education","score":0.13650000095367432,"subfield":{"id":"https://openalex.org/subfields/2741","display_name":"Radiology, Nuclear Medicine and Imaging"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},"topics":[{"id":"https://openalex.org/T11894","display_name":"Radiology practices and education","score":0.13650000095367432,"subfield":{"id":"https://openalex.org/subfields/2741","display_name":"Radiology, Nuclear Medicine and Imaging"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},{"id":"https://openalex.org/T12419","display_name":"Phonocardiography and Auscultation Techniques","score":0.13289999961853027,"subfield":{"id":"https://openalex.org/subfields/2740","display_name":"Pulmonary and Respiratory Medicine"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.10660000145435333,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/dictation","display_name":"Dictation","score":0.8758999705314636},{"id":"https://openalex.org/keywords/symphony","display_name":"Symphony","score":0.8052999973297119},{"id":"https://openalex.org/keywords/transcription","display_name":"Transcription (linguistics)","score":0.5884000062942505},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.45509999990463257},{"id":"https://openalex.org/keywords/reliability","display_name":"Reliability (semiconductor)","score":0.4343999922275543},{"id":"https://openalex.org/keywords/modality","display_name":"Modality (human\u2013computer interaction)","score":0.4341000020503998},{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.37299999594688416},{"id":"https://openalex.org/keywords/chord","display_name":"Chord (peer-to-peer)","score":0.36719998717308044}],"concepts":[{"id":"https://openalex.org/C2779077324","wikidata":"https://www.wikidata.org/wiki/Q1087138","display_name":"Dictation","level":2,"score":0.8758999705314636},{"id":"https://openalex.org/C16277566","wikidata":"https://www.wikidata.org/wiki/Q9734","display_name":"Symphony","level":2,"score":0.8052999973297119},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6499999761581421},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6229000091552734},{"id":"https://openalex.org/C179926584","wikidata":"https://www.wikidata.org/wiki/Q207714","display_name":"Transcription (linguistics)","level":2,"score":0.5884000062942505},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.45509999990463257},{"id":"https://openalex.org/C43214815","wikidata":"https://www.wikidata.org/wiki/Q7310987","display_name":"Reliability (semiconductor)","level":3,"score":0.4343999922275543},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.4341000020503998},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.37299999594688416},{"id":"https://openalex.org/C194147245","wikidata":"https://www.wikidata.org/wiki/Q1076368","display_name":"Chord (peer-to-peer)","level":2,"score":0.36719998717308044},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3450999855995178},{"id":"https://openalex.org/C188198153","wikidata":"https://www.wikidata.org/wiki/Q1613840","display_name":"Limiting","level":2,"score":0.34299999475479126},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.33149999380111694},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.32260000705718994},{"id":"https://openalex.org/C511192102","wikidata":"https://www.wikidata.org/wiki/Q5156948","display_name":"Comprehension","level":2,"score":0.31310001015663147},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.30809998512268066},{"id":"https://openalex.org/C2777413886","wikidata":"https://www.wikidata.org/wiki/Q3276013","display_name":"Fluency","level":2,"score":0.30559998750686646},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3001999855041504},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.2854999899864197},{"id":"https://openalex.org/C100660578","wikidata":"https://www.wikidata.org/wiki/Q18733","display_name":"Recall","level":2,"score":0.2793999910354614},{"id":"https://openalex.org/C2780719617","wikidata":"https://www.wikidata.org/wiki/Q1030752","display_name":"Salient","level":2,"score":0.274399995803833},{"id":"https://openalex.org/C2776502983","wikidata":"https://www.wikidata.org/wiki/Q690182","display_name":"Contrast (vision)","level":2,"score":0.2621000111103058},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.2619999945163727},{"id":"https://openalex.org/C86251818","wikidata":"https://www.wikidata.org/wiki/Q816754","display_name":"Benchmarking","level":2,"score":0.25189998745918274},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.25110000371932983}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.16545","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.16545","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.16545","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.16545","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.49065858125686646,"display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"After":[0],"decades":[1],"of":[2],"use":[3,122],"in":[4,24,64,116,139,147,169],"dictation":[5,59],"and,":[6],"more":[7],"recently,":[8],"ambient":[9],"documentation,":[10],"speech":[11,28,81,130,171],"is":[12,174],"emerging":[13],"as":[14],"a":[15,79,158,177],"primary":[16],"modality":[17],"for":[18,54,70,77,84,100,180],"interacting":[19],"with":[20],"technology":[21],"and":[22,40,44,67,87,103,119,128,166,185],"AI":[23],"healthcare.":[25],"Yet":[26],"medical":[27,108,129,170],"recognition":[29,82],"remains":[30],"difficult:":[31],"systems":[32,138],"must":[33],"capture":[34],"specialized":[35,98],"terminology,":[36],"resolve":[37],"contextual":[38,104],"ambiguity,":[39],"render":[41],"measurements,":[42],"abbreviations,":[43],"clinical":[45,72,90,140,159],"shorthand":[46],"precisely.":[47],"Existing":[48],"solutions":[49],"are":[50],"typically":[51],"optimized":[52],"either":[53],"general-purpose":[55],"transcription":[56,95],"or":[57,144],"narrow":[58],"workflows,":[60],"limiting":[61],"their":[62,68],"reliability":[63],"safety-critical":[65],"settings":[66,141],"usefulness":[69],"broader":[71],"workflows.":[73],"We":[74,156],"introduce":[75],"Symphony":[76,92,134,173],"Speech-to-Text,":[78],"medical-grade":[80],"system":[83],"real-time":[85],"streaming":[86],"batch":[88,186],"file-based":[89],"use.":[91],"decomposes":[93],"the":[94],"process":[96],"into":[97],"components":[99],"recognition,":[101],"formatting,":[102],"correction":[105],"to":[106,162],"optimize":[107],"term":[109],"recall":[110],"while":[111,142],"producing":[112],"clinically":[113],"structured":[114],"text":[115],"real":[117],"time":[118],"adapting":[120],"across":[121],"cases.":[123],"Evaluations":[124],"on":[125],"public":[126],"benchmark":[127,160],"datasets":[131],"show":[132],"that":[133],"substantially":[135],"outperforms":[136],"state-of-the-art":[137],"matching":[143],"exceeding":[145],"them":[146],"general-domain":[148],"settings,":[149],"suggesting":[150],"robust":[151],"generalization":[152],"rather":[153],"than":[154],"overfitting.":[155],"release":[157],"dataset":[161],"support":[163],"reliable":[164],"validation":[165],"further":[167],"progress":[168],"recognition.":[172],"available":[175],"through":[176],"production-grade":[178],"API":[179],"live":[181],"dictation,":[182],"conversational":[183],"transcription,":[184],"audio":[187],"file":[188],"processing.":[189]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-20T00:00:00"}
