{"id":"https://openalex.org/W4415432855","doi":"https://doi.org/10.21437/interspeech.2025-2147","title":"Conformer-based Ultrasound-to-Speech Conversion","display_name":"Conformer-based Ultrasound-to-Speech Conversion","publication_year":2025,"publication_date":"2025-08-17","ids":{"openalex":"https://openalex.org/W4415432855","doi":"https://doi.org/10.21437/interspeech.2025-2147"},"language":"en","primary_location":{"id":"doi:10.21437/interspeech.2025-2147","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2025-2147","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2025","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5084485509","display_name":"Ibrahim Ibrahimov","orcid":"https://orcid.org/0009-0009-3924-4663"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ibrahim Ibrahimov","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5031237235","display_name":"Csaba Zaink\u00f3","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Csaba Zaink\u00f3","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5088559776","display_name":"G\u00e1bor Gosztolya","orcid":"https://orcid.org/0000-0002-2864-6466"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"G\u00e1bor Gosztolya","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.14147588,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"5578","last_page":"5582"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9221000075340271,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9221000075340271,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/spectrogram","display_name":"Spectrogram","score":0.7918000221252441},{"id":"https://openalex.org/keywords/conformational-isomerism","display_name":"Conformational isomerism","score":0.5404999852180481},{"id":"https://openalex.org/keywords/waveform","display_name":"Waveform","score":0.48590001463890076},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.428600013256073},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.4000000059604645},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.39820000529289246},{"id":"https://openalex.org/keywords/cepstrum","display_name":"Cepstrum","score":0.3693000078201294}],"concepts":[{"id":"https://openalex.org/C45273575","wikidata":"https://www.wikidata.org/wiki/Q578970","display_name":"Spectrogram","level":2,"score":0.7918000221252441},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6215999722480774},{"id":"https://openalex.org/C18705241","wikidata":"https://www.wikidata.org/wiki/Q1128023","display_name":"Conformational isomerism","level":3,"score":0.5404999852180481},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5105000138282776},{"id":"https://openalex.org/C197424946","wikidata":"https://www.wikidata.org/wiki/Q1165717","display_name":"Waveform","level":3,"score":0.48590001463890076},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4738999903202057},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.428600013256073},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.4000000059604645},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.39820000529289246},{"id":"https://openalex.org/C88485024","wikidata":"https://www.wikidata.org/wiki/Q1054571","display_name":"Cepstrum","level":2,"score":0.3693000078201294},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.3578000068664551},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.35269999504089355},{"id":"https://openalex.org/C177291462","wikidata":"https://www.wikidata.org/wiki/Q423038","display_name":"Active listening","level":2,"score":0.34869998693466187},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.32580000162124634},{"id":"https://openalex.org/C59656382","wikidata":"https://www.wikidata.org/wiki/Q191536","display_name":"Conjunction (astronomy)","level":2,"score":0.3052000105381012},{"id":"https://openalex.org/C75553542","wikidata":"https://www.wikidata.org/wiki/Q178161","display_name":"A priori and a posteriori","level":2,"score":0.28600001335144043},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.2786000072956085},{"id":"https://openalex.org/C151989614","wikidata":"https://www.wikidata.org/wiki/Q440370","display_name":"Mel-frequency cepstrum","level":3,"score":0.2574000060558319}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.21437/interspeech.2025-2147","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2025-2147","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2025","raw_type":"proceedings-article"},{"id":"pmh:oai:publicatio.bibl.u-szeged.hu:39052","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4306400436","display_name":"SZTE Publicatio Repozit\u00f3rium (University of Szeged)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I227486990","host_organization_name":"University of Szeged","host_organization_lineage":["https://openalex.org/I227486990"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"acceptedVersion","is_accepted":true,"is_published":false,"raw_source_name":null,"raw_type":"NonPeerReviewed"},{"id":"pmh:oai:zenodo.org:16988824","is_oa":true,"landing_page_url":"https://doi.org/10.21437/Interspeech.2025-2147","pdf_url":null,"source":{"id":"https://openalex.org/S4306400562","display_name":"Zenodo (CERN European Organization for Nuclear Research)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I67311998","host_organization_name":"European Organization for Nuclear Research","host_organization_lineage":["https://openalex.org/I67311998"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"INTERSPEECH, Interspeech 2025, Rotterdam, Netherlands, 17-21 August 2025","raw_type":"info:eu-repo/semantics/conferencePaper"}],"best_oa_location":{"id":"pmh:oai:publicatio.bibl.u-szeged.hu:39052","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4306400436","display_name":"SZTE Publicatio Repozit\u00f3rium (University of Szeged)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I227486990","host_organization_name":"University of Szeged","host_organization_lineage":["https://openalex.org/I227486990"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"acceptedVersion","is_accepted":true,"is_published":false,"raw_source_name":null,"raw_type":"NonPeerReviewed"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"&lt;p&gt;Deep":[0],"neural":[1],"networks":[2],"have":[3],"shown":[4],"promising":[5,129],"potential":[6],"for":[7,29,78,133],"ultrasound-to-speech":[8,134],"conversion":[9],"task":[10],"towards":[11],"Silent":[12],"Speech":[13],"Interfaces.":[14],"In":[15],"this":[16,30],"work,":[17],"we":[18],"applied":[19],"two":[20],"Conformer-based":[21,120],"DNN":[22],"architectures":[23],"(Base":[24],"and":[25,69],"one":[26],"with":[27,89,105,125],"bi-LSTM)":[28],"task.":[31],"Speaker-specific":[32],"models":[33],"were":[34,51],"trained":[35],"on":[36],"the":[37,43,47,99,102,123],"data":[38],"of":[39,101],"four":[40],"speakers":[41],"from":[42],"Ultrasuite-Tal80":[44],"dataset,":[45],"while":[46,95],"generated":[48],"mel":[49,70],"spectrograms":[50],"synthesized":[52],"to":[53,61,112,131],"audio":[54],"waveform":[55],"using":[56],"a":[57,62,82,106,128],"HiFi-GAN":[58],"vocoder.":[59],"Compared":[60],"standard":[63],"2D-CNN":[64],"baseline,":[65],"objective":[66],"measurements":[67],"(MSE":[68],"cepstral":[71],"distortion)":[72],"showed":[73],"no":[74],"statistically":[75],"significant":[76],"improvement":[77],"either":[79],"model.":[80],"However,":[81],"MUSHRA":[83],"listening":[84],"test":[85],"revealed":[86],"that":[87,119],"Conformer":[88,96,124],"bi-LSTM":[90],"provided":[91],"better":[92],"perceptual":[93],"quality,":[94],"Base":[97],"matched":[98],"performance":[100],"baseline":[103],"along":[104],"3&times;":[107],"faster":[108],"training":[109],"time":[110],"due":[111],"its":[113],"simpler":[114],"architecture.":[115],"These":[116],"findings":[117],"suggest":[118],"models,":[121],"especially":[122],"bi-LSTM,":[126],"offer":[127],"alternative":[130],"CNNs":[132],"conversion.&lt;/p&gt;":[135]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-23T00:00:00"}
