{"id":"https://openalex.org/W4304142059","doi":"https://doi.org/10.1145/3503161.3547728","title":"Seeing Speech: Magnetic Resonance Imaging-Based Vocal Tract Deformation Visualization Using Cross-Modal Transformer","display_name":"Seeing Speech: Magnetic Resonance Imaging-Based Vocal Tract Deformation Visualization Using Cross-Modal Transformer","publication_year":2022,"publication_date":"2022-10-10","ids":{"openalex":"https://openalex.org/W4304142059","doi":"https://doi.org/10.1145/3503161.3547728"},"language":"en","primary_location":{"id":"doi:10.1145/3503161.3547728","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3503161.3547728","pdf_url":null,"source":{"id":"https://openalex.org/S4363608757","display_name":"Proceedings of the 30th ACM International Conference on Multimedia","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 30th ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5013340793","display_name":"Kele Xu","orcid":"https://orcid.org/0000-0001-5997-5169"},"institutions":[{"id":"https://openalex.org/I170215575","display_name":"National University of Defense Technology","ror":"https://ror.org/05d2yfz11","country_code":"CN","type":"education","lineage":["https://openalex.org/I170215575"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Kele Xu","raw_affiliation_strings":["National University of Defense Technology, Changsha, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"National University of Defense Technology, Changsha, China","institution_ids":["https://openalex.org/I170215575"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5026070862","display_name":"Ming Feng","orcid":"https://orcid.org/0000-0001-9943-5941"},"institutions":[{"id":"https://openalex.org/I116953780","display_name":"Tongji University","ror":"https://ror.org/03rc6as71","country_code":"CN","type":"education","lineage":["https://openalex.org/I116953780"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ming Feng","raw_affiliation_strings":["Tongji University, Shanghai, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tongji University, Shanghai, China","institution_ids":["https://openalex.org/I116953780"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101933776","display_name":"Weiquan Huang","orcid":"https://orcid.org/0000-0002-3780-9343"},"institutions":[{"id":"https://openalex.org/I116953780","display_name":"Tongji University","ror":"https://ror.org/03rc6as71","country_code":"CN","type":"education","lineage":["https://openalex.org/I116953780"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Weiquan Huang","raw_affiliation_strings":["Tongji University, Shanghai, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tongji University, Shanghai, China","institution_ids":["https://openalex.org/I116953780"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.4842,"has_fulltext":false,"cited_by_count":5,"citation_normalized_percentile":{"value":0.59930752,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":94,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"6947","last_page":"6949"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9991000294685364,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7390488386154175},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.6436862945556641},{"id":"https://openalex.org/keywords/vocal-tract","display_name":"Vocal tract","score":0.6407976150512695},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.5128793716430664},{"id":"https://openalex.org/keywords/modal","display_name":"Modal","score":0.4703004062175751},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.4134013056755066},{"id":"https://openalex.org/keywords/acoustics","display_name":"Acoustics","score":0.38816404342651367},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.341496080160141},{"id":"https://openalex.org/keywords/voltage","display_name":"Voltage","score":0.17798328399658203},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.15168339014053345},{"id":"https://openalex.org/keywords/physics","display_name":"Physics","score":0.101909339427948}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7390488386154175},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.6436862945556641},{"id":"https://openalex.org/C47401133","wikidata":"https://www.wikidata.org/wiki/Q748953","display_name":"Vocal tract","level":2,"score":0.6407976150512695},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.5128793716430664},{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.4703004062175751},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4134013056755066},{"id":"https://openalex.org/C24890656","wikidata":"https://www.wikidata.org/wiki/Q82811","display_name":"Acoustics","level":1,"score":0.38816404342651367},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.341496080160141},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.17798328399658203},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.15168339014053345},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.101909339427948},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C188027245","wikidata":"https://www.wikidata.org/wiki/Q750446","display_name":"Polymer chemistry","level":1,"score":0.0},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3503161.3547728","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3503161.3547728","pdf_url":null,"source":{"id":"https://openalex.org/S4363608757","display_name":"Proceedings of the 30th ACM International Conference on Multimedia","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 30th ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":6,"referenced_works":["https://openalex.org/W1901129140","https://openalex.org/W2008120082","https://openalex.org/W2099474155","https://openalex.org/W2400962265","https://openalex.org/W2917649276","https://openalex.org/W4244175094"],"related_works":["https://openalex.org/W2079194684","https://openalex.org/W2617269004","https://openalex.org/W35292311","https://openalex.org/W1748856376","https://openalex.org/W2046073792","https://openalex.org/W4254341835","https://openalex.org/W1591384192","https://openalex.org/W2099204336","https://openalex.org/W2067459736","https://openalex.org/W2105635394"],"abstract_inverted_index":{"As":[0],"an":[1,116],"essential":[2],"component":[3],"to":[4,16,37,63,123],"advance":[5],"speech":[6,10,146],"science,":[7],"understanding":[8,19],"of":[9,20,25,103],"production":[11],"can":[12,99,120,135],"be":[13,121,137],"greatly":[14],"helpful":[15,138],"improve":[17],"our":[18,61,133],"motor":[21],"control,":[22],"dynamical":[23],"systems":[24],"humans":[26],"during":[27],"natural":[28],"speech.":[29],"Different":[30],"medical":[31],"imaging":[32,46],"modalities":[33],"have":[34],"been":[35],"leveraged":[36],"visualize":[38,64,124],"the":[39,65,70,73,76,82,101,104,110,125,128],"dynamic":[40],"process,":[41],"in":[42,139],"which":[43,119],"Magnetic":[44],"resonance":[45],"(MRI)":[47],"provides":[48],"a":[49,85,90],"valuable":[50],"tool":[51],"for":[52,142],"evaluating":[53],"static":[54],"postures.":[55],"In":[56],"this":[57],"demo,":[58],"we":[59,98,114],"present":[60,115],"solution":[62,134],"vocal":[66,105],"tract":[67,106],"deformation,":[68],"leveraging":[69],"correlation":[71],"between":[72],"MRI":[74],"and":[75,89,148],"acoustical":[77,111],"signals.":[78,112],"We":[79,131],"first":[80],"formulate":[81],"problem":[83],"as":[84],"cross-modal":[86,92],"prediction":[87],"task":[88],"novel":[91],"Transformer":[93],"network":[94],"is":[95],"proposed.":[96],"Thus,":[97],"infer":[100],"deformation":[102,126],"by":[107],"only":[108],"utilizing":[109,127],"Then,":[113],"interactive":[117],"framework,":[118],"used":[122],"aforementioned":[129],"network.":[130],"hope":[132],"also":[136],"pronunciation":[140],"training":[141],"children":[143],"with":[144],"sound":[145],"disorders":[147],"second":[149],"language":[150],"learning.":[151]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":2}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
