{"id":"https://openalex.org/W4391021529","doi":"https://doi.org/10.1109/asru57964.2023.10389745","title":"Permod: Perceptually Grounded Voice Modification With Latent Diffusion Models","display_name":"Permod: Perceptually Grounded Voice Modification With Latent Diffusion Models","publication_year":2023,"publication_date":"2023-12-16","ids":{"openalex":"https://openalex.org/W4391021529","doi":"https://doi.org/10.1109/asru57964.2023.10389745"},"language":"en","primary_location":{"id":"doi:10.1109/asru57964.2023.10389745","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru57964.2023.10389745","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2023 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5093013842","display_name":"Robin Netzorg","orcid":null},"institutions":[{"id":"https://openalex.org/I95457486","display_name":"University of California, Berkeley","ror":"https://ror.org/01an7q238","country_code":"US","type":"education","lineage":["https://openalex.org/I95457486"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Robin Netzorg","raw_affiliation_strings":["University of California,Berkeley","University of California, Berkeley"],"affiliations":[{"raw_affiliation_string":"University of California,Berkeley","institution_ids":["https://openalex.org/I95457486"]},{"raw_affiliation_string":"University of California, Berkeley","institution_ids":["https://openalex.org/I95457486"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5083137649","display_name":"Ajil Jalal","orcid":"https://orcid.org/0009-0006-9244-8575"},"institutions":[{"id":"https://openalex.org/I95457486","display_name":"University of California, Berkeley","ror":"https://ror.org/01an7q238","country_code":"US","type":"education","lineage":["https://openalex.org/I95457486"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ajil Jalal","raw_affiliation_strings":["University of California,Berkeley","University of California, Berkeley"],"affiliations":[{"raw_affiliation_string":"University of California,Berkeley","institution_ids":["https://openalex.org/I95457486"]},{"raw_affiliation_string":"University of California, Berkeley","institution_ids":["https://openalex.org/I95457486"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5093013843","display_name":"Luna McNulty","orcid":null},"institutions":[{"id":"https://openalex.org/I175594653","display_name":"John Brown University","ror":"https://ror.org/02ct41q97","country_code":"US","type":"education","lineage":["https://openalex.org/I175594653"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Luna McNulty","raw_affiliation_strings":["Brown University"],"affiliations":[{"raw_affiliation_string":"Brown University","institution_ids":["https://openalex.org/I175594653"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5068922218","display_name":"Gopala K. Anumanchipalli","orcid":"https://orcid.org/0000-0002-9714-7740"},"institutions":[{"id":"https://openalex.org/I95457486","display_name":"University of California, Berkeley","ror":"https://ror.org/01an7q238","country_code":"US","type":"education","lineage":["https://openalex.org/I95457486"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Gopala Krishna Anumanchipalli","raw_affiliation_strings":["University of California,Berkeley","University of California, Berkeley"],"affiliations":[{"raw_affiliation_string":"University of California,Berkeley","institution_ids":["https://openalex.org/I95457486"]},{"raw_affiliation_string":"University of California, Berkeley","institution_ids":["https://openalex.org/I95457486"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5093013842"],"corresponding_institution_ids":["https://openalex.org/I95457486"],"apc_list":null,"apc_paid":null,"fwci":0.3497,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.67879157,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":95,"max":96},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"8"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10863","display_name":"Voice and Speech Disorders","score":0.9991000294685364,"subfield":{"id":"https://openalex.org/subfields/2737","display_name":"Physiology"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.7864865660667419},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6838998794555664},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6792933940887451},{"id":"https://openalex.org/keywords/sentence","display_name":"Sentence","score":0.5797833800315857},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.5151413679122925},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.47395801544189453},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3501344621181488},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.16092297434806824},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.11752155423164368}],"concepts":[{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.7864865660667419},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6838998794555664},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6792933940887451},{"id":"https://openalex.org/C2777530160","wikidata":"https://www.wikidata.org/wiki/Q41796","display_name":"Sentence","level":2,"score":0.5797833800315857},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.5151413679122925},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.47395801544189453},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3501344621181488},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.16092297434806824},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.11752155423164368},{"id":"https://openalex.org/C111472728","wikidata":"https://www.wikidata.org/wiki/Q9471","display_name":"Epistemology","level":1,"score":0.0},{"id":"https://openalex.org/C169760540","wikidata":"https://www.wikidata.org/wiki/Q207011","display_name":"Neuroscience","level":1,"score":0.0},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/asru57964.2023.10389745","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru57964.2023.10389745","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2023 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320334111","display_name":"Innovation Fund","ror":null}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":35,"referenced_works":["https://openalex.org/W1974031554","https://openalex.org/W1994997260","https://openalex.org/W2011585205","https://openalex.org/W2058411003","https://openalex.org/W2082927806","https://openalex.org/W2176409853","https://openalex.org/W2315535101","https://openalex.org/W2330562562","https://openalex.org/W2404317780","https://openalex.org/W2959300817","https://openalex.org/W3036167779","https://openalex.org/W3048244524","https://openalex.org/W3092028330","https://openalex.org/W3093575194","https://openalex.org/W3110257065","https://openalex.org/W3168867926","https://openalex.org/W3207575373","https://openalex.org/W4221147462","https://openalex.org/W4225329057","https://openalex.org/W4287029002","https://openalex.org/W4287083626","https://openalex.org/W4312933868","https://openalex.org/W4318752004","https://openalex.org/W4362598673","https://openalex.org/W4365503441","https://openalex.org/W4383957026","https://openalex.org/W4387195417","https://openalex.org/W4392903423","https://openalex.org/W6765775151","https://openalex.org/W6779823529","https://openalex.org/W6786375611","https://openalex.org/W6798447524","https://openalex.org/W6799926150","https://openalex.org/W6803078664","https://openalex.org/W6851290578"],"related_works":["https://openalex.org/W2375873920","https://openalex.org/W2146114872","https://openalex.org/W2392060890","https://openalex.org/W2392760275","https://openalex.org/W2083530853","https://openalex.org/W2009831055","https://openalex.org/W2393172683","https://openalex.org/W3211744874","https://openalex.org/W1994626569","https://openalex.org/W2368686738"],"abstract_inverted_index":{"Perceptual":[0],"modification":[1],"of":[2,56],"voice":[3,41,89,98,111],"is":[4,21],"an":[5,12,87],"elusive":[6],"goal.":[7],"While":[8],"non-experts":[9],"can":[10],"modify":[11,27,61,65],"image":[13],"or":[14],"sentence":[15],"perceptually":[16],"with":[17,99,121,135],"available":[18],"tools,":[19],"it":[20,36],"not":[22],"clear":[23],"how":[24,63],"to":[25,38,42,60,64,113],"similarly":[26],"speech":[28],"along":[29],"perceptual":[30,58,72,92,102,115,118,138],"axes.":[31],"Voice":[32],"conversion":[33],"does":[34],"make":[35],"possible":[37],"convert":[39],"one":[40],"another,":[43],"but":[44,143],"these":[45],"modifications":[46],"are":[47,67],"handled":[48],"by":[49],"black":[50],"box":[51],"models,":[52],"and":[53,62,90,95,126],"the":[54,100,136],"specifics":[55],"what":[57],"qualities":[59,93,139],"them":[66],"unclear.":[68],"Towards":[69],"allowing":[70],"greater":[71],"control":[73],"over":[74],"voice,":[75],"we":[76,129],"introduce":[77],"PerMod,":[78],"a":[79,91,97,109],"conditional":[80],"latent":[81],"diffusion":[82],"model":[83],"that":[84,131],"takes":[85],"in":[86],"input":[88],"vector,":[94],"produces":[96,133],"matching":[101],"qualities.":[103],"Unlike":[104],"prior":[105],"work,":[106],"PerMod":[107,132],"generates":[108],"new":[110],"corresponding":[112],"specific":[114],"modifications.":[116],"Evaluating":[117],"quality":[119],"vectors":[120],"RMSE":[122],"from":[123],"both":[124],"human":[125],"predicted":[127],"labels,":[128],"demonstrate":[130],"voices":[134],"desired":[137],"for":[140],"typical":[141],"voices,":[142],"performs":[144],"poorly":[145],"on":[146],"atypical":[147],"voices.":[148]},"counts_by_year":[{"year":2025,"cited_by_count":2}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
