{"id":"https://openalex.org/W4224939064","doi":"https://doi.org/10.21437/odyssey.2022-19","title":"Improving the Naturalness of Simulated Conversations for End-to-End Neural Diarization","display_name":"Improving the Naturalness of Simulated Conversations for End-to-End Neural Diarization","publication_year":2022,"publication_date":"2022-06-17","ids":{"openalex":"https://openalex.org/W4224939064","doi":"https://doi.org/10.21437/odyssey.2022-19"},"language":"en","primary_location":{"id":"doi:10.21437/odyssey.2022-19","is_oa":false,"landing_page_url":"https://doi.org/10.21437/odyssey.2022-19","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"The Speaker and Language Recognition Workshop (Odyssey 2022)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5018847693","display_name":"Natsuo Yamashita","orcid":null},"institutions":[{"id":"https://openalex.org/I74801974","display_name":"The University of Tokyo","ror":"https://ror.org/057zh3y96","country_code":"JP","type":"education","lineage":["https://openalex.org/I74801974"]}],"countries":["JP"],"is_corresponding":true,"raw_author_name":"Natsuo Yamashita","raw_affiliation_strings":["The University of Tokyo"],"affiliations":[{"raw_affiliation_string":"The University of Tokyo","institution_ids":["https://openalex.org/I74801974"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5026324656","display_name":"Shota Horiguchi","orcid":"https://orcid.org/0000-0002-3166-4956"},"institutions":[{"id":"https://openalex.org/I65143321","display_name":"Hitachi (Japan)","ror":"https://ror.org/02exqgm79","country_code":"JP","type":"company","lineage":["https://openalex.org/I65143321"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Shota Horiguchi","raw_affiliation_strings":["Hitachi, Ltd"],"affiliations":[{"raw_affiliation_string":"Hitachi, Ltd","institution_ids":["https://openalex.org/I65143321"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5068174033","display_name":"Takeshi Homma","orcid":"https://orcid.org/0000-0003-3864-3848"},"institutions":[{"id":"https://openalex.org/I65143321","display_name":"Hitachi (Japan)","ror":"https://ror.org/02exqgm79","country_code":"JP","type":"company","lineage":["https://openalex.org/I65143321"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Takeshi Homma","raw_affiliation_strings":["Hitachi, Ltd"],"affiliations":[{"raw_affiliation_string":"Hitachi, Ltd","institution_ids":["https://openalex.org/I65143321"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5018847693"],"corresponding_institution_ids":["https://openalex.org/I74801974"],"apc_list":null,"apc_paid":null,"fwci":2.2227,"has_fulltext":false,"cited_by_count":16,"citation_normalized_percentile":{"value":0.89319511,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":94,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"133","last_page":"140"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9987999796867371,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9987999796867371,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9947999715805054,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.993399977684021,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/naturalness","display_name":"Naturalness","score":0.9135615825653076},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6279503107070923},{"id":"https://openalex.org/keywords/end-to-end-principle","display_name":"End-to-end principle","score":0.5183461904525757},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.3698023557662964},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.24904894828796387},{"id":"https://openalex.org/keywords/physics","display_name":"Physics","score":0.11435583233833313},{"id":"https://openalex.org/keywords/particle-physics","display_name":"Particle physics","score":0.0796157717704773}],"concepts":[{"id":"https://openalex.org/C134537474","wikidata":"https://www.wikidata.org/wiki/Q17144832","display_name":"Naturalness","level":2,"score":0.9135615825653076},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6279503107070923},{"id":"https://openalex.org/C74296488","wikidata":"https://www.wikidata.org/wiki/Q2527392","display_name":"End-to-end principle","level":2,"score":0.5183461904525757},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.3698023557662964},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.24904894828796387},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.11435583233833313},{"id":"https://openalex.org/C109214941","wikidata":"https://www.wikidata.org/wiki/Q18334","display_name":"Particle physics","level":1,"score":0.0796157717704773}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.21437/odyssey.2022-19","is_oa":false,"landing_page_url":"https://doi.org/10.21437/odyssey.2022-19","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"The Speaker and Language Recognition Workshop (Odyssey 2022)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.699999988079071,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":48,"referenced_works":["https://openalex.org/W1522301498","https://openalex.org/W1965819578","https://openalex.org/W2001292406","https://openalex.org/W2038101708","https://openalex.org/W2083751884","https://openalex.org/W2146486095","https://openalex.org/W2219249508","https://openalex.org/W2221409856","https://openalex.org/W2404126548","https://openalex.org/W2516631658","https://openalex.org/W2585455474","https://openalex.org/W2596035415","https://openalex.org/W2638067502","https://openalex.org/W2696967604","https://openalex.org/W2803322398","https://openalex.org/W2884797218","https://openalex.org/W2896538040","https://openalex.org/W2936774411","https://openalex.org/W2952752702","https://openalex.org/W2963477857","https://openalex.org/W2964391865","https://openalex.org/W2972541922","https://openalex.org/W2972949456","https://openalex.org/W2973730722","https://openalex.org/W2981087920","https://openalex.org/W3006898571","https://openalex.org/W3008283340","https://openalex.org/W3008357631","https://openalex.org/W3015191643","https://openalex.org/W3015746570","https://openalex.org/W3016232124","https://openalex.org/W3016244460","https://openalex.org/W3027008958","https://openalex.org/W3095212884","https://openalex.org/W3097777922","https://openalex.org/W3105031100","https://openalex.org/W3115628053","https://openalex.org/W3145204487","https://openalex.org/W3162354890","https://openalex.org/W3178462146","https://openalex.org/W3197916665","https://openalex.org/W3212886388","https://openalex.org/W4220731890","https://openalex.org/W4248634141","https://openalex.org/W4283014282","https://openalex.org/W4287123283","https://openalex.org/W4288083483","https://openalex.org/W4385245566"],"related_works":["https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2029561777","https://openalex.org/W1554502231","https://openalex.org/W172797710","https://openalex.org/W2945105049","https://openalex.org/W4387098302","https://openalex.org/W3165080709","https://openalex.org/W2948317131","https://openalex.org/W4288365855"],"abstract_inverted_index":{"This":[0],"paper":[1],"investigates":[2],"a":[3,32,71],"method":[4,72,91,114],"for":[5],"simulating":[6],"natural":[7,75,108],"conversation":[8],"in":[9,51,125],"the":[10,19,42,52,85,122,128,139,146,152],"model":[11],"training":[12,53],"of":[13,21,54,87,99,127,154],"end-to-end":[14],"neural":[15],"diarization":[16,137],"(EEND).Due":[17],"to":[18,41,73,79,106,117,121,150],"lack":[20],"any":[22],"annotated":[23],"real":[24,44,123],"conversational":[25,35,76],"dataset,":[26],"EEND":[27],"is":[28],"usually":[29],"pretrained":[30],"on":[31,135],"large-scale":[33],"simulated":[34,111,147],"dataset":[36,110,124,148],"first":[37],"and":[38,102,130,141],"then":[39],"adapted":[40],"target":[43],"dataset.Simulated":[45],"datasets":[46,143],"play":[47],"an":[48,65],"essential":[49],"role":[50],"EEND,":[55],"but":[56],"as":[57],"yet":[58],"there":[59],"has":[60],"been":[61],"insufficient":[62],"investigation":[63],"into":[64,94],"optimal":[66],"simulation":[67],"method.We":[68],"thus":[69],"propose":[70],"simulate":[74,107],"speech.In":[77],"contrast":[78],"conventional":[80],"methods,":[81],"which":[82],"simply":[83],"combine":[84],"speech":[86],"multiple":[88],"speakers,":[89],"our":[90,113],"takes":[92],"turn-taking":[93],"account.We":[95],"define":[96],"four":[97],"types":[98],"speaker":[100],"transition":[101],"sequentially":[103],"arrange":[104],"them":[105],"conversations.The":[109],"using":[112,138],"was":[115],"found":[116],"be":[118],"statistically":[119],"similar":[120],"terms":[126],"silence":[129],"overlap":[131],"ratios.The":[132],"experimental":[133],"results":[134],"two-speaker":[136],"CALLHOME":[140],"CSJ":[142],"showed":[144],"that":[145],"contributes":[149],"improving":[151],"performance":[153],"EEND.":[155]},"counts_by_year":[{"year":2025,"cited_by_count":6},{"year":2024,"cited_by_count":5},{"year":2023,"cited_by_count":2},{"year":2022,"cited_by_count":3}],"updated_date":"2026-04-03T22:45:19.894376","created_date":"2025-10-10T00:00:00"}
