{"id":"https://openalex.org/W4389282358","doi":"https://doi.org/10.1145/3623809.3623837","title":"Multimodal Voice Activity Prediction: Turn-taking Events Detection in Expert-Novice Conversation","display_name":"Multimodal Voice Activity Prediction: Turn-taking Events Detection in Expert-Novice Conversation","publication_year":2023,"publication_date":"2023-12-03","ids":{"openalex":"https://openalex.org/W4389282358","doi":"https://doi.org/10.1145/3623809.3623837"},"language":"en","primary_location":{"id":"doi:10.1145/3623809.3623837","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3623809.3623837","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"International Conference on Human-Agent Interaction","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5088110872","display_name":"K Onishi","orcid":"https://orcid.org/0009-0005-2807-2916"},"institutions":[{"id":"https://openalex.org/I75917431","display_name":"Nara Institute of Science and Technology","ror":"https://ror.org/05bhada84","country_code":"JP","type":"education","lineage":["https://openalex.org/I75917431"]}],"countries":["JP"],"is_corresponding":true,"raw_author_name":"Kazuyo Onishi","raw_affiliation_strings":["Nara Institute of Science and Technology, Japan"],"raw_orcid":"https://orcid.org/0009-0005-2807-2916","affiliations":[{"raw_affiliation_string":"Nara Institute of Science and Technology, Japan","institution_ids":["https://openalex.org/I75917431"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5043099974","display_name":"Hiroki Tanaka","orcid":"https://orcid.org/0000-0002-0548-6252"},"institutions":[{"id":"https://openalex.org/I75917431","display_name":"Nara Institute of Science and Technology","ror":"https://ror.org/05bhada84","country_code":"JP","type":"education","lineage":["https://openalex.org/I75917431"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Hiroki Tanaka","raw_affiliation_strings":["Nara Institute of Science and Technology, Japan"],"raw_orcid":"https://orcid.org/0000-0002-0548-6252","affiliations":[{"raw_affiliation_string":"Nara Institute of Science and Technology, Japan","institution_ids":["https://openalex.org/I75917431"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5020994673","display_name":"Satoshi Nakamura","orcid":"https://orcid.org/0000-0001-6956-3803"},"institutions":[{"id":"https://openalex.org/I75917431","display_name":"Nara Institute of Science and Technology","ror":"https://ror.org/05bhada84","country_code":"JP","type":"education","lineage":["https://openalex.org/I75917431"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Satoshi Nakamura","raw_affiliation_strings":["Nara Institute of Science and Technology, Japan"],"raw_orcid":"https://orcid.org/0000-0001-6956-3803","affiliations":[{"raw_affiliation_string":"Nara Institute of Science and Technology, Japan","institution_ids":["https://openalex.org/I75917431"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5088110872"],"corresponding_institution_ids":["https://openalex.org/I75917431"],"apc_list":null,"apc_paid":null,"fwci":1.3633,"has_fulltext":false,"cited_by_count":8,"citation_normalized_percentile":{"value":0.8520674,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":95,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"13","last_page":"21"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.9991999864578247,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.9991999864578247,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9909999966621399,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10709","display_name":"Social Robot Interaction and HRI","score":0.9907000064849854,"subfield":{"id":"https://openalex.org/subfields/3207","display_name":"Social Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7394499778747559},{"id":"https://openalex.org/keywords/conversation","display_name":"Conversation","score":0.7235637903213501},{"id":"https://openalex.org/keywords/gaze","display_name":"Gaze","score":0.6445317268371582},{"id":"https://openalex.org/keywords/baseline","display_name":"Baseline (sea)","score":0.6084888577461243},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5686455368995667},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.4518550932407379},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.44117027521133423},{"id":"https://openalex.org/keywords/turn-taking","display_name":"Turn-taking","score":0.4356379508972168},{"id":"https://openalex.org/keywords/computational-model","display_name":"Computational model","score":0.4307246208190918},{"id":"https://openalex.org/keywords/predictive-modelling","display_name":"Predictive modelling","score":0.4218888282775879},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.34611403942108154},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.31741863489151},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.1430329978466034},{"id":"https://openalex.org/keywords/communication","display_name":"Communication","score":0.12648215889930725},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.10503551363945007}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7394499778747559},{"id":"https://openalex.org/C2777200299","wikidata":"https://www.wikidata.org/wiki/Q52943","display_name":"Conversation","level":2,"score":0.7235637903213501},{"id":"https://openalex.org/C2779916870","wikidata":"https://www.wikidata.org/wiki/Q14467155","display_name":"Gaze","level":2,"score":0.6445317268371582},{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.6084888577461243},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5686455368995667},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.4518550932407379},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.44117027521133423},{"id":"https://openalex.org/C2776352735","wikidata":"https://www.wikidata.org/wiki/Q2313343","display_name":"Turn-taking","level":3,"score":0.4356379508972168},{"id":"https://openalex.org/C66024118","wikidata":"https://www.wikidata.org/wiki/Q1122506","display_name":"Computational model","level":2,"score":0.4307246208190918},{"id":"https://openalex.org/C45804977","wikidata":"https://www.wikidata.org/wiki/Q7239673","display_name":"Predictive modelling","level":2,"score":0.4218888282775879},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.34611403942108154},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.31741863489151},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.1430329978466034},{"id":"https://openalex.org/C46312422","wikidata":"https://www.wikidata.org/wiki/Q11024","display_name":"Communication","level":1,"score":0.12648215889930725},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.10503551363945007},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.0},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.0},{"id":"https://openalex.org/C111368507","wikidata":"https://www.wikidata.org/wiki/Q43518","display_name":"Oceanography","level":1,"score":0.0},{"id":"https://openalex.org/C127313418","wikidata":"https://www.wikidata.org/wiki/Q1069","display_name":"Geology","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3623809.3623837","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3623809.3623837","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"International Conference on Human-Agent Interaction","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.800000011920929,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[{"id":"https://openalex.org/G4503388090","display_name":null,"funder_award_id":"JPMJCR19A","funder_id":"https://openalex.org/F4320338075","funder_display_name":"Core Research for Evolutional Science and Technology"}],"funders":[{"id":"https://openalex.org/F4320338075","display_name":"Core Research for Evolutional Science and Technology","ror":"https://ror.org/00097mb19"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":33,"referenced_works":["https://openalex.org/W68208136","https://openalex.org/W348317515","https://openalex.org/W1508665521","https://openalex.org/W2008986570","https://openalex.org/W2056953935","https://openalex.org/W2063643001","https://openalex.org/W2100835029","https://openalex.org/W2100844198","https://openalex.org/W2100919116","https://openalex.org/W2105422377","https://openalex.org/W2110719102","https://openalex.org/W2144499799","https://openalex.org/W2275246416","https://openalex.org/W2492441051","https://openalex.org/W2515585303","https://openalex.org/W2610815425","https://openalex.org/W2747602812","https://openalex.org/W2755291975","https://openalex.org/W2786387151","https://openalex.org/W2807126412","https://openalex.org/W2888997666","https://openalex.org/W2901309320","https://openalex.org/W2914243039","https://openalex.org/W2962730651","https://openalex.org/W2973172454","https://openalex.org/W2980823180","https://openalex.org/W3112188842","https://openalex.org/W3196984997","https://openalex.org/W3199141871","https://openalex.org/W4297841632","https://openalex.org/W4306752156","https://openalex.org/W4319862469","https://openalex.org/W4323270439"],"related_works":["https://openalex.org/W1880689012","https://openalex.org/W3014378845","https://openalex.org/W2807877245","https://openalex.org/W2892813288","https://openalex.org/W2170584777","https://openalex.org/W2141666434","https://openalex.org/W4226330303","https://openalex.org/W2789189004","https://openalex.org/W2029883708","https://openalex.org/W2118699342"],"abstract_inverted_index":{"Predicting":[0],"the":[1,20,28,98,103,121,136,139,142,164,181],"timing":[2],"of":[3,30,105,141,183],"utterances":[4],"in":[5,50,176,189],"dyadic":[6,115,190],"conversations":[7],"is":[8],"essential":[9],"for":[10,185],"achieving":[11],"natural":[12],"interactions":[13],"between":[14],"humans":[15],"and":[16,69,89,119,148,166],"virtual":[17],"agents.":[18],"Since":[19],"former":[21],"often":[22],"use":[23],"non-verbal":[24,40,81,106,174],"cues":[25,107],"to":[26,101,163],"adjust":[27],"order":[29],"their":[31],"speech,":[32],"this":[33,76],"study":[34,170],"proposes":[35],"a":[36,43,57,114],"multimodal":[37,95],"model":[38,59,96,100,126],"incorporating":[39],"features":[41,63,137,175],"using":[42],"Transformer-based":[44,177],"voice":[45,66,70,109,187],"activity":[46,67,71,110,188],"prediction":[47,145,150],"model.":[48],"First,":[49],"line":[51],"with":[52,97,134],"previous":[53],"research,":[54],"we":[55,79],"reproduced":[56],"baseline":[58,77,99],"that":[60,130,172],"utilized":[61,113],"audio":[62],"(audio":[64],"waveform,":[65],"frame,":[68],"history)":[72],"as":[73],"inputs.":[74],"To":[75],"model,":[78],"added":[80],"features:":[82],"gaze":[83],"direction,":[84],"action":[85,158],"units,":[86],"head":[87],"pose,":[88],"articular":[90],"points.":[91],"We":[92,112],"compared":[93],"our":[94,131],"investigate":[102],"impact":[104],"on":[108],"prediction.":[111],"expert-novice":[116],"conversation":[117],"dataset":[118],"evaluated":[120],"average":[122],"outcomes":[123],"across":[124],"ten":[125],"trainings.":[127],"Results":[128],"revealed":[129],"proposed":[132],"models":[133,179,184],"all":[135],"improved":[138],"accuracy":[140],"next":[143],"speaker":[144],"by":[146,151],"2.3%":[147],"back-channeled":[149,167],"1.8%":[152],"(p-value":[153],"<":[154],"0.025).":[155],"In":[156],"particular,":[157],"units":[159],"may":[160],"contribute":[161],"significantly":[162],"turn-shift":[165],"predictions.":[168],"This":[169],"demonstrates":[171],"including":[173],"turn-taking":[178],"enhances":[180],"efficacy":[182],"predicting":[186],"conversations.":[191]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":5}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
