{"id":"https://openalex.org/W4403317954","doi":"https://doi.org/10.1145/3672539.3686738","title":"Real-Time Word-Level Temporal Segmentation in Streaming Speech Recognition","display_name":"Real-Time Word-Level Temporal Segmentation in Streaming Speech Recognition","publication_year":2024,"publication_date":"2024-10-11","ids":{"openalex":"https://openalex.org/W4403317954","doi":"https://doi.org/10.1145/3672539.3686738"},"language":"en","primary_location":{"id":"doi:10.1145/3672539.3686738","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3672539.3686738","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"The 37th Annual ACM Symposium on User Interface Software and Technology","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2504.10849","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5006700923","display_name":"N. Nishida","orcid":"https://orcid.org/0000-0001-9966-4664"},"institutions":[{"id":"https://openalex.org/I74801974","display_name":"The University of Tokyo","ror":"https://ror.org/057zh3y96","country_code":"JP","type":"education","lineage":["https://openalex.org/I74801974"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Naoto Nishida","raw_affiliation_strings":["Ishiguro Lab, the University of Tokyo, Japan"],"raw_orcid":"https://orcid.org/0000-0001-9966-4664","affiliations":[{"raw_affiliation_string":"Ishiguro Lab, the University of Tokyo, Japan","institution_ids":["https://openalex.org/I74801974"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5052724008","display_name":"Hirotaka Hiraki","orcid":"https://orcid.org/0000-0002-6543-4593"},"institutions":[{"id":"https://openalex.org/I73613424","display_name":"National Institute of Advanced Industrial Science and Technology","ror":"https://ror.org/01703db54","country_code":"JP","type":"government","lineage":["https://openalex.org/I73613424"]},{"id":"https://openalex.org/I74801974","display_name":"The University of Tokyo","ror":"https://ror.org/057zh3y96","country_code":"JP","type":"education","lineage":["https://openalex.org/I74801974"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Hirotaka Hiraki","raw_affiliation_strings":["Rekimoto Lab, the University of Tokyo, Japan and National Institute of Advanced Industrial Science and Technology, Japan"],"raw_orcid":"https://orcid.org/0000-0002-6543-4593","affiliations":[{"raw_affiliation_string":"Rekimoto Lab, the University of Tokyo, Japan and National Institute of Advanced Industrial Science and Technology, Japan","institution_ids":["https://openalex.org/I73613424","https://openalex.org/I74801974"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5082649952","display_name":"Jun Rekimoto","orcid":"https://orcid.org/0000-0002-3629-2514"},"institutions":[{"id":"https://openalex.org/I4210122684","display_name":"Sony Computer Science Laboratories","ror":"https://ror.org/02nc46417","country_code":"JP","type":"facility","lineage":["https://openalex.org/I4210122684"]},{"id":"https://openalex.org/I74801974","display_name":"The University of Tokyo","ror":"https://ror.org/057zh3y96","country_code":"JP","type":"education","lineage":["https://openalex.org/I74801974"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Jun Rekimoto","raw_affiliation_strings":["Rekimoto Lab, the University of Tokyo, Japan and Sony CSL Kyoto, Japan"],"raw_orcid":"https://orcid.org/0000-0002-3629-2514","affiliations":[{"raw_affiliation_string":"Rekimoto Lab, the University of Tokyo, Japan and Sony CSL Kyoto, Japan","institution_ids":["https://openalex.org/I4210122684","https://openalex.org/I74801974"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5069105845","display_name":"Yoshio Ishiguro","orcid":"https://orcid.org/0000-0002-1781-6212"},"institutions":[{"id":"https://openalex.org/I74801974","display_name":"The University of Tokyo","ror":"https://ror.org/057zh3y96","country_code":"JP","type":"education","lineage":["https://openalex.org/I74801974"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Yoshio Ishiguro","raw_affiliation_strings":["Ishiguro Lab, the University of Tokyo, Japan"],"raw_orcid":"https://orcid.org/0000-0002-1781-6212","affiliations":[{"raw_affiliation_string":"Ishiguro Lab, the University of Tokyo, Japan","institution_ids":["https://openalex.org/I74801974"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.18140866,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"3"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9991999864578247,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8142011165618896},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6928228139877319},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.5204158425331116},{"id":"https://openalex.org/keywords/word","display_name":"Word (group theory)","score":0.4877227246761322},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4617922306060791},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.43817585706710815},{"id":"https://openalex.org/keywords/speech-segmentation","display_name":"Speech segmentation","score":0.4242815375328064},{"id":"https://openalex.org/keywords/text-segmentation","display_name":"Text segmentation","score":0.4160918593406677},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.06470674276351929}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8142011165618896},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6928228139877319},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.5204158425331116},{"id":"https://openalex.org/C90805587","wikidata":"https://www.wikidata.org/wiki/Q10944557","display_name":"Word (group theory)","level":2,"score":0.4877227246761322},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4617922306060791},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.43817585706710815},{"id":"https://openalex.org/C207030507","wikidata":"https://www.wikidata.org/wiki/Q2266173","display_name":"Speech segmentation","level":3,"score":0.4242815375328064},{"id":"https://openalex.org/C98501671","wikidata":"https://www.wikidata.org/wiki/Q1948408","display_name":"Text segmentation","level":3,"score":0.4160918593406677},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.06470674276351929},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1145/3672539.3686738","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3672539.3686738","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"The 37th Annual ACM Symposium on User Interface Software and Technology","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2504.10849","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2504.10849","pdf_url":"https://arxiv.org/pdf/2504.10849","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-nc-sa","license_id":"https://openalex.org/licenses/cc-by-nc-sa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2504.10849","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2504.10849","pdf_url":"https://arxiv.org/pdf/2504.10849","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-nc-sa","license_id":"https://openalex.org/licenses/cc-by-nc-sa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1063213209","display_name":null,"funder_award_id":"JPMJMS2012","funder_id":"https://openalex.org/F4320338247","funder_display_name":"Moonshot Research and Development Program"},{"id":"https://openalex.org/G4297759107","display_name":null,"funder_award_id":"JPJ012368C02901","funder_id":"https://openalex.org/F4320335839","funder_display_name":"National Institute of Information and Communications Technology"},{"id":"https://openalex.org/G6744148916","display_name":null,"funder_award_id":"JPMJCR17A3","funder_id":"https://openalex.org/F4320338075","funder_display_name":"Core Research for Evolutional Science and Technology"}],"funders":[{"id":"https://openalex.org/F4320335839","display_name":"National Institute of Information and Communications Technology","ror":"https://ror.org/016bgq349"},{"id":"https://openalex.org/F4320338075","display_name":"Core Research for Evolutional Science and Technology","ror":"https://ror.org/00097mb19"},{"id":"https://openalex.org/F4320338247","display_name":"Moonshot Research and Development Program","ror":null}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4403317954.pdf","grobid_xml":"https://content.openalex.org/works/W4403317954.grobid-xml"},"referenced_works_count":6,"referenced_works":["https://openalex.org/W4366547608","https://openalex.org/W4366588062","https://openalex.org/W4387793540","https://openalex.org/W4396832929","https://openalex.org/W4396833240","https://openalex.org/W4396833366"],"related_works":["https://openalex.org/W2514679778","https://openalex.org/W3026276030","https://openalex.org/W2105626703","https://openalex.org/W4396644016","https://openalex.org/W2978383222","https://openalex.org/W2172629291","https://openalex.org/W2380773642","https://openalex.org/W2037894725","https://openalex.org/W111933330","https://openalex.org/W2337707338"],"abstract_inverted_index":{"Rich-text":[0],"captions":[1],"are":[2],"essential":[3],"to":[4,29,50,88,106,156],"help":[5],"communication":[6],"for":[7],"Deaf":[8],"and":[9,15,37,57,165],"hard-of-hearing":[10],"(DHH)":[11],"people,":[12],"second-language":[13],"learners,":[14],"those":[16],"with":[17],"autism":[18],"spectrum":[19],"disorder":[20],"(ASD).":[21],"They":[22],"also":[23],"preserve":[24],"nuances":[25],"when":[26],"converting":[27],"speech":[28,40],"text,":[30],"enhancing":[31],"the":[32,48,60,64,74,79,95,98,110,119,123,142,158],"realism":[33],"of":[34,67,78,97,144],"presentation":[35],"scripts":[36],"conversation":[38],"or":[39,76],"logs.":[41],"However,":[42],"current":[43],"real-time":[44],"captioning":[45,167],"systems":[46],"lack":[47],"capability":[49],"alter":[51],"text":[52,120],"attributes":[53],"(ex.":[54],"capitalization,":[55],"sizes,":[56],"fonts)":[58],"at":[59,122],"word":[61,124,138],"level,":[62],"hindering":[63],"accurate":[65],"conveyance":[66],"speaker":[68],"intent":[69],"that":[70,117,136,152],"is":[71],"expressed":[72],"in":[73,126],"tones":[75],"intonations":[77],"speech.":[80],"For":[81],"example,":[82],"\u201cYOU":[83],"should":[84,102],"do":[85,103],"this\u201d":[86],"tends":[87,105],"be":[89,107],"considered":[90],"as":[91,94,109],"indicating":[92],"\u201cYou\u201d":[93],"focus":[96],"sentence,":[99],"whereas":[100],"\u201cYou":[101],"THIS\u201d":[104],"\u201cThis\u201d":[108],"focus.":[111],"This":[112],"paper":[113],"proposes":[114],"a":[115,130,162],"solution":[116],"changes":[118],"decorations":[121],"level":[125],"real":[127],"time.":[128],"As":[129],"prototype,":[131],"we":[132],"developed":[133],"an":[134],"application":[135],"adjusts":[137],"size":[139],"based":[140],"on":[141],"loudness":[143],"each":[145],"spoken":[146],"word.":[147],"Feedback":[148],"from":[149],"users":[150],"implies":[151],"this":[153],"system":[154],"helped":[155],"convey":[157],"speaker\u2019s":[159],"intent,":[160],"offering":[161],"more":[163],"engaging":[164],"accessible":[166],"experience.":[168]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
