{"id":"https://openalex.org/W4392903700","doi":"https://doi.org/10.1109/icassp48485.2024.10445977","title":"STYLECAP: Automatic Speaking-Style Captioning from Speech Based on Speech and Language Self-Supervised Learning Models","display_name":"STYLECAP: Automatic Speaking-Style Captioning from Speech Based on Speech and Language Self-Supervised Learning Models","publication_year":2024,"publication_date":"2024-03-18","ids":{"openalex":"https://openalex.org/W4392903700","doi":"https://doi.org/10.1109/icassp48485.2024.10445977"},"language":"en","primary_location":{"id":"doi:10.1109/icassp48485.2024.10445977","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10445977","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5108574971","display_name":"Kazuki Yamauchi","orcid":null},"institutions":[{"id":"https://openalex.org/I74801974","display_name":"The University of Tokyo","ror":"https://ror.org/057zh3y96","country_code":"JP","type":"education","lineage":["https://openalex.org/I74801974"]}],"countries":["JP"],"is_corresponding":true,"raw_author_name":"Kazuki Yamauchi","raw_affiliation_strings":["The University of Tokyo,Japan","The University of Tokyo, Japan"],"affiliations":[{"raw_affiliation_string":"The University of Tokyo,Japan","institution_ids":["https://openalex.org/I74801974"]},{"raw_affiliation_string":"The University of Tokyo, Japan","institution_ids":["https://openalex.org/I74801974"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5068604686","display_name":"Yusuke Ijima","orcid":null},"institutions":[{"id":"https://openalex.org/I2251713219","display_name":"NTT (Japan)","ror":"https://ror.org/00berct97","country_code":"JP","type":"company","lineage":["https://openalex.org/I2251713219"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Yusuke Ijima","raw_affiliation_strings":["NTT Corporation,Japan","NTT Corporation, Japan"],"affiliations":[{"raw_affiliation_string":"NTT Corporation,Japan","institution_ids":["https://openalex.org/I2251713219"]},{"raw_affiliation_string":"NTT Corporation, Japan","institution_ids":["https://openalex.org/I2251713219"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5083394213","display_name":"Yuki Saito","orcid":"https://orcid.org/0000-0002-7967-2613"},"institutions":[{"id":"https://openalex.org/I74801974","display_name":"The University of Tokyo","ror":"https://ror.org/057zh3y96","country_code":"JP","type":"education","lineage":["https://openalex.org/I74801974"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Yuki Saito","raw_affiliation_strings":["The University of Tokyo,Japan","The University of Tokyo, Japan"],"affiliations":[{"raw_affiliation_string":"The University of Tokyo,Japan","institution_ids":["https://openalex.org/I74801974"]},{"raw_affiliation_string":"The University of Tokyo, Japan","institution_ids":["https://openalex.org/I74801974"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5108574971"],"corresponding_institution_ids":["https://openalex.org/I74801974"],"apc_list":null,"apc_paid":null,"fwci":1.7227,"has_fulltext":false,"cited_by_count":5,"citation_normalized_percentile":{"value":0.8581361,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":97,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"11261","last_page":"11265"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8048880100250244},{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.7081542015075684},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.6858162879943848},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.6058409214019775},{"id":"https://openalex.org/keywords/sentence","display_name":"Sentence","score":0.5999933481216431},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5936670303344727},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.5481715798377991},{"id":"https://openalex.org/keywords/style","display_name":"Style (visual arts)","score":0.52975994348526},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.5297233462333679},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.44445762038230896},{"id":"https://openalex.org/keywords/natural-language-generation","display_name":"Natural language generation","score":0.42018017172813416}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8048880100250244},{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.7081542015075684},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.6858162879943848},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6058409214019775},{"id":"https://openalex.org/C2777530160","wikidata":"https://www.wikidata.org/wiki/Q41796","display_name":"Sentence","level":2,"score":0.5999933481216431},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5936670303344727},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.5481715798377991},{"id":"https://openalex.org/C2776445246","wikidata":"https://www.wikidata.org/wiki/Q1792644","display_name":"Style (visual arts)","level":2,"score":0.52975994348526},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.5297233462333679},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.44445762038230896},{"id":"https://openalex.org/C2776187449","wikidata":"https://www.wikidata.org/wiki/Q1513879","display_name":"Natural language generation","level":3,"score":0.42018017172813416},{"id":"https://openalex.org/C166957645","wikidata":"https://www.wikidata.org/wiki/Q23498","display_name":"Archaeology","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.0},{"id":"https://openalex.org/C120665830","wikidata":"https://www.wikidata.org/wiki/Q14620","display_name":"Optics","level":1,"score":0.0},{"id":"https://openalex.org/C95457728","wikidata":"https://www.wikidata.org/wiki/Q309","display_name":"History","level":0,"score":0.0},{"id":"https://openalex.org/C94625758","wikidata":"https://www.wikidata.org/wiki/Q7163","display_name":"Politics","level":2,"score":0.0},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp48485.2024.10445977","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10445977","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.6100000143051147,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[{"id":"https://openalex.org/F4320338246","display_name":"ACT-X","ror":null}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":34,"referenced_works":["https://openalex.org/W1956340063","https://openalex.org/W2061068689","https://openalex.org/W2101105183","https://openalex.org/W2506483933","https://openalex.org/W2890964092","https://openalex.org/W2905841571","https://openalex.org/W2963206148","https://openalex.org/W2972359262","https://openalex.org/W2997399314","https://openalex.org/W3004047823","https://openalex.org/W3137249133","https://openalex.org/W3206495532","https://openalex.org/W3209984917","https://openalex.org/W4221147462","https://openalex.org/W4280567182","https://openalex.org/W4282813413","https://openalex.org/W4307106676","https://openalex.org/W4309201339","https://openalex.org/W4365440898","https://openalex.org/W4367674159","https://openalex.org/W4375869257","https://openalex.org/W4375869364","https://openalex.org/W4384918448","https://openalex.org/W4385478010","https://openalex.org/W4389519254","https://openalex.org/W4392903074","https://openalex.org/W6678262379","https://openalex.org/W6682631176","https://openalex.org/W6761205521","https://openalex.org/W6778883912","https://openalex.org/W6791353385","https://openalex.org/W6803567076","https://openalex.org/W6846674738","https://openalex.org/W6854866820"],"related_works":["https://openalex.org/W2050523636","https://openalex.org/W3009270862","https://openalex.org/W2955859849","https://openalex.org/W2152921782","https://openalex.org/W382594479","https://openalex.org/W2470045054","https://openalex.org/W2575772232","https://openalex.org/W2132238464","https://openalex.org/W2151245229","https://openalex.org/W2140902089"],"abstract_inverted_index":{"We":[0,81,105],"propose":[1],"StyleCap,":[2],"a":[3,52,87,98],"method":[4,58],"to":[5],"generate":[6],"natural":[7,78],"language":[8,79,100],"descriptions":[9],"of":[10,18,34,42,75,148,153],"speaking":[11],"styles":[12],"appearing":[13],"in":[14,46],"speech.":[15],"Although":[16],"most":[17],"conventional":[19],"techniques":[20],"for":[21,59,116,130],"para-/non-linguistic":[22],"information":[23],"recognition":[24,44],"focus":[25],"on":[26],"the":[27,31,40,43,131,144],"category":[28],"classification":[29],"or":[30],"intensity":[32],"estimation":[33],"pre-defined":[35],"labels,":[36],"they":[37],"cannot":[38],"provide":[39],"reasoning":[41],"result":[45],"an":[47,56,107],"interpretable":[48],"manner.":[49],"StyleCap":[50,69,126,159],"is":[51,70],"first":[53],"step":[54],"towards":[55],"end-to-end":[57],"generating":[60],"speaking-style":[61,67,150,154],"prompts":[62],"from":[63],"speech,":[64],"i.e.,":[65],"automatic":[66],"captioning.":[68],"trained":[71],"with":[72],"paired":[73],"data":[74],"speech":[76,88,112,134],"and":[77,111,139,146],"descriptions.":[80],"train":[82],"neural":[83],"networks":[84],"that":[85,94,124],"convert":[86],"representation":[89,114],"vector":[90],"into":[91,97],"prefix":[92],"vectors":[93],"are":[95,160],"fed":[96],"large":[99],"model":[101],"(LLM)-based":[102],"text":[103,109,132],"decoder.":[104],"explore":[106],"appropriate":[108],"decoder":[110],"feature":[113],"suitable":[115],"this":[117],"new":[118],"task.":[119],"The":[120],"experimental":[121],"results":[122],"demonstrate":[123],"our":[125,158],"leveraging":[127],"richer":[128],"LLMs":[129],"decoder,":[133],"self-supervised":[135],"learning":[136],"(SSL)":[137],"features,":[138],"sentence":[140],"rephrasing":[141],"augmentation":[142],"improves":[143],"accuracy":[145],"diversity":[147],"generated":[149,156],"captions.":[151],"Samples":[152],"captions":[155],"by":[157],"publicly":[161],"available":[162],"<sup":[163],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[164],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">1</sup>":[165],".":[166]},"counts_by_year":[{"year":2025,"cited_by_count":5}],"updated_date":"2026-04-03T22:45:19.894376","created_date":"2025-10-10T00:00:00"}
