{"id":"https://openalex.org/W4392909790","doi":"https://doi.org/10.1109/icassp48485.2024.10448079","title":"SlideSpeech: A Large Scale Slide-Enriched Audio-Visual Corpus","display_name":"SlideSpeech: A Large Scale Slide-Enriched Audio-Visual Corpus","publication_year":2024,"publication_date":"2024-03-18","ids":{"openalex":"https://openalex.org/W4392909790","doi":"https://doi.org/10.1109/icassp48485.2024.10448079"},"language":"en","primary_location":{"id":"doi:10.1109/icassp48485.2024.10448079","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10448079","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5016797242","display_name":"Haoxu Wang","orcid":"https://orcid.org/0000-0002-5430-0899"},"institutions":[{"id":"https://openalex.org/I37461747","display_name":"Wuhan University","ror":"https://ror.org/033vjfk17","country_code":"CN","type":"education","lineage":["https://openalex.org/I37461747"]},{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Haoxu Wang","raw_affiliation_strings":["Wuhan University,School of Computer Science,Wuhan,China","Speech Lab of DAMO Academy, Alibaba Group, China","School of Computer Science, Wuhan University, Wuhan, China"],"affiliations":[{"raw_affiliation_string":"Wuhan University,School of Computer Science,Wuhan,China","institution_ids":["https://openalex.org/I37461747"]},{"raw_affiliation_string":"Speech Lab of DAMO Academy, Alibaba Group, China","institution_ids":["https://openalex.org/I45928872"]},{"raw_affiliation_string":"School of Computer Science, Wuhan University, Wuhan, China","institution_ids":["https://openalex.org/I37461747"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100616400","display_name":"Fan Yu","orcid":"https://orcid.org/0000-0002-0576-7396"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Fan Yu","raw_affiliation_strings":["Alibaba Group,Speech Lab of DAMO Academy,China","Speech Lab of DAMO Academy, Alibaba Group, China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group,Speech Lab of DAMO Academy,China","institution_ids":["https://openalex.org/I45928872"]},{"raw_affiliation_string":"Speech Lab of DAMO Academy, Alibaba Group, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5062644421","display_name":"Xian Shi","orcid":"https://orcid.org/0000-0002-4120-7361"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xian Shi","raw_affiliation_strings":["Alibaba Group,Speech Lab of DAMO Academy,China","Speech Lab of DAMO Academy, Alibaba Group, China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group,Speech Lab of DAMO Academy,China","institution_ids":["https://openalex.org/I45928872"]},{"raw_affiliation_string":"Speech Lab of DAMO Academy, Alibaba Group, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5010266872","display_name":"Yuezhang Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuezhang Wang","raw_affiliation_strings":["Alibaba Group,Speech Lab of DAMO Academy,China","Speech Lab of DAMO Academy, Alibaba Group, China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group,Speech Lab of DAMO Academy,China","institution_ids":["https://openalex.org/I45928872"]},{"raw_affiliation_string":"Speech Lab of DAMO Academy, Alibaba Group, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5055433405","display_name":"Shiliang Zhang","orcid":"https://orcid.org/0000-0001-9053-9314"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shiliang Zhang","raw_affiliation_strings":["Alibaba Group,Speech Lab of DAMO Academy,China","Speech Lab of DAMO Academy, Alibaba Group, China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group,Speech Lab of DAMO Academy,China","institution_ids":["https://openalex.org/I45928872"]},{"raw_affiliation_string":"Speech Lab of DAMO Academy, Alibaba Group, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100780143","display_name":"Ming Li","orcid":"https://orcid.org/0009-0009-3842-7337"},"institutions":[{"id":"https://openalex.org/I4210159968","display_name":"Duke Kunshan University","ror":"https://ror.org/04sr5ys16","country_code":"CN","type":"education","lineage":["https://openalex.org/I170897317","https://openalex.org/I37461747","https://openalex.org/I4210159968"]},{"id":"https://openalex.org/I37461747","display_name":"Wuhan University","ror":"https://ror.org/033vjfk17","country_code":"CN","type":"education","lineage":["https://openalex.org/I37461747"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ming Li","raw_affiliation_strings":["Wuhan University,School of Computer Science,Wuhan,China","Suzhou Municipal Key Laboratory of Multimodal Intelligent Systems, Duke Kunshan University, Kunshan, China","School of Computer Science, Wuhan University, Wuhan, China"],"affiliations":[{"raw_affiliation_string":"Wuhan University,School of Computer Science,Wuhan,China","institution_ids":["https://openalex.org/I37461747"]},{"raw_affiliation_string":"Suzhou Municipal Key Laboratory of Multimodal Intelligent Systems, Duke Kunshan University, Kunshan, China","institution_ids":["https://openalex.org/I4210159968"]},{"raw_affiliation_string":"School of Computer Science, Wuhan University, Wuhan, China","institution_ids":["https://openalex.org/I37461747"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5016797242"],"corresponding_institution_ids":["https://openalex.org/I37461747","https://openalex.org/I45928872"],"apc_list":null,"apc_paid":null,"fwci":2.7642,"has_fulltext":false,"cited_by_count":8,"citation_normalized_percentile":{"value":0.9081465,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":96,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"11076","last_page":"11080"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8626223206520081},{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.6573168039321899},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.6229197382926941},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6031224131584167},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.5061487555503845},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.45090451836586},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.4475138187408447},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.44432389736175537},{"id":"https://openalex.org/keywords/modalities","display_name":"Modalities","score":0.4427626132965088}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8626223206520081},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.6573168039321899},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.6229197382926941},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6031224131584167},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5061487555503845},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.45090451836586},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.4475138187408447},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.44432389736175537},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.4427626132965088},{"id":"https://openalex.org/C120665830","wikidata":"https://www.wikidata.org/wiki/Q14620","display_name":"Optics","level":1,"score":0.0},{"id":"https://openalex.org/C144024400","wikidata":"https://www.wikidata.org/wiki/Q21201","display_name":"Sociology","level":0,"score":0.0},{"id":"https://openalex.org/C36289849","wikidata":"https://www.wikidata.org/wiki/Q34749","display_name":"Social science","level":1,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C151730666","wikidata":"https://www.wikidata.org/wiki/Q7205","display_name":"Paleontology","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp48485.2024.10448079","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp48485.2024.10448079","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":38,"referenced_works":["https://openalex.org/W1506201762","https://openalex.org/W1984076147","https://openalex.org/W2043331815","https://openalex.org/W2089499735","https://openalex.org/W2125336414","https://openalex.org/W2127141656","https://openalex.org/W2142416747","https://openalex.org/W2169384404","https://openalex.org/W2514741789","https://openalex.org/W2766219058","https://openalex.org/W2808631503","https://openalex.org/W2886319145","https://openalex.org/W2890952074","https://openalex.org/W2891205112","https://openalex.org/W2892009249","https://openalex.org/W2936774411","https://openalex.org/W2962780374","https://openalex.org/W2984008963","https://openalex.org/W3015974384","https://openalex.org/W3016010032","https://openalex.org/W3097777922","https://openalex.org/W3162293946","https://openalex.org/W3181186176","https://openalex.org/W3198694222","https://openalex.org/W3203407300","https://openalex.org/W3205961173","https://openalex.org/W4214922754","https://openalex.org/W4225985539","https://openalex.org/W4283700324","https://openalex.org/W4283832239","https://openalex.org/W4296414218","https://openalex.org/W4297841641","https://openalex.org/W4385245566","https://openalex.org/W4385822246","https://openalex.org/W4385823274","https://openalex.org/W4386076133","https://openalex.org/W6754420807","https://openalex.org/W6755559483"],"related_works":["https://openalex.org/W2185469136","https://openalex.org/W2011264131","https://openalex.org/W4306353150","https://openalex.org/W2026860389","https://openalex.org/W8219677","https://openalex.org/W3216879894","https://openalex.org/W2890132085","https://openalex.org/W2168054807","https://openalex.org/W2058990474","https://openalex.org/W3207883763"],"abstract_inverted_index":{"Multi-Modal":[0],"automatic":[1],"speech":[2,16,139],"recognition":[3,17,140],"(ASR)":[4],"techniques":[5],"aim":[6],"to":[7,11],"leverage":[8],"additional":[9],"modalities":[10],"improve":[12],"the":[13,29,40,54,85,100,104,115,120,130,135],"performance":[14,141],"of":[15,31,42,56,80,91,122,137],"systems.":[18],"While":[19],"existing":[20],"approaches":[21],"primarily":[22],"focus":[23],"on":[24],"video":[25,148],"or":[26],"contextual":[27,126],"information,":[28],"utilization":[30],"extra":[32],"supplementary":[33,147],"textual":[34,144],"information":[35,52,113,145],"has":[36],"been":[37],"overlooked.":[38],"Recognizing":[39],"abundance":[41],"online":[43],"conference":[44],"videos":[45],"with":[46,68,77],"slides,":[47],"which":[48],"provide":[49],"rich":[50],"domain-specific":[51],"in":[53,114,129],"form":[55],"text":[57,112],"and":[58,106,125],"images,":[59],"we":[60,98,133],"release":[61],"SlideSpeech,":[62],"a":[63,88],"large-scale":[64],"audio-visual":[65],"corpus":[66,71,86,105],"enriched":[67],"slides.":[69,94,149],"The":[70],"contains":[72,87],"1,705":[73],"videos,":[74],"1,000+":[75],"hours,":[76],"473":[78],"hours":[79],"high-quality":[81],"transcribed":[82],"speech.":[83],"Moreover,":[84],"significant":[89],"amount":[90],"real-time":[92],"synchronized":[93],"In":[95],"this":[96],"work,":[97],"present":[99],"pipeline":[101],"for":[102,110],"constructing":[103],"propose":[107],"baseline":[108],"methods":[109,128],"utilizing":[111],"visual":[116],"slide":[117],"context.":[118],"Through":[119],"application":[121],"keyword":[123],"extraction":[124],"ASR":[127],"benchmark":[131],"system,":[132],"demonstrate":[134],"potential":[136],"improving":[138],"by":[142],"incorporating":[143],"from":[146]},"counts_by_year":[{"year":2025,"cited_by_count":5},{"year":2024,"cited_by_count":3}],"updated_date":"2026-04-03T22:45:19.894376","created_date":"2025-10-10T00:00:00"}
