{"id":"https://openalex.org/W4372260358","doi":"https://doi.org/10.1109/icassp49357.2023.10096063","title":"WL-MSR: Watch and Listen for Multimodal Subtitle Recognition","display_name":"WL-MSR: Watch and Listen for Multimodal Subtitle Recognition","publication_year":2023,"publication_date":"2023-05-05","ids":{"openalex":"https://openalex.org/W4372260358","doi":"https://doi.org/10.1109/icassp49357.2023.10096063"},"language":"en","primary_location":{"id":"doi:10.1109/icassp49357.2023.10096063","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49357.2023.10096063","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100320759","display_name":"Jiawei Liu","orcid":"https://orcid.org/0000-0003-4011-3950"},"institutions":[{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]},{"id":"https://openalex.org/I4210094879","display_name":"Shandong Institute of Automation","ror":"https://ror.org/00qdtba35","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210094879","https://openalex.org/I4210142748"]},{"id":"https://openalex.org/I4210112150","display_name":"Institute of Automation","ror":"https://ror.org/022c3hy66","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210112150"]},{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Jiawei Liu","raw_affiliation_strings":["Chinese Academy of Sciences,The Laboratory of Cognition and Decision Intelligence for Complex Systems, Institute of Automation","The Laboratory of Cognition and Decision Intelligence for Complex Systems, Institute of Automation, Chinese Academy of Sciences","School of Artificial Intelligence, University of Chinese Academy of Sciences"],"affiliations":[{"raw_affiliation_string":"Chinese Academy of Sciences,The Laboratory of Cognition and Decision Intelligence for Complex Systems, Institute of Automation","institution_ids":["https://openalex.org/I4210112150","https://openalex.org/I4210094879"]},{"raw_affiliation_string":"The Laboratory of Cognition and Decision Intelligence for Complex Systems, Institute of Automation, Chinese Academy of Sciences","institution_ids":["https://openalex.org/I4210094879","https://openalex.org/I19820366"]},{"raw_affiliation_string":"School of Artificial Intelligence, University of Chinese Academy of Sciences","institution_ids":["https://openalex.org/I4210165038"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101627309","display_name":"Hao Wang","orcid":"https://orcid.org/0000-0003-3847-524X"},"institutions":[{"id":"https://openalex.org/I4210112150","display_name":"Institute of Automation","ror":"https://ror.org/022c3hy66","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210112150"]},{"id":"https://openalex.org/I4210094879","display_name":"Shandong Institute of Automation","ror":"https://ror.org/00qdtba35","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210094879","https://openalex.org/I4210142748"]},{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hao Wang","raw_affiliation_strings":["Chinese Academy of Sciences,The Laboratory of Cognition and Decision Intelligence for Complex Systems, Institute of Automation","The Laboratory of Cognition and Decision Intelligence for Complex Systems, Institute of Automation, Chinese Academy of Sciences","School of Artificial Intelligence, University of Chinese Academy of Sciences"],"affiliations":[{"raw_affiliation_string":"Chinese Academy of Sciences,The Laboratory of Cognition and Decision Intelligence for Complex Systems, Institute of Automation","institution_ids":["https://openalex.org/I4210112150","https://openalex.org/I4210094879"]},{"raw_affiliation_string":"The Laboratory of Cognition and Decision Intelligence for Complex Systems, Institute of Automation, Chinese Academy of Sciences","institution_ids":["https://openalex.org/I4210094879","https://openalex.org/I19820366"]},{"raw_affiliation_string":"School of Artificial Intelligence, University of Chinese Academy of Sciences","institution_ids":["https://openalex.org/I4210165038"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100683352","display_name":"Weining Wang","orcid":"https://orcid.org/0000-0001-7299-6431"},"institutions":[{"id":"https://openalex.org/I4210112150","display_name":"Institute of Automation","ror":"https://ror.org/022c3hy66","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210112150"]},{"id":"https://openalex.org/I4210094879","display_name":"Shandong Institute of Automation","ror":"https://ror.org/00qdtba35","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210094879","https://openalex.org/I4210142748"]},{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Weining Wang","raw_affiliation_strings":["Chinese Academy of Sciences,The Laboratory of Cognition and Decision Intelligence for Complex Systems, Institute of Automation","The Laboratory of Cognition and Decision Intelligence for Complex Systems, Institute of Automation, Chinese Academy of Sciences"],"affiliations":[{"raw_affiliation_string":"Chinese Academy of Sciences,The Laboratory of Cognition and Decision Intelligence for Complex Systems, Institute of Automation","institution_ids":["https://openalex.org/I4210112150","https://openalex.org/I4210094879"]},{"raw_affiliation_string":"The Laboratory of Cognition and Decision Intelligence for Complex Systems, Institute of Automation, Chinese Academy of Sciences","institution_ids":["https://openalex.org/I4210094879","https://openalex.org/I19820366"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101777772","display_name":"Xingjian He","orcid":"https://orcid.org/0000-0001-5396-6253"},"institutions":[{"id":"https://openalex.org/I4210094879","display_name":"Shandong Institute of Automation","ror":"https://ror.org/00qdtba35","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210094879","https://openalex.org/I4210142748"]},{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]},{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210112150","display_name":"Institute of Automation","ror":"https://ror.org/022c3hy66","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210112150"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xingjian He","raw_affiliation_strings":["Chinese Academy of Sciences,The Laboratory of Cognition and Decision Intelligence for Complex Systems, Institute of Automation","The Laboratory of Cognition and Decision Intelligence for Complex Systems, Institute of Automation, Chinese Academy of Sciences","School of Artificial Intelligence, University of Chinese Academy of Sciences"],"affiliations":[{"raw_affiliation_string":"Chinese Academy of Sciences,The Laboratory of Cognition and Decision Intelligence for Complex Systems, Institute of Automation","institution_ids":["https://openalex.org/I4210112150","https://openalex.org/I4210094879"]},{"raw_affiliation_string":"The Laboratory of Cognition and Decision Intelligence for Complex Systems, Institute of Automation, Chinese Academy of Sciences","institution_ids":["https://openalex.org/I4210094879","https://openalex.org/I19820366"]},{"raw_affiliation_string":"School of Artificial Intelligence, University of Chinese Academy of Sciences","institution_ids":["https://openalex.org/I4210165038"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5108392430","display_name":"Jing Liu","orcid":"https://orcid.org/0000-0003-0903-9131"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210112150","display_name":"Institute of Automation","ror":"https://ror.org/022c3hy66","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210112150"]},{"id":"https://openalex.org/I4210094879","display_name":"Shandong Institute of Automation","ror":"https://ror.org/00qdtba35","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210094879","https://openalex.org/I4210142748"]},{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jing Liu","raw_affiliation_strings":["Chinese Academy of Sciences,The Laboratory of Cognition and Decision Intelligence for Complex Systems, Institute of Automation","School of Artificial Intelligence, University of Chinese Academy of Sciences","The Laboratory of Cognition and Decision Intelligence for Complex Systems, Institute of Automation, Chinese Academy of Sciences"],"affiliations":[{"raw_affiliation_string":"Chinese Academy of Sciences,The Laboratory of Cognition and Decision Intelligence for Complex Systems, Institute of Automation","institution_ids":["https://openalex.org/I4210112150","https://openalex.org/I4210094879"]},{"raw_affiliation_string":"School of Artificial Intelligence, University of Chinese Academy of Sciences","institution_ids":["https://openalex.org/I4210165038"]},{"raw_affiliation_string":"The Laboratory of Cognition and Decision Intelligence for Complex Systems, Institute of Automation, Chinese Academy of Sciences","institution_ids":["https://openalex.org/I4210094879","https://openalex.org/I19820366"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5100320759"],"corresponding_institution_ids":["https://openalex.org/I19820366","https://openalex.org/I4210094879","https://openalex.org/I4210112150","https://openalex.org/I4210165038"],"apc_list":null,"apc_paid":null,"fwci":0.1228,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.37210227,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":94},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.9980000257492065,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.9980000257492065,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9925000071525574,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13310","display_name":"Subtitles and Audiovisual Media","score":0.9833999872207642,"subfield":{"id":"https://openalex.org/subfields/1203","display_name":"Language and Linguistics"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/subtitle","display_name":"Subtitle","score":0.9104216694831848},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7667667865753174},{"id":"https://openalex.org/keywords/optical-character-recognition","display_name":"Optical character recognition","score":0.7096790075302124},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6994290947914124},{"id":"https://openalex.org/keywords/modalities","display_name":"Modalities","score":0.5294931530952454},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.49819183349609375},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.4525536596775055},{"id":"https://openalex.org/keywords/filter","display_name":"Filter (signal processing)","score":0.4123189449310303},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.38772112131118774},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.23558497428894043},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.1332434117794037},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.09640729427337646}],"concepts":[{"id":"https://openalex.org/C2780364048","wikidata":"https://www.wikidata.org/wiki/Q204028","display_name":"Subtitle","level":2,"score":0.9104216694831848},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7667667865753174},{"id":"https://openalex.org/C546480517","wikidata":"https://www.wikidata.org/wiki/Q167555","display_name":"Optical character recognition","level":3,"score":0.7096790075302124},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6994290947914124},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.5294931530952454},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.49819183349609375},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.4525536596775055},{"id":"https://openalex.org/C106131492","wikidata":"https://www.wikidata.org/wiki/Q3072260","display_name":"Filter (signal processing)","level":2,"score":0.4123189449310303},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.38772112131118774},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.23558497428894043},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.1332434117794037},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.09640729427337646},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.0},{"id":"https://openalex.org/C144024400","wikidata":"https://www.wikidata.org/wiki/Q21201","display_name":"Sociology","level":0,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C36289849","wikidata":"https://www.wikidata.org/wiki/Q34749","display_name":"Social science","level":1,"score":0.0},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp49357.2023.10096063","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49357.2023.10096063","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320335777","display_name":"National Key Research and Development Program of China","ror":null}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":28,"referenced_works":["https://openalex.org/W639708223","https://openalex.org/W2127141656","https://openalex.org/W2194187530","https://openalex.org/W2519818067","https://openalex.org/W2607735137","https://openalex.org/W2896457183","https://openalex.org/W2933138175","https://openalex.org/W2963242190","https://openalex.org/W2964312704","https://openalex.org/W2981728176","https://openalex.org/W3003868038","https://openalex.org/W3015783745","https://openalex.org/W3016167541","https://openalex.org/W3018962734","https://openalex.org/W3036601975","https://openalex.org/W3090449556","https://openalex.org/W3097148135","https://openalex.org/W3173220247","https://openalex.org/W3181016597","https://openalex.org/W3181186176","https://openalex.org/W3197478142","https://openalex.org/W4312662670","https://openalex.org/W4385245566","https://openalex.org/W6620707391","https://openalex.org/W6687615561","https://openalex.org/W6739901393","https://openalex.org/W6755207826","https://openalex.org/W6776721752"],"related_works":["https://openalex.org/W1950334511","https://openalex.org/W3119551990","https://openalex.org/W2351264416","https://openalex.org/W3004133161","https://openalex.org/W2370494932","https://openalex.org/W2975517425","https://openalex.org/W2952626934","https://openalex.org/W2349976842","https://openalex.org/W3172548481","https://openalex.org/W2352608602"],"abstract_inverted_index":{"Video":[0],"subtitles":[1,10],"could":[2],"be":[3],"defined":[4],"as":[5],"the":[6,55,87,93,99,128],"combination":[7],"of":[8,92,119],"visualized":[9],"in":[11,24,102,131],"frames":[12],"and":[13,30,40,63,77,80,90,116],"textual":[14,88],"content":[15],"recognized":[16],"from":[17],"speech,":[18],"which":[19],"play":[20],"a":[21,37,72],"significant":[22],"role":[23],"video":[25,51],"understanding":[26],"for":[27,42],"both":[28,86],"humans":[29],"machines.":[31],"In":[32],"this":[33],"paper,":[34],"we":[35,70,107],"propose":[36],"novel":[38],"Watch":[39],"Listen":[41],"Multimodal":[43,132],"Subtitle":[44,133],"Recognition":[45,61,66,134],"(WL-MSR)":[46],"framework":[47],"to":[48,84],"obtain":[49],"comprehensive":[50],"subtitles,":[52],"by":[53,58],"fusing":[54],"information":[56],"provided":[57],"Optical":[59],"Character":[60],"(OCR)":[62],"Automatic":[64],"Speech":[65],"(ASR)":[67],"models.":[68],"Specifically,":[69],"build":[71],"Transformer":[73],"model":[74],"with":[75],"mask":[76],"crop":[78],"strategies":[79],"multi-level":[81],"identity":[82],"embeddings":[83],"aggregate":[85],"results":[89,104,115],"features":[91],"two":[94],"modalities.":[95],"To":[96],"pre-filter":[97],"out":[98],"noise":[100],"items":[101],"OCR":[103,110],"before":[105],"fusion,":[106],"adopt":[108],"an":[109],"filter":[111],"based":[112],"on":[113,136],"ASR":[114],"confidence":[117],"scores":[118],"OCR.":[120],"By":[121],"combining":[122],"these":[123],"techniques,":[124],"our":[125],"solution":[126],"wins":[127],"2nd":[129],"place":[130],"Challenge":[135],"ICPR2022.":[137]},"counts_by_year":[{"year":2023,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
