{"id":"https://openalex.org/W4224916838","doi":"https://doi.org/10.1109/icassp43922.2022.9746969","title":"Multi-Modal Learning with Text Merging for TEXTVQA","display_name":"Multi-Modal Learning with Text Merging for TEXTVQA","publication_year":2022,"publication_date":"2022-04-27","ids":{"openalex":"https://openalex.org/W4224916838","doi":"https://doi.org/10.1109/icassp43922.2022.9746969"},"language":"en","primary_location":{"id":"doi:10.1109/icassp43922.2022.9746969","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp43922.2022.9746969","pdf_url":null,"source":{"id":"https://openalex.org/S4363607702","display_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5022636178","display_name":"Changsheng Xu","orcid":"https://orcid.org/0000-0001-8343-9665"},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Changsheng Xu","raw_affiliation_strings":["Fudan University,School of Computer Science,Shanghai,China,200438"],"affiliations":[{"raw_affiliation_string":"Fudan University,School of Computer Science,Shanghai,China,200438","institution_ids":["https://openalex.org/I24943067"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101058795","display_name":"Zhenlong Xu","orcid":null},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhenlong Xu","raw_affiliation_strings":["Fudan University,School of Computer Science,Shanghai,China,200438"],"affiliations":[{"raw_affiliation_string":"Fudan University,School of Computer Science,Shanghai,China,200438","institution_ids":["https://openalex.org/I24943067"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101585745","display_name":"Yifan He","orcid":"https://orcid.org/0000-0002-9171-3502"},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yifan He","raw_affiliation_strings":["Fudan University,School of Computer Science,Shanghai,China,200438"],"affiliations":[{"raw_affiliation_string":"Fudan University,School of Computer Science,Shanghai,China,200438","institution_ids":["https://openalex.org/I24943067"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5017862559","display_name":"Shuigeng Zhou","orcid":"https://orcid.org/0000-0002-1949-2768"},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shuigeng Zhou","raw_affiliation_strings":["Fudan University,School of Computer Science,Shanghai,China,200438"],"affiliations":[{"raw_affiliation_string":"Fudan University,School of Computer Science,Shanghai,China,200438","institution_ids":["https://openalex.org/I24943067"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5086316879","display_name":"Jihong Guan","orcid":"https://orcid.org/0000-0003-2313-7635"},"institutions":[{"id":"https://openalex.org/I116953780","display_name":"Tongji University","ror":"https://ror.org/03rc6as71","country_code":"CN","type":"education","lineage":["https://openalex.org/I116953780"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jihong Guan","raw_affiliation_strings":["Tongji University,Dept. of Computer Sci. &amp; Techl,Shanghai,China,201804"],"affiliations":[{"raw_affiliation_string":"Tongji University,Dept. of Computer Sci. &amp; Techl,Shanghai,China,201804","institution_ids":["https://openalex.org/I116953780"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5022636178"],"corresponding_institution_ids":["https://openalex.org/I24943067"],"apc_list":null,"apc_paid":null,"fwci":0.0602,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.22693569,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":94},"biblio":{"volume":null,"issue":null,"first_page":"1985","last_page":"1989"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9919000267982483,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9890000224113464,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8361034393310547},{"id":"https://openalex.org/keywords/paragraph","display_name":"Paragraph","score":0.6310331225395203},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.613154411315918},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.6051543354988098},{"id":"https://openalex.org/keywords/modal","display_name":"Modal","score":0.5949673056602478},{"id":"https://openalex.org/keywords/preprocessor","display_name":"Preprocessor","score":0.5799845457077026},{"id":"https://openalex.org/keywords/merge","display_name":"Merge (version control)","score":0.571374237537384},{"id":"https://openalex.org/keywords/noisy-text-analytics","display_name":"Noisy text analytics","score":0.5583980083465576},{"id":"https://openalex.org/keywords/text-recognition","display_name":"Text recognition","score":0.5300167798995972},{"id":"https://openalex.org/keywords/word","display_name":"Word (group theory)","score":0.4255220293998718},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.40422767400741577},{"id":"https://openalex.org/keywords/text-graph","display_name":"Text graph","score":0.34804484248161316},{"id":"https://openalex.org/keywords/text-mining","display_name":"Text mining","score":0.330338716506958},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.22202882170677185}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8361034393310547},{"id":"https://openalex.org/C2777206241","wikidata":"https://www.wikidata.org/wiki/Q194431","display_name":"Paragraph","level":2,"score":0.6310331225395203},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.613154411315918},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6051543354988098},{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.5949673056602478},{"id":"https://openalex.org/C34736171","wikidata":"https://www.wikidata.org/wiki/Q918333","display_name":"Preprocessor","level":2,"score":0.5799845457077026},{"id":"https://openalex.org/C197129107","wikidata":"https://www.wikidata.org/wiki/Q1921621","display_name":"Merge (version control)","level":2,"score":0.571374237537384},{"id":"https://openalex.org/C151375590","wikidata":"https://www.wikidata.org/wiki/Q17147076","display_name":"Noisy text analytics","level":4,"score":0.5583980083465576},{"id":"https://openalex.org/C2983812711","wikidata":"https://www.wikidata.org/wiki/Q167555","display_name":"Text recognition","level":3,"score":0.5300167798995972},{"id":"https://openalex.org/C90805587","wikidata":"https://www.wikidata.org/wiki/Q10944557","display_name":"Word (group theory)","level":2,"score":0.4255220293998718},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.40422767400741577},{"id":"https://openalex.org/C66945725","wikidata":"https://www.wikidata.org/wiki/Q18388823","display_name":"Text graph","level":3,"score":0.34804484248161316},{"id":"https://openalex.org/C71472368","wikidata":"https://www.wikidata.org/wiki/Q676880","display_name":"Text mining","level":2,"score":0.330338716506958},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.22202882170677185},{"id":"https://openalex.org/C188027245","wikidata":"https://www.wikidata.org/wiki/Q750446","display_name":"Polymer chemistry","level":1,"score":0.0},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp43922.2022.9746969","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp43922.2022.9746969","pdf_url":null,"source":{"id":"https://openalex.org/S4363607702","display_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.6700000166893005,"display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":27,"referenced_works":["https://openalex.org/W639708223","https://openalex.org/W1488163396","https://openalex.org/W1575833922","https://openalex.org/W1933349210","https://openalex.org/W2044883027","https://openalex.org/W2053317383","https://openalex.org/W2158878654","https://openalex.org/W2493916176","https://openalex.org/W2560730294","https://openalex.org/W2561715562","https://openalex.org/W2745461083","https://openalex.org/W2896457183","https://openalex.org/W2954165458","https://openalex.org/W2963717374","https://openalex.org/W2963954913","https://openalex.org/W2966715458","https://openalex.org/W2979382951","https://openalex.org/W2988326850","https://openalex.org/W3004268082","https://openalex.org/W3034336960","https://openalex.org/W4298392976","https://openalex.org/W6629203210","https://openalex.org/W6634232107","https://openalex.org/W6682086655","https://openalex.org/W6755207826","https://openalex.org/W6765121158","https://openalex.org/W6766904570"],"related_works":["https://openalex.org/W2152349655","https://openalex.org/W791363389","https://openalex.org/W3112257711","https://openalex.org/W2163264304","https://openalex.org/W2214611599","https://openalex.org/W2888961610","https://openalex.org/W2896618352","https://openalex.org/W4224916838","https://openalex.org/W3114256553","https://openalex.org/W2219066094"],"abstract_inverted_index":{"Text":[0,79],"visual":[1,10,121],"question":[2],"answer":[3],"(TextVQA)":[4],"is":[5,118],"an":[6],"important":[7],"task":[8],"of":[9,33,49,52],"text":[11,18,21,39,53,90,100,104,122],"understanding,":[12],"which":[13,64,94,117,164],"requires":[14],"to":[15,28,37,46,66,107,120,136],"understand":[16],"the":[17,47,98,103,132],"generated":[19],"by":[20],"recognition":[22,40,54,105],"module":[23,106,126],"and":[24,41,110],"provide":[25],"correct":[26],"answers":[27,140,170],"specific":[29],"questions.":[30],"Recent":[31],"works":[32],"TextVQA":[34],"have":[35],"tried":[36],"combine":[38],"multi-modal":[42,133],"learning.":[43],"However,":[44],"due":[45],"lack":[48],"effective":[50],"preprocessing":[51],"output,":[55],"existing":[56],"approaches":[57],"suffer":[58],"from":[59,102],"serious":[60],"contextual":[61],"information":[62],"missing,":[63],"leads":[65],"unsatisfactory":[67],"performance.":[68],"In":[69],"this":[70],"work,":[71],"we":[72,87],"propose":[73],"a":[74,89,148],"Multi-Modal":[75],"Learning":[76],"framework":[77,135],"with":[78],"Merging":[80],"(MML&TM":[81],"in":[82],"short)":[83],"for":[84,113,141,171],"TextVQA,":[85],"where":[86],"develop":[88],"merging":[91],"(TM)":[92],"algorithm,":[93],"can":[95,127,159],"effectively":[96],"merge":[97],"word-level":[99],"obtained":[101],"construct":[108],"line-level":[109],"paragraph-level":[111],"texts":[112],"enhancing":[114],"semantic":[115,162],"context,":[116],"crucial":[119],"understanding.":[123],"The":[124],"TM":[125,157],"be":[128],"easily":[129],"incorporated":[130],"into":[131],"learning":[134],"generate":[137,168],"more":[138],"comprehensive":[139],"TextVQA.":[142,172],"We":[143],"evaluate":[144],"our":[145,156],"method":[146],"on":[147],"public":[149],"dataset":[150],"ST-VQA.":[151],"Experimental":[152],"results":[153],"show":[154],"that":[155],"algorithm":[158],"obtain":[160],"complete":[161],"information,":[163],"subsequently":[165],"helps":[166],"MML&TM":[167],"better":[169]},"counts_by_year":[{"year":2022,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
