{"id":"https://openalex.org/W7138180450","doi":"https://doi.org/10.1609/aaai.v40i4.37282","title":"MosaicDoc: A Large-Scale Bilingual Benchmark for Visually Rich Document Understanding","display_name":"MosaicDoc: A Large-Scale Bilingual Benchmark for Visually Rich Document Understanding","publication_year":2026,"publication_date":"2026-03-14","ids":{"openalex":"https://openalex.org/W7138180450","doi":"https://doi.org/10.1609/aaai.v40i4.37282"},"language":null,"primary_location":{"id":"doi:10.1609/aaai.v40i4.37282","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i4.37282","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://doi.org/10.1609/aaai.v40i4.37282","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5089214262","display_name":"Ketong Chen","orcid":null},"institutions":[{"id":"https://openalex.org/I90610280","display_name":"South China University of Technology","ror":"https://ror.org/0530pts50","country_code":"CN","type":"education","lineage":["https://openalex.org/I90610280"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Ketong Chen","raw_affiliation_strings":["South China University of Technology"],"affiliations":[{"raw_affiliation_string":"South China University of Technology","institution_ids":["https://openalex.org/I90610280"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129751087","display_name":"Yuhao Chen","orcid":null},"institutions":[{"id":"https://openalex.org/I90610280","display_name":"South China University of Technology","ror":"https://ror.org/0530pts50","country_code":"CN","type":"education","lineage":["https://openalex.org/I90610280"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuhao Chen","raw_affiliation_strings":["South China University of Technology"],"affiliations":[{"raw_affiliation_string":"South China University of Technology","institution_ids":["https://openalex.org/I90610280"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5129653828","display_name":"Yang Xue","orcid":null},"institutions":[{"id":"https://openalex.org/I90610280","display_name":"South China University of Technology","ror":"https://ror.org/0530pts50","country_code":"CN","type":"education","lineage":["https://openalex.org/I90610280"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yang Xue","raw_affiliation_strings":["South China University of Technology"],"affiliations":[{"raw_affiliation_string":"South China University of Technology","institution_ids":["https://openalex.org/I90610280"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5089214262"],"corresponding_institution_ids":["https://openalex.org/I90610280"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.50298507,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"40","issue":"4","first_page":"2913","last_page":"2921"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.5260000228881836,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.5260000228881836,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.24449999630451202,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.032999999821186066,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.8199999928474426},{"id":"https://openalex.org/keywords/variety","display_name":"Variety (cybernetics)","score":0.7106000185012817},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.7024000287055969},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.5924000144004822},{"id":"https://openalex.org/keywords/path","display_name":"Path (computing)","score":0.5034000277519226},{"id":"https://openalex.org/keywords/resource","display_name":"Resource (disambiguation)","score":0.4587000012397766}],"concepts":[{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.8199999928474426},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7807999849319458},{"id":"https://openalex.org/C136197465","wikidata":"https://www.wikidata.org/wiki/Q1729295","display_name":"Variety (cybernetics)","level":2,"score":0.7106000185012817},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.7024000287055969},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.5924000144004822},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5407000184059143},{"id":"https://openalex.org/C2777735758","wikidata":"https://www.wikidata.org/wiki/Q817765","display_name":"Path (computing)","level":2,"score":0.5034000277519226},{"id":"https://openalex.org/C206345919","wikidata":"https://www.wikidata.org/wiki/Q20380951","display_name":"Resource (disambiguation)","level":2,"score":0.4587000012397766},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4447000026702881},{"id":"https://openalex.org/C554936623","wikidata":"https://www.wikidata.org/wiki/Q199657","display_name":"Reading (process)","level":2,"score":0.3553999960422516},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.328900009393692},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.3012000024318695},{"id":"https://openalex.org/C90805587","wikidata":"https://www.wikidata.org/wiki/Q10944557","display_name":"Word (group theory)","level":2,"score":0.27970001101493835},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.27410000562667847},{"id":"https://openalex.org/C86251818","wikidata":"https://www.wikidata.org/wiki/Q816754","display_name":"Benchmarking","level":2,"score":0.2624000012874603}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1609/aaai.v40i4.37282","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i4.37282","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1609/aaai.v40i4.37282","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i4.37282","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.8789846897125244}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Despite":[0],"the":[1,84,134],"rapid":[2],"progress":[3],"of":[4,86,139],"Vision-Language":[5],"Models":[6,63],"(VLMs),":[7],"their":[8,146],"capabilities":[9],"are":[10,17],"inadequately":[11],"assessed":[12],"by":[13],"existing":[14],"benchmarks,":[15],"which":[16],"predominantly":[18],"English-centric,":[19],"feature":[20],"simplistic":[21],"layouts,":[22],"and":[23,46,78,91,96,101,109,117,122,154],"support":[24],"limited":[25],"tasks.":[26],"Consequently,":[27],"they":[28],"fail":[29],"to":[30,64,82],"evaluate":[31],"model":[32],"performance":[33],"for":[34,133,159],"Visually":[35],"Rich":[36],"Document":[37],"Understanding":[38],"(VRDU),":[39],"a":[40,55,67,74,130,156],"critical":[41],"challenge":[42],"involving":[43],"complex":[44,97],"layouts":[45,98],"dense":[47],"text.":[48],"To":[49],"address":[50],"this,":[51],"we":[52],"introduce":[53],"DocWeaver,":[54],"novel":[56],"multi-agent":[57],"pipeline":[58],"that":[59],"leverages":[60],"Large":[61],"Language":[62],"automatically":[65],"generate":[66],"new":[68],"benchmark.":[69],"The":[70],"result":[71],"is":[72],"MosaicDoc,":[73],"large-scale,":[75],"bilingual":[76],"(Chinese":[77],"English)":[79],"resource":[80],"designed":[81],"push":[83],"boundaries":[85],"VRDU.":[87],"Sourced":[88],"from":[89,106],"newspapers":[90],"magazines,":[92],"MosaicDoc":[93,127],"features":[94],"diverse":[95],"(including":[99],"multi-column":[100],"non-Manhattan),":[102],"rich":[103],"stylistic":[104],"variety":[105],"196":[107],"publishers,":[108],"comprehensive":[110],"multi-task":[111],"annotations":[112],"(OCR,":[113],"VQA,":[114],"reading":[115],"order,":[116],"localization).":[118],"With":[119],"72K":[120],"images":[121],"over":[123],"600K":[124],"QA":[125],"pairs,":[126],"serves":[128],"as":[129],"definitive":[131],"benchmark":[132,144],"field.":[135],"Our":[136],"extensive":[137],"evaluation":[138],"state-of-the-art":[140],"models":[141],"on":[142],"this":[143],"reveals":[145],"current":[147],"limitations":[148],"in":[149],"handling":[150],"real-world":[151],"document":[152],"complexity":[153],"charts":[155],"clear":[157],"path":[158],"future":[160],"research.":[161]},"counts_by_year":[],"updated_date":"2026-03-20T20:47:17.329874","created_date":"2026-03-18T00:00:00"}
