{"id":"https://openalex.org/W7164854473","doi":"https://doi.org/10.1145/3805622.3810822","title":"Scaling Multimodal Retrieval and Generation for Long Documents through Visual Tiling and Context Compression","display_name":"Scaling Multimodal Retrieval and Generation for Long Documents through Visual Tiling and Context Compression","publication_year":2026,"publication_date":"2026-06-15","ids":{"openalex":"https://openalex.org/W7164854473","doi":"https://doi.org/10.1145/3805622.3810822"},"language":null,"primary_location":{"id":"doi:10.1145/3805622.3810822","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810822","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3805622.3810822","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5025866854","display_name":"Yi Jin","orcid":null},"institutions":[{"id":"https://openalex.org/I116953780","display_name":"Tongji University","ror":"https://ror.org/03rc6as71","country_code":"CN","type":"education","lineage":["https://openalex.org/I116953780"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yi Jin","raw_affiliation_strings":["School of Computer Science and Technology, Tongji University, Shanghai, China"],"raw_orcid":"https://orcid.org/0009-0004-3426-3172","affiliations":[{"raw_affiliation_string":"School of Computer Science and Technology, Tongji University, Shanghai, China","institution_ids":["https://openalex.org/I116953780"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100726332","display_name":"Weichao Chen","orcid":"https://orcid.org/0000-0002-7226-7885"},"institutions":[{"id":"https://openalex.org/I116953780","display_name":"Tongji University","ror":"https://ror.org/03rc6as71","country_code":"CN","type":"education","lineage":["https://openalex.org/I116953780"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Weichao Chen","raw_affiliation_strings":["School of Computer Science and Technology, Tongji University, Shanghai, China"],"raw_orcid":"https://orcid.org/0000-0002-7226-7885","affiliations":[{"raw_affiliation_string":"School of Computer Science and Technology, Tongji University, Shanghai, China","institution_ids":["https://openalex.org/I116953780"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5016465182","display_name":"Shengjie Zhao","orcid":"https://orcid.org/0000-0002-4301-394X"},"institutions":[{"id":"https://openalex.org/I116953780","display_name":"Tongji University","ror":"https://ror.org/03rc6as71","country_code":"CN","type":"education","lineage":["https://openalex.org/I116953780"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shengjie Zhao","raw_affiliation_strings":["School of Computer Science and Technology, Tongji University, Shanghai, China"],"raw_orcid":"https://orcid.org/0000-0002-4301-394X","affiliations":[{"raw_affiliation_string":"School of Computer Science and Technology, Tongji University, Shanghai, China","institution_ids":["https://openalex.org/I116953780"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.93958424,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"128","last_page":"137"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9575999975204468,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9575999975204468,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.01860000006854534,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.004900000058114529,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.5831999778747559},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.5658000111579895},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.5284000039100647},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.5182999968528748},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.4512999951839447},{"id":"https://openalex.org/keywords/encoding","display_name":"Encoding (memory)","score":0.4124999940395355},{"id":"https://openalex.org/keywords/scheme","display_name":"Scheme (mathematics)","score":0.4090999960899353},{"id":"https://openalex.org/keywords/salient","display_name":"Salient","score":0.3540000021457672}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8312000036239624},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.5831999778747559},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5662000179290771},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.5658000111579895},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.5284000039100647},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.5182999968528748},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.4512999951839447},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.4124999940395355},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.4090999960899353},{"id":"https://openalex.org/C2780719617","wikidata":"https://www.wikidata.org/wiki/Q1030752","display_name":"Salient","level":2,"score":0.3540000021457672},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.34119999408721924},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.33970001339912415},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.31679999828338623},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.3068000078201294},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.30559998750686646},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.2992999851703644},{"id":"https://openalex.org/C183322885","wikidata":"https://www.wikidata.org/wiki/Q17007702","display_name":"Context model","level":3,"score":0.2851000130176544},{"id":"https://openalex.org/C78548338","wikidata":"https://www.wikidata.org/wiki/Q2493","display_name":"Data compression","level":2,"score":0.2754000127315521},{"id":"https://openalex.org/C188198153","wikidata":"https://www.wikidata.org/wiki/Q1613840","display_name":"Limiting","level":2,"score":0.26969999074935913},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.2694000005722046},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.26899999380111694},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.2678000032901764},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.2574000060558319},{"id":"https://openalex.org/C99844830","wikidata":"https://www.wikidata.org/wiki/Q102441924","display_name":"Scaling","level":2,"score":0.2556999921798706}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3805622.3810822","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810822","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3805622.3810822","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810822","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":22,"referenced_works":["https://openalex.org/W2954698171","https://openalex.org/W2963899988","https://openalex.org/W3120043490","https://openalex.org/W4213213306","https://openalex.org/W4252076394","https://openalex.org/W4285255856","https://openalex.org/W4385572090","https://openalex.org/W4389519872","https://openalex.org/W4402716330","https://openalex.org/W4404782234","https://openalex.org/W4404782265","https://openalex.org/W4412945734","https://openalex.org/W4413145389","https://openalex.org/W4413156814","https://openalex.org/W4415796209","https://openalex.org/W4415798003","https://openalex.org/W4415798413","https://openalex.org/W4416034444","https://openalex.org/W4416035813","https://openalex.org/W7131097871","https://openalex.org/W7133224090","https://openalex.org/W7133227460"],"related_works":[],"abstract_inverted_index":{"Multimodal":[0],"retrieval-augmented":[1],"generation":[2,71],"(MRAG)":[3],"provides":[4],"a":[5,63,96,115,125,144],"powerful":[6],"paradigm":[7],"for":[8,66],"long-document":[9,163],"reasoning":[10],"by":[11,153],"integrating":[12],"retrieval":[13,29,69,113],"with":[14],"generative":[15],"modeling.":[16],"However,":[17],"scaling":[18],"MRAG":[19],"to":[20,26,46,133],"multi-page":[21],"documents":[22],"remains":[23],"challenging":[24],"due":[25],"noisy":[27],"cross-modal":[28],"and":[30,50,58,70,90,143,177],"the":[31,80],"quadratic":[32],"computational":[33],"cost":[34],"of":[35],"long-context":[36],"inference.":[37],"Existing":[38],"multimodal":[39,68,162],"large":[40],"language":[41],"models":[42],"(MLLMs)":[43],"lack":[44],"mechanisms":[45],"structure":[47],"visual":[48,88,108,127,141,174],"representations":[49],"efficiently":[51],"manage":[52],"contextual":[53,157],"memory,":[54],"limiting":[55],"their":[56],"scalability":[57],"generalization.":[59],"We":[60],"propose":[61],"LMDocRag,":[62],"principled":[64],"framework":[65],"efficient":[67],"over":[72],"long":[73,102],"documents.":[74],"Our":[75],"approach":[76],"is":[77],"based":[78],"on":[79,120,161],"insight":[81],"that":[82,100,131,148,166],"document":[83,97,103],"understanding":[84],"benefits":[85],"from":[86],"structured":[87,116],"decomposition":[89],"representation-level":[91],"compression.":[92],"Specifically,":[93],"we":[94,123],"introduce":[95],"tiling":[98],"strategy":[99],"transforms":[101],"images":[104],"into":[105],"semantically":[106],"localized":[107],"units,":[109],"enabling":[110],"unified":[111],"hybrid":[112],"in":[114],"embedding":[117],"space.":[118],"Building":[119],"this":[121],"decomposition,":[122],"develop":[124],"layout-guided":[126],"token":[128,175],"sparsification":[129],"method":[130],"learns":[132],"preserve":[134],"structurally":[135],"salient":[136],"regions":[137],"while":[138,171],"suppressing":[139],"redundant":[140],"representations,":[142],"KV-cache":[145],"compression":[146],"scheme":[147],"reduces":[149],"autoregressive":[150],"memory":[151],"growth":[152],"selectively":[154],"retaining":[155],"informative":[156],"states.":[158],"Extensive":[159],"experiments":[160],"benchmarks":[164],"demonstrate":[165],"LMDocRag":[167],"improves":[168],"question-answering":[169],"accuracy":[170],"substantially":[172],"reducing":[173],"count":[176],"inference":[178],"complexity.":[179]},"counts_by_year":[],"updated_date":"2026-06-16T07:37:23.134862","created_date":"2026-06-16T00:00:00"}
