{"id":"https://openalex.org/W4407170108","doi":"https://doi.org/10.48550/arxiv.2502.01341","title":"AlignVLM: Bridging Vision and Language Latent Spaces for Multimodal Document Understanding","display_name":"AlignVLM: Bridging Vision and Language Latent Spaces for Multimodal Document Understanding","publication_year":2025,"publication_date":"2025-02-03","ids":{"openalex":"https://openalex.org/W4407170108","doi":"https://doi.org/10.48550/arxiv.2502.01341"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2502.01341","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2502.01341","pdf_url":"https://arxiv.org/pdf/2502.01341","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2502.01341","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5054102478","display_name":"Ahmed Masry","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Masry, Ahmed","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100333337","display_name":"J.A. Rodr\u00edguez","orcid":"https://orcid.org/0000-0001-8561-093X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rodriguez, Juan A.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101648221","display_name":"Tian-Yu Zhang","orcid":"https://orcid.org/0000-0001-5121-5229"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Tianyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5073554817","display_name":"Suyuchen Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Suyuchen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100406971","display_name":"Chao Wang","orcid":"https://orcid.org/0000-0002-3238-0090"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Chao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5115057160","display_name":"Aarash Feizi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Feizi, Aarash","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5022759464","display_name":"Akshay Kalkunte Suresh","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Suresh, Akshay Kalkunte","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103305840","display_name":"Abhay Puri","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Puri, Abhay","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5050075702","display_name":"Xiangru Jian","orcid":"https://orcid.org/0009-0004-7138-7078"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jian, Xiangru","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5017357771","display_name":"Pierre\u2010Andr\u00e9 No\u00ebl","orcid":"https://orcid.org/0000-0001-6979-1873"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"No\u00ebl, Pierre-Andr\u00e9","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5016214507","display_name":"Sathwik Tejaswi Madhusudhan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Madhusudhan, Sathwik Tejaswi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039660800","display_name":"Marco Pedersoli","orcid":"https://orcid.org/0000-0002-7601-8640"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pedersoli, Marco","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100691224","display_name":"Bang Liu","orcid":"https://orcid.org/0009-0005-7819-1876"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Bang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039831068","display_name":"Nicolas Chapados","orcid":"https://orcid.org/0000-0003-0249-7607"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chapados, Nicolas","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5086198262","display_name":"Yoshua Bengio","orcid":"https://orcid.org/0000-0002-9322-3515"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bengio, Yoshua","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5067722075","display_name":"Enamul Hoque","orcid":"https://orcid.org/0000-0002-9789-6645"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hoque, Enamul","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5075885606","display_name":"Christopher Pal","orcid":"https://orcid.org/0000-0001-6534-2114"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pal, Christopher","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5028905152","display_name":"Issam Laradji","orcid":"https://orcid.org/0000-0002-9713-3269"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Laradji, Issam H.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5004903288","display_name":"David V\u00e1zquez","orcid":"https://orcid.org/0000-0002-2845-8158"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Vazquez, David","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5028946392","display_name":"Perouz Taslakian","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Taslakian, Perouz","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5041111343","display_name":"Spandana Gella","orcid":"https://orcid.org/0000-0003-2725-4476"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gella, Spandana","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5041629023","display_name":"Sai Rajeswar","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rajeswar, Sai","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":22,"corresponding_author_ids":["https://openalex.org/A5054102478"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9926000237464905,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9926000237464905,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.989799976348877,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.9837999939918518,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/bridging","display_name":"Bridging (networking)","score":0.913744330406189},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.4239638149738312},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.35436761379241943},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.3418429493904114},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.34072786569595337},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3228275775909424},{"id":"https://openalex.org/keywords/philosophy","display_name":"Philosophy","score":0.10618606209754944}],"concepts":[{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.913744330406189},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.4239638149738312},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.35436761379241943},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.3418429493904114},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.34072786569595337},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3228275775909424},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.10618606209754944},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2502.01341","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2502.01341","pdf_url":"https://arxiv.org/pdf/2502.01341","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2502.01341","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2502.01341","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2502.01341","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2502.01341","pdf_url":"https://arxiv.org/pdf/2502.01341","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":false},"content_urls":{"pdf":"https://content.openalex.org/works/W4407170108.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2748952813","https://openalex.org/W4388870064","https://openalex.org/W2210139803","https://openalex.org/W4235186151","https://openalex.org/W2054685365","https://openalex.org/W2056057048","https://openalex.org/W2667588871","https://openalex.org/W2272354214","https://openalex.org/W3204019825"],"abstract_inverted_index":{"Aligning":[0],"visual":[1,27,58,90,113,137],"features":[2,28,59,91,114],"with":[3,39,158],"language":[4],"embeddings":[5],"is":[6,129],"a":[7,22,31,35,82,93],"key":[8],"challenge":[9],"in":[10],"vision-language":[11],"models":[12,18],"(VLMs).":[13],"The":[14],"performance":[15,152],"of":[16,64,96,119],"such":[17,48],"hinges":[19],"on":[20,161],"having":[21],"good":[23],"connector":[24],"that":[25,88,112,122,148],"maps":[26,89],"generated":[29],"by":[30,107],"vision":[32],"encoder":[33],"to":[34,56,74,92,110,117,154,178],"shared":[36],"embedding":[37,67],"space":[38,121],"the":[40,61,65,103,108,120,123],"LLM":[41,97,109,124],"while":[42],"preserving":[43],"semantic":[44],"similarity.":[45],"Existing":[46],"connectors,":[47],"as":[49],"multilayer":[50],"perceptrons":[51],"(MLPs),":[52],"lack":[53],"inductive":[54],"bias":[55],"constrain":[57],"within":[60],"linguistic":[62,104],"structure":[63],"LLM's":[66],"space,":[68],"making":[69],"them":[70],"data-hungry":[71],"and":[72,138,165,176],"prone":[73],"cross-modal":[75],"misalignment.":[76],"In":[77],"this":[78],"work,":[79],"we":[80],"propose":[81],"novel":[83],"vision-text":[84],"alignment":[85,156],"method,":[86],"AlignVLM,":[87],"weighted":[94],"average":[95],"text":[98],"embeddings.":[99],"Our":[100,144],"approach":[101],"leverages":[102],"priors":[105],"encoded":[106],"ensure":[111],"are":[115,141],"mapped":[116],"regions":[118],"can":[125],"effectively":[126],"interpret.":[127],"AlignVLM":[128,149],"particularly":[130],"effective":[131],"for":[132],"document":[133,162],"understanding":[134,163],"tasks,":[135],"where":[136],"textual":[139],"modalities":[140],"highly":[142],"correlated.":[143],"extensive":[145],"experiments":[146],"show":[147],"achieves":[150],"state-of-the-art":[151],"compared":[153],"prior":[155],"methods,":[157],"larger":[159],"gains":[160],"tasks":[164],"under":[166],"low-resource":[167],"setups.":[168],"We":[169],"provide":[170],"further":[171],"analysis":[172],"demonstrating":[173],"its":[174],"efficiency":[175],"robustness":[177],"noise.":[179]},"counts_by_year":[],"updated_date":"2026-04-15T08:11:43.952461","created_date":"2025-10-10T00:00:00"}
