{"id":"https://openalex.org/W7147075592","doi":"https://doi.org/10.48550/arxiv.2603.26683","title":"LITTA: Late-Interaction and Test-Time Alignment for Visually-Grounded Multimodal Retrieval","display_name":"LITTA: Late-Interaction and Test-Time Alignment for Visually-Grounded Multimodal Retrieval","publication_year":2026,"publication_date":"2026-03-10","ids":{"openalex":"https://openalex.org/W7147075592","doi":"https://doi.org/10.48550/arxiv.2603.26683"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.26683","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.26683","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.26683","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5132574682","display_name":"Seonok Kim","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Kim, Seonok","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":1,"corresponding_author_ids":["https://openalex.org/A5132574682"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.7753999829292297,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.7753999829292297,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10286","display_name":"Information Retrieval and Search Behavior","score":0.08940000087022781,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.023600000888109207,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/mean-reciprocal-rank","display_name":"Mean reciprocal rank","score":0.5295000076293945},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.5268999934196472},{"id":"https://openalex.org/keywords/query-expansion","display_name":"Query expansion","score":0.5013999938964844},{"id":"https://openalex.org/keywords/image-retrieval","display_name":"Image retrieval","score":0.47530001401901245},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.4505000114440918},{"id":"https://openalex.org/keywords/visual-word","display_name":"Visual Word","score":0.3749000132083893},{"id":"https://openalex.org/keywords/document-retrieval","display_name":"Document retrieval","score":0.3377000093460083}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.824999988079071},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.7541999816894531},{"id":"https://openalex.org/C44083865","wikidata":"https://www.wikidata.org/wiki/Q3853443","display_name":"Mean reciprocal rank","level":2,"score":0.5295000076293945},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.5268999934196472},{"id":"https://openalex.org/C99016210","wikidata":"https://www.wikidata.org/wiki/Q5488129","display_name":"Query expansion","level":2,"score":0.5013999938964844},{"id":"https://openalex.org/C1667742","wikidata":"https://www.wikidata.org/wiki/Q10927554","display_name":"Image retrieval","level":3,"score":0.47530001401901245},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.4505000114440918},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3765000104904175},{"id":"https://openalex.org/C189391414","wikidata":"https://www.wikidata.org/wiki/Q7936579","display_name":"Visual Word","level":4,"score":0.3749000132083893},{"id":"https://openalex.org/C161156560","wikidata":"https://www.wikidata.org/wiki/Q1638872","display_name":"Document retrieval","level":2,"score":0.3377000093460083},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.3199999928474426},{"id":"https://openalex.org/C192028432","wikidata":"https://www.wikidata.org/wiki/Q845739","display_name":"Query language","level":2,"score":0.31049999594688416},{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.3091000020503998},{"id":"https://openalex.org/C2779532271","wikidata":"https://www.wikidata.org/wiki/Q445558","display_name":"Relevance feedback","level":4,"score":0.28780001401901245},{"id":"https://openalex.org/C164226766","wikidata":"https://www.wikidata.org/wiki/Q7293202","display_name":"Rank (graph theory)","level":2,"score":0.26489999890327454},{"id":"https://openalex.org/C2780910867","wikidata":"https://www.wikidata.org/wiki/Q1952416","display_name":"Multimodality","level":2,"score":0.26170000433921814},{"id":"https://openalex.org/C90288658","wikidata":"https://www.wikidata.org/wiki/Q3318149","display_name":"Human\u2013computer information retrieval","level":3,"score":0.2549999952316284}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.26683","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.26683","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.26683","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.26683","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Retrieving":[0],"relevant":[1],"evidence":[2,40,93],"from":[3,81],"visually":[4,122,196],"rich":[5],"documents":[6],"such":[7],"as":[8],"textbooks,":[9],"technical":[10],"reports,":[11],"and":[12,22,29,65,95,133,143,158],"manuals":[13],"is":[14,165],"challenging":[15],"due":[16],"to":[17,91,98,146],"long":[18],"context,":[19],"complex":[20],"layouts,":[21],"weak":[23],"lexical":[24],"overlap":[25],"between":[26],"user":[27,53],"questions":[28],"supporting":[30],"pages.":[31],"We":[32,118],"propose":[33],"LITTA,":[34],"a":[35,52,61,73,189],"query-expansion-centric":[36],"retrieval":[37,42,47,108,125,137],"framework":[38],"for":[39,69,177,194],"page":[41],"that":[43,185],"improves":[44,107,139],"multimodal":[45,115,198],"document":[46,124],"without":[48],"retriever":[49,76],"retraining.":[50],"Given":[51],"query,":[54],"LITTA":[55,120,175],"generates":[56],"complementary":[57],"query":[58,172,186],"variants":[59],"using":[60,72],"large":[62,151],"language":[63],"model":[64],"retrieves":[66],"candidate":[67],"pages":[68],"each":[70],"variant":[71],"frozen":[74],"vision":[75],"with":[77,113,149,155],"late-interaction":[78],"scoring.":[79],"Candidates":[80],"expanded":[82],"queries":[83],"are":[84],"then":[85],"aggregated":[86],"through":[87],"reciprocal":[88],"rank":[89],"fusion":[90],"improve":[92],"coverage":[94],"reduce":[96],"sensitivity":[97],"any":[99],"single":[100],"phrasing.":[101],"This":[102],"simple":[103,190],"test-time":[104],"strategy":[105],"significantly":[106],"robustness":[109],"while":[110],"remaining":[111],"compatible":[112],"existing":[114],"embedding":[116],"indices.":[117],"evaluate":[119],"on":[121],"grounded":[123,197],"tasks":[126],"across":[127],"three":[128],"domains:":[129],"computer":[130],"science,":[131],"pharmaceuticals,":[132],"industrial":[134],"manuals.":[135],"Multi-query":[136],"consistently":[138],"top-k":[140],"accuracy,":[141],"recall,":[142],"MRR":[144],"compared":[145],"single-query":[147],"retrieval,":[148],"particularly":[150],"gains":[152],"in":[153],"domains":[154],"high":[156],"visual":[157],"semantic":[159],"variability.":[160],"Moreover,":[161],"the":[162,169],"accuracy-efficiency":[163],"trade-off":[164],"directly":[166],"controllable":[167],"by":[168],"number":[170],"of":[171],"variants,":[173],"making":[174],"practical":[176],"deployment":[178],"under":[179],"latency":[180],"constraints.":[181],"These":[182],"results":[183],"demonstrate":[184],"expansion":[187],"provides":[188],"yet":[191],"effective":[192],"mechanism":[193],"improving":[195],"retrieval.":[199]},"counts_by_year":[],"updated_date":"2026-04-02T13:53:19.096889","created_date":"2026-04-02T00:00:00"}
