{"id":"https://openalex.org/W7133535964","doi":"https://doi.org/10.48550/arxiv.2603.02767","title":"ITO: Images and Texts as One via Synergizing Multiple Alignment and Training-Time Fusion","display_name":"ITO: Images and Texts as One via Synergizing Multiple Alignment and Training-Time Fusion","publication_year":2026,"publication_date":"2026-03-03","ids":{"openalex":"https://openalex.org/W7133535964","doi":"https://doi.org/10.48550/arxiv.2603.02767"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2603.02767","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5071582484","display_name":"Hanpeng Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Liu, Hanpeng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128096638","display_name":"Yaqian Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Yaqian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128054643","display_name":"Zidan Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Zidan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5005741692","display_name":"Shuoxi Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Shuoxi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102224112","display_name":"Zonglin Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Zonglin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120826306","display_name":"Zihao Bo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bo, Zihao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120826307","display_name":"Rinyoichi Takezoe","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Takezoe, Rinyoichi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128063568","display_name":"Kaiwen Long","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Long, Kaiwen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5128095372","display_name":"Kun He","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"He, Kun","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5071582484"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.5293999910354614,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.5293999910354614,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.30160000920295715,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.07349999994039536,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/discriminative-model","display_name":"Discriminative model","score":0.7878999710083008},{"id":"https://openalex.org/keywords/fusion","display_name":"Fusion","score":0.633400022983551},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.5958999991416931},{"id":"https://openalex.org/keywords/modality","display_name":"Modality (human\u2013computer interaction)","score":0.4717000126838684},{"id":"https://openalex.org/keywords/bridging","display_name":"Bridging (networking)","score":0.38580000400543213},{"id":"https://openalex.org/keywords/image-fusion","display_name":"Image fusion","score":0.3855000138282776}],"concepts":[{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.7878999710083008},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7565000057220459},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6517000198364258},{"id":"https://openalex.org/C158525013","wikidata":"https://www.wikidata.org/wiki/Q2593739","display_name":"Fusion","level":2,"score":0.633400022983551},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.5958999991416931},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.47200000286102295},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.4717000126838684},{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.38580000400543213},{"id":"https://openalex.org/C69744172","wikidata":"https://www.wikidata.org/wiki/Q860822","display_name":"Image fusion","level":3,"score":0.3855000138282776},{"id":"https://openalex.org/C48677424","wikidata":"https://www.wikidata.org/wiki/Q6888088","display_name":"Mode (computer interface)","level":2,"score":0.3093000054359436},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.30730000138282776},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.30239999294281006},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.289900004863739},{"id":"https://openalex.org/C33954974","wikidata":"https://www.wikidata.org/wiki/Q486494","display_name":"Sensor fusion","level":2,"score":0.2734000086784363},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.2597000002861023},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.25099998712539673}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2603.02767","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2603.02767","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.02767","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2603.02767","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/10","display_name":"Reduced inequalities","score":0.7097004652023315}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Image-text":[0],"contrastive":[1,123],"pretraining":[2],"has":[3],"become":[4],"a":[5,27,47,101],"dominant":[6],"paradigm":[7],"for":[8],"visual":[9],"representation":[10],"learning,":[11],"yet":[12],"existing":[13],"methods":[14],"often":[15,119],"yield":[16],"representations":[17],"that":[18,75,90],"remain":[19],"partially":[20],"organized":[21],"by":[22,41],"modality.":[23],"We":[24],"propose":[25],"ITO,":[26],"framework":[28],"addressing":[29],"this":[30],"limitation":[31],"through":[32],"two":[33],"synergistic":[34],"mechanisms.":[35],"Multimodal":[36],"multiple":[37,92],"alignment":[38,93],"enriches":[39],"supervision":[40],"mining":[42],"diverse":[43],"image-text":[44],"correspondences,":[45],"while":[46,91],"lightweight":[48],"training-time":[49,97],"multimodal":[50,85],"fusion":[51,59,98],"module":[52,60],"enforces":[53],"structured":[54],"cross-modal":[55],"interaction.":[56],"Crucially,":[57],"the":[58,66,107,116],"is":[61],"discarded":[62],"at":[63],"inference,":[64],"preserving":[65],"efficiency":[67],"of":[68],"standard":[69],"dual-encoder":[70],"architectures.":[71],"Extensive":[72],"experiments":[73],"show":[74],"ITO":[76],"consistently":[77],"outperforms":[78],"strong":[79],"baselines":[80],"across":[81],"classification,":[82],"retrieval,":[83],"and":[84,110],"benchmarks.":[86],"Our":[87],"analysis":[88],"reveals":[89],"drives":[94],"discriminative":[95],"power,":[96],"acts":[99],"as":[100],"critical":[102],"structural":[103],"regularizer":[104],"--":[105],"eliminating":[106],"modality":[108],"gap":[109],"stabilizing":[111],"training":[112],"dynamics":[113],"to":[114],"prevent":[115],"early":[117],"saturation":[118],"observed":[120],"in":[121],"aggressive":[122],"learning.":[124]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-03-05T00:00:00"}
