{"id":"https://openalex.org/W7148457867","doi":"https://doi.org/10.48550/arxiv.2604.00279","title":"The Geometry of Compromise: Unlocking Generative Capabilities via Controllable Modality Alignment","display_name":"The Geometry of Compromise: Unlocking Generative Capabilities via Controllable Modality Alignment","publication_year":2026,"publication_date":"2026-03-31","ids":{"openalex":"https://openalex.org/W7148457867","doi":"https://doi.org/10.48550/arxiv.2604.00279"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.00279","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.00279","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.00279","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5132804194","display_name":"Hongyuan Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Liu, Hongyuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132745454","display_name":"Qinli Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Qinli","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132821940","display_name":"Wen Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Wen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132809946","display_name":"Zhong Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Zhong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132830746","display_name":"Jiaming Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Jiaming","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132823032","display_name":"Wei Han","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Han, Wei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5025958247","display_name":"Zhili Qin","orcid":"https://orcid.org/0009-0004-8030-9522"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qin, Zhili","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132751570","display_name":"Jinxia Guo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guo, Jinxia","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5132762131","display_name":"Junming Shao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shao, Junming","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5132804194"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9848999977111816,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9848999977111816,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.004999999888241291,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.002099999925121665,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/centroid","display_name":"Centroid","score":0.7516000270843506},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.6037999987602234},{"id":"https://openalex.org/keywords/cluster-analysis","display_name":"Cluster analysis","score":0.5372999906539917},{"id":"https://openalex.org/keywords/offset","display_name":"Offset (computer science)","score":0.5328999757766724},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.4221000075340271},{"id":"https://openalex.org/keywords/modality","display_name":"Modality (human\u2013computer interaction)","score":0.4047999978065491},{"id":"https://openalex.org/keywords/boosting","display_name":"Boosting (machine learning)","score":0.3752000033855438},{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.36880001425743103},{"id":"https://openalex.org/keywords/hierarchical-clustering","display_name":"Hierarchical clustering","score":0.3653999865055084}],"concepts":[{"id":"https://openalex.org/C146599234","wikidata":"https://www.wikidata.org/wiki/Q511093","display_name":"Centroid","level":2,"score":0.7516000270843506},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.6037999987602234},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.582099974155426},{"id":"https://openalex.org/C73555534","wikidata":"https://www.wikidata.org/wiki/Q622825","display_name":"Cluster analysis","level":2,"score":0.5372999906539917},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5360999703407288},{"id":"https://openalex.org/C175291020","wikidata":"https://www.wikidata.org/wiki/Q1156822","display_name":"Offset (computer science)","level":2,"score":0.5328999757766724},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.48330000042915344},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.4221000075340271},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.4047999978065491},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.39910000562667847},{"id":"https://openalex.org/C46686674","wikidata":"https://www.wikidata.org/wiki/Q466303","display_name":"Boosting (machine learning)","level":2,"score":0.3752000033855438},{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.36880001425743103},{"id":"https://openalex.org/C92835128","wikidata":"https://www.wikidata.org/wiki/Q1277447","display_name":"Hierarchical clustering","level":3,"score":0.3653999865055084},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3287000060081482},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.3149999976158142},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.30070000886917114},{"id":"https://openalex.org/C92757383","wikidata":"https://www.wikidata.org/wiki/Q382497","display_name":"Affine transformation","level":2,"score":0.2969000041484833},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.29660001397132874},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.289000004529953},{"id":"https://openalex.org/C79337645","wikidata":"https://www.wikidata.org/wiki/Q779824","display_name":"Outlier","level":2,"score":0.2856999933719635},{"id":"https://openalex.org/C193581530","wikidata":"https://www.wikidata.org/wiki/Q683778","display_name":"Structured light","level":2,"score":0.27970001101493835},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.2621999979019165},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2605000138282776},{"id":"https://openalex.org/C23379248","wikidata":"https://www.wikidata.org/wiki/Q200904","display_name":"Epipolar geometry","level":3,"score":0.259799987077713},{"id":"https://openalex.org/C110121322","wikidata":"https://www.wikidata.org/wiki/Q865811","display_name":"Distribution (mathematics)","level":2,"score":0.2549000084400177},{"id":"https://openalex.org/C2779808786","wikidata":"https://www.wikidata.org/wiki/Q6664603","display_name":"Locality","level":2,"score":0.25380000472068787},{"id":"https://openalex.org/C48677424","wikidata":"https://www.wikidata.org/wiki/Q6888088","display_name":"Mode (computer interface)","level":2,"score":0.25209999084472656}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.00279","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.00279","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.00279","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.00279","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Quality Education","score":0.5258432626724243,"id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Vision-Language":[0],"Models":[1],"(VLMs)":[2],"such":[3,35],"as":[4,24,36],"CLIP":[5],"learn":[6],"a":[7,21,76,80,123,144],"shared":[8],"embedding":[9],"space":[10],"for":[11,120],"images":[12],"and":[13,38,79,83,138,199,211],"text,":[14],"yet":[15],"their":[16],"representations":[17],"remain":[18],"geometrically":[19],"separated,":[20],"phenomenon":[22],"known":[23],"the":[25,59,65,72,86,90,101,140,170,186,206],"modality":[26,73,171],"gap.":[27],"This":[28],"gap":[29,74,172,187],"limits":[30],"tasks":[31],"requiring":[32],"cross-modal":[33,47,94,166],"interchangeability,":[34],"captioning":[37,200],"joint":[39],"clustering.":[40],"Existing":[41],"post-processing":[42],"approaches":[43],"can":[44],"partially":[45],"improve":[46],"compatibility;":[48],"however,":[49],"we":[50,115],"show":[51],"through":[52],"geometric":[53],"analysis":[54],"that":[55,85,126,161],"they":[56],"primarily":[57],"reduce":[58],"global":[60],"centroid":[61,136],"offset":[62],"while":[63,143],"leaving":[64],"underlying":[66],"distributional":[67,141],"mismatch":[68],"intact.":[69],"We":[70],"decompose":[71],"into":[75],"Centroid":[77],"Gap":[78,88,105],"Distribution":[81,87],"Gap,":[82],"demonstrate":[84,160],"is":[89,106,173,188],"true":[91],"predictor":[92],"of":[93],"task":[95],"quality":[96],"($R^2":[97,108],"=":[98,109],"0.986$),":[99],"whereas":[100],"commonly":[102],"used":[103],"Raw":[104],"misleading":[107],"0.691$).":[110],"Motivated":[111],"by":[112,175,190,203],"this":[113],"observation,":[114],"propose":[116],"TPC-CMA":[117],"(Three-Phase":[118],"Curriculum":[119],"Cross-Modal":[121],"Alignment),":[122],"fine-tuning":[124],"framework":[125],"explicitly":[127],"reduces":[128],"both":[129],"components.":[130],"The":[131],"proposed":[132],"CMA":[133],"jointly":[134],"mitigates":[135],"offsets":[137],"reshapes":[139],"structure,":[142],"three-phase":[145],"curriculum":[146],"with":[147,177],"gradient-aware":[148],"scheduling":[149],"progressively":[150],"introduces":[151],"alignment":[152,184],"during":[153],"training":[154],"to":[155,197],"enable":[156],"stable":[157],"optimization.":[158],"Experiments":[159],"our":[162],"method":[163],"significantly":[164],"improves":[165,194],"alignment.":[167],"With":[168],"$\u03b1_{\\text{target}}{=}0.05$,":[169],"reduced":[174,189],"66.6\\%":[176],"only":[178],"4.84\\%":[179],"accuracy":[180],"drop.":[181],"Under":[182],"stronger":[183],"($\u03b1_{\\text{target}}{=}0.5$),":[185],"82.3\\%,":[191],"clustering":[192],"ARI":[193],"from":[195],"0.318":[196],"0.516,":[198],"CIDEr":[201],"increases":[202],"57.1\\%":[204],"over":[205],"original":[207],"model.":[208],"Our":[209],"code":[210],"pre-trained":[212],"models":[213],"will":[214],"be":[215],"made":[216],"publicly":[217],"available":[218],"upon":[219],"acceptance.":[220]},"counts_by_year":[],"updated_date":"2026-04-03T16:44:17.987007","created_date":"2026-04-03T00:00:00"}
