{"id":"https://openalex.org/W4416613166","doi":"https://doi.org/10.1109/iccv51701.2025.00195","title":"Instruction-Oriented Preference Alignment for Enhancing Multi-Modal Comprehension Capability of MLLMs","display_name":"Instruction-Oriented Preference Alignment for Enhancing Multi-Modal Comprehension Capability of MLLMs","publication_year":2025,"publication_date":"2025-10-19","ids":{"openalex":"https://openalex.org/W4416613166","doi":"https://doi.org/10.1109/iccv51701.2025.00195"},"language":"en","primary_location":{"id":"doi:10.1109/iccv51701.2025.00195","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iccv51701.2025.00195","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/CVF International Conference on Computer Vision (ICCV)","raw_type":"proceedings-article"},"type":"article","indexed_in":["arxiv","crossref","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2503.20309","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101845790","display_name":"Zitian Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I82880672","display_name":"Beihang University","ror":"https://ror.org/00wk2mp56","country_code":"CN","type":"education","lineage":["https://openalex.org/I82880672"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Zitian Wang","raw_affiliation_strings":["Beihang University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Beihang University","institution_ids":["https://openalex.org/I82880672"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Yue Liao","orcid":null},"institutions":[{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Yue Liao","raw_affiliation_strings":["National University of Singapore"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"National University of Singapore","institution_ids":["https://openalex.org/I165932596"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5088332394","display_name":"Rong Kang","orcid":"https://orcid.org/0000-0001-5998-492X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kang Rong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5076304703","display_name":"Fengyun Rao","orcid":"https://orcid.org/0000-0002-2868-2088"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fengyun Rao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103072699","display_name":"Yibo Yang","orcid":"https://orcid.org/0000-0003-0530-7231"},"institutions":[{"id":"https://openalex.org/I71920554","display_name":"King Abdullah University of Science and Technology","ror":"https://ror.org/01q3tbs38","country_code":"SA","type":"education","lineage":["https://openalex.org/I71920554"]}],"countries":["SA"],"is_corresponding":false,"raw_author_name":"Yibo Yang","raw_affiliation_strings":["King Abdullah University of Science and Technology"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"King Abdullah University of Science and Technology","institution_ids":["https://openalex.org/I71920554"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100330142","display_name":"Si Liu","orcid":"https://orcid.org/0000-0003-3578-7432"},"institutions":[{"id":"https://openalex.org/I82880672","display_name":"Beihang University","ror":"https://ror.org/00wk2mp56","country_code":"CN","type":"education","lineage":["https://openalex.org/I82880672"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Si Liu","raw_affiliation_strings":["Beihang University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Beihang University","institution_ids":["https://openalex.org/I82880672"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5101845790"],"corresponding_institution_ids":["https://openalex.org/I82880672"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.33905029,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"2010","last_page":"2021"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.4056999981403351,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.4056999981403351,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.33550000190734863,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11273","display_name":"Advanced Graph Neural Networks","score":0.051500000059604645,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/preference","display_name":"Preference","score":0.8050000071525574},{"id":"https://openalex.org/keywords/construct","display_name":"Construct (python library)","score":0.6349999904632568},{"id":"https://openalex.org/keywords/comprehension","display_name":"Comprehension","score":0.6136999726295471},{"id":"https://openalex.org/keywords/preference-learning","display_name":"Preference learning","score":0.49459999799728394},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.4778999984264374},{"id":"https://openalex.org/keywords/bridge","display_name":"Bridge (graph theory)","score":0.40540000796318054},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.3919000029563904}],"concepts":[{"id":"https://openalex.org/C2781249084","wikidata":"https://www.wikidata.org/wiki/Q908656","display_name":"Preference","level":2,"score":0.8050000071525574},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6437000036239624},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.6349999904632568},{"id":"https://openalex.org/C511192102","wikidata":"https://www.wikidata.org/wiki/Q5156948","display_name":"Comprehension","level":2,"score":0.6136999726295471},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5863999724388123},{"id":"https://openalex.org/C181204326","wikidata":"https://www.wikidata.org/wiki/Q7239820","display_name":"Preference learning","level":3,"score":0.49459999799728394},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.49050000309944153},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.4778999984264374},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.47029998898506165},{"id":"https://openalex.org/C100776233","wikidata":"https://www.wikidata.org/wiki/Q2532492","display_name":"Bridge (graph theory)","level":2,"score":0.40540000796318054},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.3919000029563904},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.33379998803138733},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.3257000148296356},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3084000051021576},{"id":"https://openalex.org/C2777868144","wikidata":"https://www.wikidata.org/wiki/Q7239817","display_name":"Preference elicitation","level":3,"score":0.29409998655319214},{"id":"https://openalex.org/C2910998592","wikidata":"https://www.wikidata.org/wiki/Q2421902","display_name":"Hand preference","level":3,"score":0.2919999957084656},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.27730000019073486},{"id":"https://openalex.org/C61797465","wikidata":"https://www.wikidata.org/wiki/Q1188986","display_name":"Term (time)","level":2,"score":0.273499995470047},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.2685999870300293}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1109/iccv51701.2025.00195","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iccv51701.2025.00195","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/CVF International Conference on Computer Vision (ICCV)","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2503.20309","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2503.20309","pdf_url":"https://arxiv.org/pdf/2503.20309","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2503.20309","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2503.20309","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2503.20309","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2503.20309","pdf_url":"https://arxiv.org/pdf/2503.20309","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1669214561","display_name":null,"funder_award_id":"62461160308,U23B2010","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G8986216942","display_name":null,"funder_award_id":"2022ZD0115502","funder_id":"https://openalex.org/F4320335777","funder_display_name":"National Key Research and Development Program of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320335777","display_name":"National Key Research and Development Program of China","ror":null}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Preference":[0,53],"alignment":[1,24,63],"has":[2],"emerged":[3],"as":[4],"an":[5,73],"effective":[6],"strategy":[7],"to":[8,60,134],"enhance":[9,135],"the":[10,32],"performance":[11],"of":[12],"Multimodal":[13],"Large":[14],"Language":[15],"Models":[16],"(MLLMs)":[17],"following":[18],"supervised":[19],"fine-tuning.":[20],"While":[21],"existing":[22],"preference":[23,75,98],"methods":[25],"predominantly":[26],"target":[27],"hallucination":[28,44,122],"factors,":[29,86],"they":[30],"overlook":[31],"factors":[33],"essential":[34],"for":[35],"multi-modal":[36],"comprehension":[37],"capabilities,":[38],"often":[39],"narrowing":[40],"their":[41],"improvements":[42],"on":[43,113],"mitigation.":[45],"To":[46],"bridge":[47],"this":[48],"gap,":[49],"we":[50],"propose":[51],"Instruction-oriented":[52],"Alignment":[54],"(IPA),":[55],"a":[56,79,96],"scalable":[57],"framework":[58],"designed":[59],"automatically":[61],"construct":[62],"preferences":[64],"grounded":[65],"in":[66,90],"instruction":[67],"fulfillment":[68],"efficacy.":[69],"Our":[70],"method":[71],"involves":[72],"automated":[74],"construction":[76],"coupled":[77],"with":[78],"dedicated":[80],"verification":[81],"process":[82],"that":[83],"identifies":[84],"instruction-oriented":[85],"avoiding":[87],"significant":[88],"variability":[89],"response":[91],"representations.":[92],"Additionally,":[93],"IPA":[94],"incorporates":[95],"progressive":[97],"collection":[99],"pipeline,":[100],"further":[101],"recalling":[102],"challenging":[103],"samples":[104],"through":[105],"model":[106],"self-evolution":[107],"and":[108,127],"reference-guided":[109],"refinement.":[110],"Experiments":[111],"conducted":[112],"Qwen2VL-7B":[114],"demonstrate":[115],"IPA's":[116],"effectiveness":[117],"across":[118],"multiple":[119],"benchmarks,":[120],"including":[121],"evaluation,":[123],"visual":[124],"question":[125],"answering,":[126],"text":[128],"understanding":[129],"tasks,":[130],"highlighting":[131],"its":[132],"capability":[133],"general":[136],"comprehension.":[137]},"counts_by_year":[],"updated_date":"2026-05-06T06:03:25.996018","created_date":"2025-10-10T00:00:00"}
