{"id":"https://openalex.org/W4415538072","doi":"https://doi.org/10.1145/3746027.3754931","title":"InstructCrop: Teaching Multimodal Large Language Models to Crop Aesthetic Images","display_name":"InstructCrop: Teaching Multimodal Large Language Models to Crop Aesthetic Images","publication_year":2025,"publication_date":"2025-10-25","ids":{"openalex":"https://openalex.org/W4415538072","doi":"https://doi.org/10.1145/3746027.3754931"},"language":null,"primary_location":{"id":"doi:10.1145/3746027.3754931","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3754931","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5044278717","display_name":"Xiangfei Sheng","orcid":"https://orcid.org/0009-0004-8468-1970"},"institutions":[{"id":"https://openalex.org/I149594827","display_name":"Xidian University","ror":"https://ror.org/05s92vm98","country_code":"CN","type":"education","lineage":["https://openalex.org/I149594827"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Xiangfei Sheng","raw_affiliation_strings":["School of Artificial Intelligence, Xidian University, Xi'an, China"],"affiliations":[{"raw_affiliation_string":"School of Artificial Intelligence, Xidian University, Xi'an, China","institution_ids":["https://openalex.org/I149594827"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111226567","display_name":"Peirong Xie","orcid":null},"institutions":[{"id":"https://openalex.org/I149594827","display_name":"Xidian University","ror":"https://ror.org/05s92vm98","country_code":"CN","type":"education","lineage":["https://openalex.org/I149594827"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Pangu Xie","raw_affiliation_strings":["School of Artificial Intelligence, Xidian University, Xi'an, China"],"affiliations":[{"raw_affiliation_string":"School of Artificial Intelligence, Xidian University, Xi'an, China","institution_ids":["https://openalex.org/I149594827"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111723399","display_name":"Zou Wei-Dong","orcid":null},"institutions":[{"id":"https://openalex.org/I149594827","display_name":"Xidian University","ror":"https://ror.org/05s92vm98","country_code":"CN","type":"education","lineage":["https://openalex.org/I149594827"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Weidong Zou","raw_affiliation_strings":["School of Artificial Intelligence, Xidian University, Xi'an, China"],"affiliations":[{"raw_affiliation_string":"School of Artificial Intelligence, Xidian University, Xi'an, China","institution_ids":["https://openalex.org/I149594827"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5063520709","display_name":"Pengfei Chen","orcid":"https://orcid.org/0000-0002-0509-3782"},"institutions":[{"id":"https://openalex.org/I149594827","display_name":"Xidian University","ror":"https://ror.org/05s92vm98","country_code":"CN","type":"education","lineage":["https://openalex.org/I149594827"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Pengfei Chen","raw_affiliation_strings":["School of Artificial Intelligence, Xidian University, Xi'an, China"],"affiliations":[{"raw_affiliation_string":"School of Artificial Intelligence, Xidian University, Xi'an, China","institution_ids":["https://openalex.org/I149594827"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5009067244","display_name":"Tong Zhu","orcid":"https://orcid.org/0000-0002-3082-7848"},"institutions":[{"id":"https://openalex.org/I37987034","display_name":"Guangzhou University","ror":"https://ror.org/05ar8rn06","country_code":"CN","type":"education","lineage":["https://openalex.org/I37987034"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Tong Zhu","raw_affiliation_strings":["School of Computer Science and Network Engineering, Guangzhou University, Guangzhou, China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Network Engineering, Guangzhou University, Guangzhou, China","institution_ids":["https://openalex.org/I37987034"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5033615240","display_name":"Leida Li","orcid":"https://orcid.org/0000-0001-9069-8796"},"institutions":[{"id":"https://openalex.org/I149594827","display_name":"Xidian University","ror":"https://ror.org/05s92vm98","country_code":"CN","type":"education","lineage":["https://openalex.org/I149594827"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Leida Li","raw_affiliation_strings":["School of Artificial Intelligence and State Key Laboratory of Electromechanical Integrated Manufacturing of High-Performance Electronic Equipments, Xidian University, Xi'an, China"],"affiliations":[{"raw_affiliation_string":"School of Artificial Intelligence and State Key Laboratory of Electromechanical Integrated Manufacturing of High-Performance Electronic Equipments, Xidian University, Xi'an, China","institution_ids":["https://openalex.org/I149594827"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5044278717"],"corresponding_institution_ids":["https://openalex.org/I149594827"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.41241108,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"6830","last_page":"6839"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T13071","display_name":"Digital Storytelling and Education","score":0.9782000184059143,"subfield":{"id":"https://openalex.org/subfields/3616","display_name":"Speech and Hearing"},"field":{"id":"https://openalex.org/fields/36","display_name":"Health Professions"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},"topics":[{"id":"https://openalex.org/T13071","display_name":"Digital Storytelling and Education","score":0.9782000184059143,"subfield":{"id":"https://openalex.org/subfields/3616","display_name":"Speech and Hearing"},"field":{"id":"https://openalex.org/fields/36","display_name":"Health Professions"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9154999852180481,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.911300003528595,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/cropping","display_name":"Cropping","score":0.8367999792098999},{"id":"https://openalex.org/keywords/usability","display_name":"Usability","score":0.7350999712944031},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.5647000074386597},{"id":"https://openalex.org/keywords/data-driven","display_name":"Data-driven","score":0.4779999852180481},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.41819998621940613},{"id":"https://openalex.org/keywords/domain","display_name":"Domain (mathematical analysis)","score":0.3986000120639801},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.34529998898506165}],"concepts":[{"id":"https://openalex.org/C13558536","wikidata":"https://www.wikidata.org/wiki/Q785116","display_name":"Cropping","level":3,"score":0.8367999792098999},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7465999722480774},{"id":"https://openalex.org/C170130773","wikidata":"https://www.wikidata.org/wiki/Q216378","display_name":"Usability","level":2,"score":0.7350999712944031},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.5647000074386597},{"id":"https://openalex.org/C2780440489","wikidata":"https://www.wikidata.org/wiki/Q5227278","display_name":"Data-driven","level":2,"score":0.4779999852180481},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4713999927043915},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.41819998621940613},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.3986000120639801},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.38659998774528503},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.38109999895095825},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3684999942779541},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.34529998898506165},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.334199994802475},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.32679998874664307},{"id":"https://openalex.org/C26713055","wikidata":"https://www.wikidata.org/wiki/Q245962","display_name":"Implementation","level":2,"score":0.31940001249313354},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.28299999237060547},{"id":"https://openalex.org/C2776394811","wikidata":"https://www.wikidata.org/wiki/Q3634497","display_name":"Cropping system","level":3,"score":0.2590000033378601},{"id":"https://openalex.org/C9417928","wikidata":"https://www.wikidata.org/wiki/Q1070689","display_name":"Image processing","level":3,"score":0.25360000133514404}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3746027.3754931","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3754931","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":22,"referenced_works":["https://openalex.org/W2051596736","https://openalex.org/W2069639113","https://openalex.org/W2100618994","https://openalex.org/W2754213847","https://openalex.org/W2775725209","https://openalex.org/W2804743778","https://openalex.org/W2807107013","https://openalex.org/W2944041622","https://openalex.org/W2963010685","https://openalex.org/W2963150697","https://openalex.org/W2963312801","https://openalex.org/W2964332053","https://openalex.org/W2973725754","https://openalex.org/W3034512053","https://openalex.org/W3186050100","https://openalex.org/W4387969352","https://openalex.org/W4401726215","https://openalex.org/W4402704606","https://openalex.org/W4402754134","https://openalex.org/W4402978319","https://openalex.org/W4403532799","https://openalex.org/W4404102470"],"related_works":[],"abstract_inverted_index":{"Aesthetic":[0],"Image":[1,95],"Cropping":[2,96],"(AIC)":[3],"aims":[4],"to":[5,32,141],"improve":[6],"the":[7,21,62,113,124,143],"visual":[8],"appeal":[9],"of":[10,131],"images":[11],"by":[12,61,105,127],"removing":[13],"redundant":[14],"content":[15],"while":[16],"preserving":[17],"attractive":[18],"elements.":[19],"Despite":[20],"encouraging":[22],"progresses":[23],"achieved":[24],"in":[25,57],"data-driven":[26],"approaches,":[27],"most":[28],"existing":[29,114],"models":[30],"struggle":[31],"understand":[33,79],"user":[34,80,170],"intentions,":[35],"particularly":[36],"for":[37,86],"diversified":[38],"scenes":[39],"with":[40,169],"multiple":[41],"subjects.":[42],"Moreover,":[43],"they":[44],"can":[45,78],"only":[46],"provide":[47,83],"cropping":[48,87,115,125,144],"results":[49,145],"without":[50],"explanations,":[51],"which":[52,77,166],"further":[53],"restricts":[54],"their":[55],"usability":[56],"real-world":[58],"applications.":[59],"Motivated":[60],"above":[63],"facts,":[64],"we":[65,90,118,138],"introduce":[66],"InstructCrop":[67,159],":":[68],"a":[69,93,102],"multimodal":[70,94],"large":[71],"language":[72],"model":[73,126],"(MLLM)-based":[74],"AIC":[75],"framework,":[76],"instructions":[81],"and":[82,134,146,150,162,173],"explanatory":[84],"reasons":[85],"results.":[88],"Specifically,":[89],"first":[91],"build":[92],"Instruction":[97],"Tuning":[98],"(ICIT)":[99],"dataset":[100],"through":[101],"cost-effective":[103],"paradigm":[104],"generating":[106],"high-quality":[107],"instruction":[108],"tuning":[109],"data":[110],"based":[111],"on":[112,153],"datasets.":[116],"Then,":[117],"embed":[119],"dynamic":[120],"domain":[121],"knowledge":[122],"into":[123],"integrating":[128],"cropping-aware":[129],"experts":[130],"aesthetic":[132],"assessment":[133],"composition":[135],"classification.":[136],"Finally,":[137],"adapt":[139],"MLLMs":[140],"generate":[142],"corresponding":[147],"explanations.":[148],"Quantitative":[149],"qualitative":[151],"experiments":[152],"three":[154],"benchmark":[155],"datasets":[156],"demonstrate":[157],"that":[158],"enables":[160],"effective":[161],"interpretable":[163],"image":[164],"cropping,":[165],"aligns":[167],"better":[168],"intentions.":[171],"Data":[172],"code":[174],"are":[175],"available":[176],"at":[177],"https://github.com/sxfly99/InstructCrop.":[178]},"counts_by_year":[],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-25T00:00:00"}
