{"id":"https://openalex.org/W7162552989","doi":"https://doi.org/10.48550/arxiv.2605.26396","title":"Advancing Creative Physical Intelligence in Large Multimodal Models","display_name":"Advancing Creative Physical Intelligence in Large Multimodal Models","publication_year":2026,"publication_date":"2026-05-25","ids":{"openalex":"https://openalex.org/W7162552989","doi":"https://doi.org/10.48550/arxiv.2605.26396"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.26396","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.26396","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.26396","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5137124072","display_name":"Cheng Qian","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qian, Cheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137133788","display_name":"Hyeonjeong Ha","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ha, Hyeonjeong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137113540","display_name":"Jiayu Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Jiayu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137119723","display_name":"Jeonghwan Kim","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kim, Jeonghwan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5038735910","display_name":"Emre Can Acikgoz","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Acikgoz, Emre Can","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137100399","display_name":"Bingxuan Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Bingxuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101074704","display_name":"Kunlun Zhu","orcid":"https://orcid.org/0009-0009-9107-7401"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhu, Kunlun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137108747","display_name":"Jiateng Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Jiateng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125910190","display_name":"Aditi Tiwari","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tiwari, Aditi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137105844","display_name":"Zhenhailong Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Zhenhailong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137108063","display_name":"Xiusi Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Xiusi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5026771837","display_name":"Mahdi Namazifar","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Namazifar, Mahdi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5137129749","display_name":"Heng Ji","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ji, Heng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.5152000188827515,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.5152000188827515,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10906","display_name":"AI-based Problem Solving and Planning","score":0.05739999935030937,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10799","display_name":"Data Visualization and Analytics","score":0.03319999948143959,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/hallucinating","display_name":"Hallucinating","score":0.8989999890327454},{"id":"https://openalex.org/keywords/affordance","display_name":"Affordance","score":0.5999000072479248},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.5242999792098999},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.4797999858856201},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.4442000091075897},{"id":"https://openalex.org/keywords/grounded-theory","display_name":"Grounded theory","score":0.4440999925136566},{"id":"https://openalex.org/keywords/generative-model","display_name":"Generative model","score":0.4025000035762787},{"id":"https://openalex.org/keywords/preference","display_name":"Preference","score":0.3504999876022339}],"concepts":[{"id":"https://openalex.org/C2911011789","wikidata":"https://www.wikidata.org/wiki/Q130741","display_name":"Hallucinating","level":2,"score":0.8989999890327454},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6029999852180481},{"id":"https://openalex.org/C194995250","wikidata":"https://www.wikidata.org/wiki/Q531136","display_name":"Affordance","level":2,"score":0.5999000072479248},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5335000157356262},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.5242999792098999},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.4797999858856201},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.4442000091075897},{"id":"https://openalex.org/C156325361","wikidata":"https://www.wikidata.org/wiki/Q1152864","display_name":"Grounded theory","level":3,"score":0.4440999925136566},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.4059000015258789},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.4025000035762787},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.37470000982284546},{"id":"https://openalex.org/C2781249084","wikidata":"https://www.wikidata.org/wiki/Q908656","display_name":"Preference","level":2,"score":0.3504999876022339},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.33070001006126404},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.32010000944137573},{"id":"https://openalex.org/C2780695315","wikidata":"https://www.wikidata.org/wiki/Q3799040","display_name":"Unobservable","level":2,"score":0.28780001401901245},{"id":"https://openalex.org/C188147891","wikidata":"https://www.wikidata.org/wiki/Q147638","display_name":"Cognitive science","level":1,"score":0.28279998898506165},{"id":"https://openalex.org/C66024118","wikidata":"https://www.wikidata.org/wiki/Q1122506","display_name":"Computational model","level":2,"score":0.27129998803138733},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.2662999927997589},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.26510000228881836},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.2623000144958496},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.2565000057220459},{"id":"https://openalex.org/C2776035091","wikidata":"https://www.wikidata.org/wiki/Q7928819","display_name":"Viewpoints","level":2,"score":0.2563000023365021},{"id":"https://openalex.org/C519536355","wikidata":"https://www.wikidata.org/wiki/Q21021151","display_name":"Repurposing","level":2,"score":0.2554999887943268},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.2529999911785126}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.26396","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.26396","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"Preprint"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.26396","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.26396","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"multimodal":[1],"models":[2,115,195],"(LMMs)":[3],"have":[4],"rapidly":[5],"advanced":[6],"in":[7,24,45,51,71,88,167,201,231],"perception":[8],"and":[9,106,123,126,223,236,242],"reasoning;":[10],"however,":[11],"it":[12,40],"remains":[13,68],"unclear":[14],"whether":[15],"these":[16],"capabilities":[17],"generalize":[18],"to":[19,64,141,196,218],"discovering":[20],"visually":[21,89,125],"grounded":[22,128,152,166,200],"solutions":[23],"open-ended":[25],"environments,":[26],"beyond":[27],"pattern":[28],"recognition.":[29],"In":[30,207],"such":[31],"settings,":[32],"intelligence":[33],"requires":[34],"more":[35],"than":[36],"answering":[37],"well-posed":[38],"questions:":[39],"involves":[41],"identifying":[42],"how":[43,114],"elements":[44],"a":[46,81,97,185],"scene":[47],"can":[48],"be":[49],"repurposed":[50],"non-obvious":[52],"yet":[53],"physically":[54,91,127],"feasible":[55],"ways.":[56],"This":[57],"form":[58],"of":[59,103,113,143],"creative":[60,85,181],"problem-solving":[61],"is":[62],"central":[63],"human":[65],"intelligence,":[66],"but":[67,146],"largely":[69],"untested":[70],"current":[72,134],"benchmarks.":[73],"To":[74],"evaluate":[75],"this":[76,172],"ability,":[77],"we":[78,175,193,209],"introduce":[79],"MM-CreativityBench,":[80],"benchmark":[82],"for":[83],"affordance-grounded":[84,177],"tool":[86,182],"use":[87,183],"rich,":[90],"constrained":[92],"environments.":[93],"Each":[94],"instance":[95],"presents":[96],"scenario":[98],"image":[99],"with":[100],"structured":[101],"views":[102],"candidate":[104],"entities":[105,235],"their":[107],"parts,":[108,161,237],"enabling":[109],"fine-grained,":[110],"interactive":[111],"evaluation":[112],"iteratively":[116],"inspect":[117],"the":[118,168,233],"scene,":[119],"identify":[120],"relevant":[121,157],"affordances,":[122],"compose":[124],"solutions.":[129],"Our":[130,226],"experiments":[131],"show":[132,228],"that":[133],"LMMs":[135],"often":[136,155],"fall":[137],"short,":[138],"not":[139,150,165],"due":[140],"lack":[142],"generative":[144],"capability,":[145],"because":[147],"they":[148],"do":[149],"sustain":[151],"exploration.":[153],"Models":[154],"overlook":[156],"entities,":[158],"under-examine":[159],"critical":[160],"or":[162],"hallucinate":[163],"attributes":[164],"image.":[169],"Motivated":[170],"by":[171],"failure":[173],"mode,":[174],"propose":[176],"alignment,":[178],"which":[179],"casts":[180],"as":[184],"preference":[186],"learning":[187],"problem.":[188],"Using":[189],"Direct":[190],"Preference":[191],"Optimization,":[192],"encourage":[194],"prefer":[197],"attribute-affordance":[198],"reasoning":[199],"visual":[202],"evidence":[203],"over":[204],"hallucinated":[205],"alternatives.":[206],"addition,":[208],"incorporate":[210],"supervision":[211],"derived":[212],"from":[213],"an":[214],"affordance":[215],"knowledge":[216],"base":[217],"guide":[219],"broader":[220],"entity":[221],"exploration":[222],"multi-turn":[224],"planning.":[225],"results":[227],"consistent":[229],"gains":[230],"selecting":[232],"correct":[234],"while":[238],"substantially":[239],"reducing":[240],"hallucination":[241],"grounding-related":[243],"errors.":[244]},"counts_by_year":[],"updated_date":"2026-07-01T06:00:48.157686","created_date":"2026-05-28T00:00:00"}
