{"id":"https://openalex.org/W7131372823","doi":"https://doi.org/10.48550/arxiv.2602.19974","title":"RL-RIG: A Generative Spatial Reasoner via Intrinsic Reflection","display_name":"RL-RIG: A Generative Spatial Reasoner via Intrinsic Reflection","publication_year":2026,"publication_date":"2026-02-23","ids":{"openalex":"https://openalex.org/W7131372823","doi":"https://doi.org/10.48550/arxiv.2602.19974"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2602.19974","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.19974","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2602.19974","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5126827879","display_name":"Tianyu Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Wang, Tianyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126805333","display_name":"Zhiyuan Ma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ma, Zhiyuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126794115","display_name":"Qian Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Qian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126798356","display_name":"Xinyi Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Xinyi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5104089217","display_name":"Xinwei Long","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Long, Xinwei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5126795845","display_name":"Bowen Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Bowen","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5126827879"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.7325999736785889,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.7325999736785889,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.1459999978542328,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11574","display_name":"Artificial Intelligence in Games","score":0.008799999952316284,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/intuition","display_name":"Intuition","score":0.6590999960899353},{"id":"https://openalex.org/keywords/semantic-reasoner","display_name":"Semantic reasoner","score":0.6424000263214111},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.5508999824523926},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.5353999733924866},{"id":"https://openalex.org/keywords/spatial-intelligence","display_name":"Spatial intelligence","score":0.5228999853134155},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.505299985408783},{"id":"https://openalex.org/keywords/graph","display_name":"Graph","score":0.42910000681877136},{"id":"https://openalex.org/keywords/visual-reasoning","display_name":"Visual reasoning","score":0.36239999532699585}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7038999795913696},{"id":"https://openalex.org/C132010649","wikidata":"https://www.wikidata.org/wiki/Q189222","display_name":"Intuition","level":2,"score":0.6590999960899353},{"id":"https://openalex.org/C9616225","wikidata":"https://www.wikidata.org/wiki/Q3929429","display_name":"Semantic reasoner","level":2,"score":0.6424000263214111},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.637499988079071},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.5508999824523926},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.5353999733924866},{"id":"https://openalex.org/C155911833","wikidata":"https://www.wikidata.org/wiki/Q3817354","display_name":"Spatial intelligence","level":2,"score":0.5228999853134155},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.505299985408783},{"id":"https://openalex.org/C132525143","wikidata":"https://www.wikidata.org/wiki/Q141488","display_name":"Graph","level":2,"score":0.42910000681877136},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.4171999990940094},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.36239999532699585},{"id":"https://openalex.org/C55020928","wikidata":"https://www.wikidata.org/wiki/Q3813865","display_name":"Image quality","level":3,"score":0.3528999984264374},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.34119999408721924},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.32600000500679016},{"id":"https://openalex.org/C2989087649","wikidata":"https://www.wikidata.org/wiki/Q176953","display_name":"Image synthesis","level":3,"score":0.2987000048160553},{"id":"https://openalex.org/C64754055","wikidata":"https://www.wikidata.org/wiki/Q7574053","display_name":"Spatial contextual awareness","level":2,"score":0.2921999990940094},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.2903999984264374},{"id":"https://openalex.org/C179372163","wikidata":"https://www.wikidata.org/wiki/Q1406181","display_name":"Scene graph","level":3,"score":0.2768000066280365},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.25929999351501465},{"id":"https://openalex.org/C65682993","wikidata":"https://www.wikidata.org/wiki/Q1056451","display_name":"Reflection (computer programming)","level":2,"score":0.2524000108242035},{"id":"https://openalex.org/C9417928","wikidata":"https://www.wikidata.org/wiki/Q1070689","display_name":"Image processing","level":3,"score":0.25}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2602.19974","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.19974","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2602.19974","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.19974","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Recent":[0],"advancements":[1],"in":[2,9,84,178,186],"image":[3,15,85,119,187],"generation":[4,16,86,99],"have":[5],"achieved":[6],"impressive":[7],"results":[8,165],"producing":[10],"high-quality":[11],"images.":[12],"However,":[13],"existing":[14,170],"models":[17,173],"still":[18],"generally":[19],"struggle":[20],"with":[21,42,95],"a":[22,52,73,122,150],"spatial":[23,33,142,156,184],"reasoning":[24,82,185],"dilemma,":[25,48],"lacking":[26],"the":[27,36,78,89,93,107,114,155],"ability":[28,83],"to":[29,76,105,153,176],"accurately":[30],"capture":[31],"fine-grained":[32],"relationships":[34],"from":[35],"prompt":[37],"and":[38,69,113,148,182],"correctly":[39],"generate":[40],"scenes":[41],"structural":[43],"integrity.":[44],"To":[45,91],"mitigate":[46],"this":[47],"we":[49,101],"propose":[50],"RL-RIG,":[51],"Reinforcement":[53],"Learning":[54],"framework":[55],"for":[56,87,110,117],"Reflection-based":[57],"Image":[58,115],"Generation.":[59],"Our":[60],"architecture":[61],"comprises":[62],"four":[63],"primary":[64],"components:":[65],"Diffuser,":[66,71],"Checker,":[67],"Actor,":[68],"Inverse":[70],"following":[72],"Generate-Reflect-Edit":[74],"paradigm":[75],"spark":[77],"Chain":[79],"of":[80,158,180],"Thought":[81],"addressing":[88],"dilemma.":[90],"equip":[92],"model":[94],"better":[96,118],"intuition":[97],"over":[98],"trajectories,":[100],"further":[102],"develop":[103],"Reflection-GRPO":[104],"train":[106],"VLM":[108],"Actor":[109],"edit":[111],"prompts":[112],"Editor":[116],"quality":[120],"under":[121],"given":[123],"prompt,":[124],"respectively.":[125],"Unlike":[126],"traditional":[127],"approaches":[128],"that":[129,167],"solely":[130],"produce":[131],"visually":[132],"stunning":[133],"yet":[134],"structurally":[135],"unreasonable":[136],"content,":[137],"our":[138],"evaluation":[139],"metrics":[140],"prioritize":[141],"accuracy,":[143],"utilizing":[144],"Scene":[145],"Graph":[146],"IoU":[147],"employing":[149],"VLM-as-a-Judge":[151],"strategy":[152],"assess":[154],"consistency":[157],"generated":[159],"images":[160],"on":[161],"LAION-SG":[162],"dataset.":[163],"Experimental":[164],"show":[166],"RL-RIG":[168],"outperforms":[169],"state-of-the-art":[171],"open-source":[172],"by":[174],"up":[175],"11%":[177],"terms":[179],"controllable":[181],"precise":[183],"generation.":[188]},"counts_by_year":[],"updated_date":"2026-02-26T06:34:08.959763","created_date":"2026-02-26T00:00:00"}
