{"id":"https://openalex.org/W7133317682","doi":"https://doi.org/10.48550/arxiv.2603.01586","title":"InterCoG: Towards Spatially Precise Image Editing with Interleaved Chain-of-Grounding Reasoning","display_name":"InterCoG: Towards Spatially Precise Image Editing with Interleaved Chain-of-Grounding Reasoning","publication_year":2026,"publication_date":"2026-03-02","ids":{"openalex":"https://openalex.org/W7133317682","doi":"https://doi.org/10.48550/arxiv.2603.01586"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.01586","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.01586","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.01586","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5122572292","display_name":"Yecong Wan","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Wan, Yecong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127889820","display_name":"Fan Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Fan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128002831","display_name":"Chunwei Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Chunwei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127911811","display_name":"Hao Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Hao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113786812","display_name":"Ming-Wen Shao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shao, Mingwen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5127895442","display_name":"Wangmeng Zuo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zuo, Wangmeng","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5122572292"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.5116000175476074,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.5116000175476074,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.09600000083446503,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12377","display_name":"Digital Humanities and Scholarship","score":0.07259999960660934,"subfield":{"id":"https://openalex.org/subfields/1208","display_name":"Literature and Literary Theory"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/image-editing","display_name":"Image editing","score":0.7559000253677368},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.6313999891281128},{"id":"https://openalex.org/keywords/construct","display_name":"Construct (python library)","score":0.5590000152587891},{"id":"https://openalex.org/keywords/grasp","display_name":"GRASP","score":0.5300999879837036},{"id":"https://openalex.org/keywords/visual-reasoning","display_name":"Visual reasoning","score":0.527999997138977},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.5138000249862671},{"id":"https://openalex.org/keywords/salient","display_name":"Salient","score":0.49140000343322754},{"id":"https://openalex.org/keywords/bounding-overwatch","display_name":"Bounding overwatch","score":0.44839999079704285}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8305000066757202},{"id":"https://openalex.org/C2776674983","wikidata":"https://www.wikidata.org/wiki/Q545981","display_name":"Image editing","level":3,"score":0.7559000253677368},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.6313999891281128},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5805000066757202},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.5590000152587891},{"id":"https://openalex.org/C171268870","wikidata":"https://www.wikidata.org/wiki/Q1486676","display_name":"GRASP","level":2,"score":0.5300999879837036},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.527999997138977},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.5138000249862671},{"id":"https://openalex.org/C2780719617","wikidata":"https://www.wikidata.org/wiki/Q1030752","display_name":"Salient","level":2,"score":0.49140000343322754},{"id":"https://openalex.org/C63584917","wikidata":"https://www.wikidata.org/wiki/Q333286","display_name":"Bounding overwatch","level":2,"score":0.44839999079704285},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.42309999465942383},{"id":"https://openalex.org/C2778355321","wikidata":"https://www.wikidata.org/wiki/Q17079427","display_name":"Identity (music)","level":2,"score":0.3887999951839447},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3783999979496002},{"id":"https://openalex.org/C155911833","wikidata":"https://www.wikidata.org/wiki/Q3817354","display_name":"Spatial intelligence","level":2,"score":0.35760000348091125},{"id":"https://openalex.org/C25343380","wikidata":"https://www.wikidata.org/wiki/Q277521","display_name":"Relation (database)","level":2,"score":0.3472000062465668},{"id":"https://openalex.org/C198082294","wikidata":"https://www.wikidata.org/wiki/Q3399648","display_name":"Position (finance)","level":2,"score":0.33009999990463257},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.3276999890804291},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3163999915122986},{"id":"https://openalex.org/C2780103172","wikidata":"https://www.wikidata.org/wiki/Q1309721","display_name":"Visual Objects","level":3,"score":0.3095000088214874},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.304500013589859},{"id":"https://openalex.org/C2780910867","wikidata":"https://www.wikidata.org/wiki/Q1952416","display_name":"Multimodality","level":2,"score":0.3001999855041504},{"id":"https://openalex.org/C160633673","wikidata":"https://www.wikidata.org/wiki/Q355198","display_name":"Pixel","level":2,"score":0.29510000348091125},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.258899986743927},{"id":"https://openalex.org/C64876066","wikidata":"https://www.wikidata.org/wiki/Q5141226","display_name":"Cognitive neuroscience of visual object recognition","level":3,"score":0.25760000944137573},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.2547999918460846},{"id":"https://openalex.org/C16345878","wikidata":"https://www.wikidata.org/wiki/Q107472979","display_name":"Orientation (vector space)","level":2,"score":0.2524999976158142}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.01586","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.01586","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.01586","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.01586","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Emerging":[0],"unified":[1],"editing":[2,11,22,55,99,114,159,169],"models":[3],"have":[4],"demonstrated":[5],"strong":[6],"capabilities":[7],"in":[8,23,56,107,179],"general":[9],"object":[10,69],"tasks.":[12],"However,":[13],"it":[14],"remains":[15],"a":[16,45,154],"significant":[17],"challenge":[18],"to":[19,66,80,116,141],"perform":[20,68],"fine-grained":[21,53],"complex":[24,57],"multi-entity":[25,187],"scenes,":[26],"particularly":[27],"those":[28],"where":[29],"targets":[30,100],"are":[31],"not":[32],"visually":[33],"salient":[34],"and":[35,85,105,110,136,146,165,186],"require":[36],"spatial":[37,77,143],"reasoning.":[38],"To":[39,121],"this":[40,124],"end,":[41],"we":[42,126],"propose":[43,127],"InterCoG,":[44],"novel":[46],"text-vision":[47],"Interleaved":[48],"Chain-of-Grounding":[49],"reasoning":[50,71,139,147,163],"framework":[51],"for":[52,167],"image":[54],"real-world":[58],"scenes.":[59,188],"The":[60],"key":[61],"insight":[62],"of":[63,87,176],"InterCoG":[64],"is":[65],"first":[67],"position":[70],"solely":[72],"within":[73],"text":[74],"that":[75],"includes":[76],"relation":[78],"details":[79],"explicitly":[81],"deduce":[82],"the":[83,88,98,113,118,174],"location":[84],"identity":[86],"edited":[89],"target.":[90],"It":[91],"then":[92],"conducts":[93],"visual":[94],"grounding":[95,133,138],"via":[96],"highlighting":[97],"with":[101,161],"generated":[102],"bounding":[103],"boxes":[104],"masks":[106],"pixel":[108],"space,":[109],"finally":[111],"rewrites":[112],"description":[115],"specify":[117],"intended":[119],"outcomes.":[120],"further":[122],"facilitate":[123],"paradigm,":[125],"two":[128],"auxiliary":[129],"training":[130],"modules:":[131],"multimodal":[132,137],"reconstruction":[134],"supervision":[135],"alignment":[140],"enforce":[142],"localization":[144],"accuracy":[145],"interpretability,":[148],"respectively.":[149],"We":[150],"also":[151],"construct":[152],"GroundEdit-45K,":[153],"dataset":[155],"comprising":[156],"45K":[157],"grounding-oriented":[158],"samples":[160],"detailed":[162],"annotations,":[164],"GroundEdit-Bench":[166],"grounding-aware":[168],"evaluation.":[170],"Extensive":[171],"experiments":[172],"substantiate":[173],"superiority":[175],"our":[177],"approach":[178],"highly":[180],"precise":[181],"edits":[182],"under":[183],"spatially":[184],"intricate":[185]},"counts_by_year":[],"updated_date":"2026-03-04T07:09:34.246503","created_date":"2026-03-04T00:00:00"}
