{"id":"https://openalex.org/W7083840163","doi":"https://doi.org/10.48550/arxiv.2509.24652","title":"Learning Object-Centric Representations Based on Slots in Real World Scenarios","display_name":"Learning Object-Centric Representations Based on Slots in Real World Scenarios","publication_year":2025,"publication_date":"2025-09-29","ids":{"openalex":"https://openalex.org/W7083840163","doi":"https://doi.org/10.48550/arxiv.2509.24652"},"language":"en","primary_location":{"id":"doi:10.48550/arxiv.2509.24652","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2509.24652","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2509.24652","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Akan, Adil Kaan","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Akan, Adil Kaan","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":1,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":true,"primary_topic":{"id":"https://openalex.org/T11274","display_name":"Botulinum Toxin and Related Neurological Disorders","score":0.10080000013113022,"subfield":{"id":"https://openalex.org/subfields/2728","display_name":"Neurology"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},"topics":[{"id":"https://openalex.org/T11274","display_name":"Botulinum Toxin and Related Neurological Disorders","score":0.10080000013113022,"subfield":{"id":"https://openalex.org/subfields/2728","display_name":"Neurology"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},{"id":"https://openalex.org/T10497","display_name":"Fungal and yeast genetics research","score":0.09960000216960907,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T10120","display_name":"Bacterial Genetics and Biotechnology","score":0.0674000009894371,"subfield":{"id":"https://openalex.org/subfields/1311","display_name":"Genetics"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/generative-model","display_name":"Generative model","score":0.6093000173568726},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.5327000021934509},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.4837999939918518},{"id":"https://openalex.org/keywords/bridging","display_name":"Bridging (networking)","score":0.4659000039100647},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.43479999899864197},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.42820000648498535},{"id":"https://openalex.org/keywords/cognitive-neuroscience-of-visual-object-recognition","display_name":"Cognitive neuroscience of visual object recognition","score":0.3686999976634979},{"id":"https://openalex.org/keywords/prior-probability","display_name":"Prior probability","score":0.35440000891685486}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7817000150680542},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6306999921798706},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.6093000173568726},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.5327000021934509},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.49000000953674316},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.4837999939918518},{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.4659000039100647},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.43479999899864197},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.42820000648498535},{"id":"https://openalex.org/C64876066","wikidata":"https://www.wikidata.org/wiki/Q5141226","display_name":"Cognitive neuroscience of visual object recognition","level":3,"score":0.3686999976634979},{"id":"https://openalex.org/C177769412","wikidata":"https://www.wikidata.org/wiki/Q278090","display_name":"Prior probability","level":3,"score":0.35440000891685486},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.3409999907016754},{"id":"https://openalex.org/C190470478","wikidata":"https://www.wikidata.org/wiki/Q2370229","display_name":"Invariant (physics)","level":2,"score":0.32839998602867126},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.3244999945163727},{"id":"https://openalex.org/C184408114","wikidata":"https://www.wikidata.org/wiki/Q1502022","display_name":"Generative Design","level":3,"score":0.31189998984336853},{"id":"https://openalex.org/C2781181686","wikidata":"https://www.wikidata.org/wiki/Q4226068","display_name":"Coherence (philosophical gambling strategy)","level":2,"score":0.2921999990940094},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.289000004529953},{"id":"https://openalex.org/C124504099","wikidata":"https://www.wikidata.org/wiki/Q56933","display_name":"Image segmentation","level":3,"score":0.2703999876976013},{"id":"https://openalex.org/C2780103172","wikidata":"https://www.wikidata.org/wiki/Q1309721","display_name":"Visual Objects","level":3,"score":0.2685999870300293},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.2678000032901764},{"id":"https://openalex.org/C20894473","wikidata":"https://www.wikidata.org/wiki/Q1116105","display_name":"Object model","level":3,"score":0.26499998569488525},{"id":"https://openalex.org/C2778355321","wikidata":"https://www.wikidata.org/wiki/Q17079427","display_name":"Identity (music)","level":2,"score":0.2621999979019165},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.2612000107765198}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2509.24652","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2509.24652","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2509.24652","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2509.24652","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"A":[0],"central":[1],"goal":[2],"in":[3,112,159,218],"AI":[4],"is":[5],"to":[6,127,134,190],"represent":[7],"scenes":[8],"as":[9,172],"compositions":[10],"of":[11],"discrete":[12],"objects,":[13,104],"enabling":[14],"fine-grained,":[15],"controllable":[16,119],"image":[17,120],"and":[18,28,100,108,118,140,151,164,166,176,187,196,203,214,221],"video":[19,161],"generation.":[20,121],"Yet":[21],"leading":[22],"diffusion":[23,48,92],"models":[24,49,93],"treat":[25],"images":[26,195],"holistically":[27],"rely":[29],"on":[30],"text":[31],"conditioning,":[32],"creating":[33],"a":[34,42,60,95,141,185],"mismatch":[35],"for":[36,50,98,103,194,211],"object-level":[37],"editing.":[38],"This":[39,155],"thesis":[40],"introduces":[41],"framework":[43,126],"that":[44],"adapts":[45],"powerful":[46],"pretrained":[47,78],"object-centric":[51,191],"synthesis":[52],"while":[53,84],"retaining":[54],"their":[55,81],"generative":[56,192,216],"capacity.":[57],"We":[58,122],"identify":[59],"core":[61],"challenge:":[62],"balancing":[63],"global":[64],"scene":[65],"coherence":[66],"with":[67,94],"disentangled":[68],"object":[69,113,136,149,162,173],"control.":[70],"Our":[71],"method":[72],"integrates":[73],"lightweight,":[74],"slot-based":[75],"conditioning":[76],"into":[77],"models,":[79],"preserving":[80],"visual":[82],"priors":[83],"providing":[85],"object-specific":[86],"manipulation.":[87],"For":[88],"images,":[89],"SlotAdapt":[90],"augments":[91],"register":[96],"token":[97],"background/style":[99],"slot-conditioned":[101],"modules":[102],"reducing":[105],"text-conditioning":[106],"bias":[107],"achieving":[109],"state-of-the-art":[110],"results":[111],"discovery,":[114],"segmentation,":[115],"compositional":[116],"editing,":[117],"further":[123],"extend":[124],"the":[125,208],"video.":[128],"Using":[129],"Invariant":[130],"Slot":[131],"Attention":[132],"(ISA)":[133],"separate":[135],"identity":[137],"from":[138],"pose":[139],"Transformer-based":[142],"temporal":[143],"aggregator,":[144],"our":[145],"approach":[146,189],"maintains":[147],"consistent":[148],"representations":[150],"dynamics":[152],"across":[153],"frames.":[154],"yields":[156],"new":[157],"benchmarks":[158],"unsupervised":[160],"segmentation":[163],"reconstruction,":[165],"supports":[167],"advanced":[168],"editing":[169],"tasks":[170],"such":[171],"removal,":[174],"replacement,":[175],"insertion":[177],"without":[178],"explicit":[179],"supervision.":[180],"Overall,":[181],"this":[182],"work":[183],"establishes":[184],"general":[186],"scalable":[188],"modeling":[193],"videos.":[197],"By":[198],"bridging":[199],"human":[200],"object-based":[201],"perception":[202],"machine":[204],"learning,":[205],"it":[206],"expands":[207],"design":[209],"space":[210],"interactive,":[212],"structured,":[213],"user-driven":[215],"tools":[217],"creative,":[219],"scientific,":[220],"practical":[222],"domains.":[223]},"counts_by_year":[],"updated_date":"2025-11-06T06:51:31.235846","created_date":"2025-10-10T00:00:00"}
