{"id":"https://openalex.org/W7137874744","doi":"https://doi.org/10.48550/arxiv.2603.13910","title":"Scene Generation at Absolute Scale: Utilizing Semantic and Geometric Guidance From Text for Accurate and Interpretable 3D Indoor Scene Generation","display_name":"Scene Generation at Absolute Scale: Utilizing Semantic and Geometric Guidance From Text for Accurate and Interpretable 3D Indoor Scene Generation","publication_year":2026,"publication_date":"2026-03-14","ids":{"openalex":"https://openalex.org/W7137874744","doi":"https://doi.org/10.48550/arxiv.2603.13910"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.13910","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.13910","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.13910","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5005751157","display_name":"Stefan Ainetter","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Ainetter, Stefan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5119788737","display_name":"Thomas Deixelberger","orcid":"https://orcid.org/0009-0006-2509-5191"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Deixelberger, Thomas","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5077936456","display_name":"Edoardo Abi Haila Dominici","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dominici, Edoardo A.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129686304","display_name":"Philipp Drescher","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Drescher, Philipp","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5076845118","display_name":"Konstantinos Vardis","orcid":"https://orcid.org/0000-0003-2282-4644"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Vardis, Konstantinos","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5014594342","display_name":"Markus Steinberger","orcid":"https://orcid.org/0000-0001-5977-8536"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Steinberger, Markus","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5005751157"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10191","display_name":"Robotics and Sensor-Based Localization","score":0.2768999934196472,"subfield":{"id":"https://openalex.org/subfields/2202","display_name":"Aerospace Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10191","display_name":"Robotics and Sensor-Based Localization","score":0.2768999934196472,"subfield":{"id":"https://openalex.org/subfields/2202","display_name":"Aerospace Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10531","display_name":"Advanced Vision and Imaging","score":0.2345000058412552,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10719","display_name":"3D Shape Modeling and Analysis","score":0.15870000422000885,"subfield":{"id":"https://openalex.org/subfields/2206","display_name":"Computational Mechanics"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/gaussian","display_name":"Gaussian","score":0.4880000054836273},{"id":"https://openalex.org/keywords/frame","display_name":"Frame (networking)","score":0.48660001158714294},{"id":"https://openalex.org/keywords/sampling","display_name":"Sampling (signal processing)","score":0.45179998874664307},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.4207000136375427},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.4172999858856201},{"id":"https://openalex.org/keywords/absolute-scale","display_name":"Absolute scale","score":0.3621000051498413},{"id":"https://openalex.org/keywords/3d-city-models","display_name":"3D city models","score":0.35269999504089355},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.35019999742507935}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7487000226974487},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6578999757766724},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.6570000052452087},{"id":"https://openalex.org/C163716315","wikidata":"https://www.wikidata.org/wiki/Q901177","display_name":"Gaussian","level":2,"score":0.4880000054836273},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.48660001158714294},{"id":"https://openalex.org/C140779682","wikidata":"https://www.wikidata.org/wiki/Q210868","display_name":"Sampling (signal processing)","level":3,"score":0.45179998874664307},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.4207000136375427},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.4172999858856201},{"id":"https://openalex.org/C2779265950","wikidata":"https://www.wikidata.org/wiki/Q22906137","display_name":"Absolute scale","level":2,"score":0.3621000051498413},{"id":"https://openalex.org/C2778597888","wikidata":"https://www.wikidata.org/wiki/Q172169","display_name":"3D city models","level":3,"score":0.35269999504089355},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.35019999742507935},{"id":"https://openalex.org/C61224824","wikidata":"https://www.wikidata.org/wiki/Q2260434","display_name":"Mixture model","level":2,"score":0.33570000529289246},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.3310000002384186},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.2971000075340271},{"id":"https://openalex.org/C159620131","wikidata":"https://www.wikidata.org/wiki/Q1938983","display_name":"Spatial analysis","level":2,"score":0.29679998755455017},{"id":"https://openalex.org/C2987819851","wikidata":"https://www.wikidata.org/wiki/Q191839","display_name":"Aerial imagery","level":2,"score":0.2921999990940094},{"id":"https://openalex.org/C108882727","wikidata":"https://www.wikidata.org/wiki/Q2991685","display_name":"Solid modeling","level":2,"score":0.2906000018119812},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.2856000065803528},{"id":"https://openalex.org/C117455697","wikidata":"https://www.wikidata.org/wiki/Q190149","display_name":"Photogrammetry","level":2,"score":0.28060001134872437},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.27950000762939453},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.273499995470047},{"id":"https://openalex.org/C109950114","wikidata":"https://www.wikidata.org/wiki/Q4464732","display_name":"3D reconstruction","level":2,"score":0.26840001344680786},{"id":"https://openalex.org/C2776449333","wikidata":"https://www.wikidata.org/wiki/Q7928781","display_name":"View synthesis","level":3,"score":0.25119999051094055}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.13910","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.13910","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.13910","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.13910","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Sustainable cities and communities","id":"https://metadata.un.org/sdg/11","score":0.462261825799942}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"We":[0],"present":[1],"GuidedSceneGen,":[2],"a":[3,46,52,65,97,133,167],"text-to-3D":[4,181],"generation":[5,42],"framework":[6],"that":[7,22,106],"produces":[8],"metrically":[9],"accurate,":[10],"globally":[11],"consistent,":[12],"and":[13,59,73,109,135,150,157,166,174],"semantically":[14],"interpretable":[15],"indoor":[16],"scenes.":[17],"Unlike":[18],"prior":[19],"text-driven":[20],"methods":[21],"often":[23],"suffer":[24],"from":[25,45,153],"geometric":[26,60],"drift":[27],"or":[28],"scale":[29],"ambiguity,":[30],"our":[31],"approach":[32],"maintains":[33],"an":[34],"absolute":[35,141],"world":[36],"coordinate":[37],"frame":[38],"throughout":[39],"the":[40,84],"entire":[41],"process.":[43],"Starting":[44],"textual":[47],"scene":[48,139,160],"description,":[49],"we":[50,95],"predict":[51],"global":[53,85],"3D":[54,129,138,172],"layout":[55,154,175],"encoding":[56],"both":[57],"semantic":[58,151],"structure,":[61],"which":[62],"serves":[63],"as":[64],"guiding":[66],"proxy":[67],"for":[68],"downstream":[69],"stages.":[70],"A":[71],"semantics-":[72],"depth-conditioned":[74],"panoramic":[75,180],"diffusion":[76,99],"model":[77,100],"then":[78],"synthesizes":[79],"360\u00b0":[80],"imagery":[81],"aligned":[82],"with":[83],"layout,":[86],"substantially":[87],"improving":[88],"spatial":[89],"coherence.":[90],"To":[91],"explore":[92],"unobserved":[93],"regions,":[94],"employ":[96],"video":[98],"guided":[101],"by":[102],"optimized":[103],"camera":[104],"trajectories":[105],"balances":[107],"coverage":[108],"collision":[110],"avoidance,":[111],"achieving":[112],"up":[113],"to":[114,119,155,178],"10x":[115],"faster":[116],"sampling":[117],"compared":[118,177],"exhaustive":[120],"path":[121],"exploration.":[122],"The":[123],"generated":[124],"views":[125],"are":[126],"fused":[127],"using":[128],"Gaussian":[130],"Splatting,":[131],"yielding":[132],"consistent":[134],"fully":[136],"navigable":[137],"in":[140],"scale.":[142],"GuidedSceneGen":[143],"enables":[144],"accurate":[145],"transfer":[146],"of":[147],"object":[148],"poses":[149],"labels":[152],"reconstruction,":[156],"supports":[158],"progressive":[159],"expansion":[161],"without":[162],"re-alignment.":[163],"Quantitative":[164],"results":[165],"user":[168],"study":[169],"demonstrate":[170],"greater":[171],"consistency":[173],"plausibility":[176],"recent":[179],"baselines.":[182]},"counts_by_year":[],"updated_date":"2026-03-18T06:31:55.123368","created_date":"2026-03-18T00:00:00"}
