{"id":"https://openalex.org/W4415538348","doi":"https://doi.org/10.1145/3746027.3755409","title":"OGDepth: Leveraging Object Guidance in Diffusion Models for Enhanced Monocular Depth Estimation","display_name":"OGDepth: Leveraging Object Guidance in Diffusion Models for Enhanced Monocular Depth Estimation","publication_year":2025,"publication_date":"2025-10-25","ids":{"openalex":"https://openalex.org/W4415538348","doi":"https://doi.org/10.1145/3746027.3755409"},"language":null,"primary_location":{"id":"doi:10.1145/3746027.3755409","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3755409","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5111821444","display_name":"Wenzheng Yang","orcid":null},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Wenzheng Yang","raw_affiliation_strings":["School of Computer Science, Beijing University of Posts and Telecommunications, Beijing, China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science, Beijing University of Posts and Telecommunications, Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5028111174","display_name":"Songwei Pei","orcid":"https://orcid.org/0000-0002-7926-5727"},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Songwei Pei","raw_affiliation_strings":["School of Computer Science, Beijing University of Posts and Telecommunications, Beijing, China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science, Beijing University of Posts and Telecommunications, Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5040065558","display_name":"Bingfeng Liu","orcid":null},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Bingfeng Liu","raw_affiliation_strings":["School of Computer Science, Beijing University of Posts and Telecommunications, Beijing, China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science, Beijing University of Posts and Telecommunications, Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100340569","display_name":"Qian Li","orcid":"https://orcid.org/0000-0002-1612-4644"},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Qian Li","raw_affiliation_strings":["School of Computer Science, Beijing University of Posts and Telecommunications, Beijing, China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science, Beijing University of Posts and Telecommunications, Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5054814598","display_name":"Shangguang Wang","orcid":"https://orcid.org/0000-0001-7245-1298"},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shangguang Wang","raw_affiliation_strings":["School of Computer Science, Beijing University of Posts and Telecommunications, Beijing, China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science, Beijing University of Posts and Telecommunications, Beijing, China","institution_ids":["https://openalex.org/I139759216"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5111821444"],"corresponding_institution_ids":["https://openalex.org/I139759216"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.32414221,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"4378","last_page":"4387"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10531","display_name":"Advanced Vision and Imaging","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10531","display_name":"Advanced Vision and Imaging","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11105","display_name":"Advanced Image Processing Techniques","score":0.9977999925613403,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10638","display_name":"Optical measurement and interference techniques","score":0.9976000189781189,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/monocular","display_name":"Monocular","score":0.6503999829292297},{"id":"https://openalex.org/keywords/grasp","display_name":"GRASP","score":0.5824999809265137},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.5338000059127808},{"id":"https://openalex.org/keywords/redundancy","display_name":"Redundancy (engineering)","score":0.48159998655319214},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.4668000042438507},{"id":"https://openalex.org/keywords/minimum-bounding-box","display_name":"Minimum bounding box","score":0.45190000534057617},{"id":"https://openalex.org/keywords/encode","display_name":"ENCODE","score":0.4490000009536743},{"id":"https://openalex.org/keywords/mutual-information","display_name":"Mutual information","score":0.4226999878883362},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3873000144958496}],"concepts":[{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6945000290870667},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6902999877929688},{"id":"https://openalex.org/C65909025","wikidata":"https://www.wikidata.org/wiki/Q1945033","display_name":"Monocular","level":2,"score":0.6503999829292297},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.5942999720573425},{"id":"https://openalex.org/C171268870","wikidata":"https://www.wikidata.org/wiki/Q1486676","display_name":"GRASP","level":2,"score":0.5824999809265137},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.5338000059127808},{"id":"https://openalex.org/C152124472","wikidata":"https://www.wikidata.org/wiki/Q1204361","display_name":"Redundancy (engineering)","level":2,"score":0.48159998655319214},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.4668000042438507},{"id":"https://openalex.org/C147037132","wikidata":"https://www.wikidata.org/wiki/Q6865426","display_name":"Minimum bounding box","level":3,"score":0.45190000534057617},{"id":"https://openalex.org/C66746571","wikidata":"https://www.wikidata.org/wiki/Q1134833","display_name":"ENCODE","level":3,"score":0.4490000009536743},{"id":"https://openalex.org/C152139883","wikidata":"https://www.wikidata.org/wiki/Q252973","display_name":"Mutual information","level":2,"score":0.4226999878883362},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3873000144958496},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.38190001249313354},{"id":"https://openalex.org/C63584917","wikidata":"https://www.wikidata.org/wiki/Q333286","display_name":"Bounding overwatch","level":2,"score":0.374099999666214},{"id":"https://openalex.org/C106131492","wikidata":"https://www.wikidata.org/wiki/Q3072260","display_name":"Filter (signal processing)","level":2,"score":0.36480000615119934},{"id":"https://openalex.org/C139945424","wikidata":"https://www.wikidata.org/wiki/Q1940696","display_name":"Mean squared error","level":2,"score":0.3555999994277954},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.35269999504089355},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.33500000834465027},{"id":"https://openalex.org/C163294075","wikidata":"https://www.wikidata.org/wiki/Q581861","display_name":"Noise reduction","level":2,"score":0.3215999901294708},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.31380000710487366},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.3109999895095825},{"id":"https://openalex.org/C2780598303","wikidata":"https://www.wikidata.org/wiki/Q65921492","display_name":"Flexibility (engineering)","level":2,"score":0.3018999993801117},{"id":"https://openalex.org/C64876066","wikidata":"https://www.wikidata.org/wiki/Q5141226","display_name":"Cognitive neuroscience of visual object recognition","level":3,"score":0.2734000086784363},{"id":"https://openalex.org/C2780186347","wikidata":"https://www.wikidata.org/wiki/Q11414","display_name":"Subnetwork","level":2,"score":0.25769999623298645}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3746027.3755409","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3755409","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":24,"referenced_works":["https://openalex.org/W2110917409","https://openalex.org/W2115579991","https://openalex.org/W2132947399","https://openalex.org/W2948647700","https://openalex.org/W2963488291","https://openalex.org/W3173727695","https://openalex.org/W4214520160","https://openalex.org/W4302275500","https://openalex.org/W4308236200","https://openalex.org/W4312819733","https://openalex.org/W4312933868","https://openalex.org/W4319300976","https://openalex.org/W4386083069","https://openalex.org/W4386319016","https://openalex.org/W4390871795","https://openalex.org/W4390872059","https://openalex.org/W4390872147","https://openalex.org/W4390873429","https://openalex.org/W4390873622","https://openalex.org/W4390874575","https://openalex.org/W4399938885","https://openalex.org/W4402727668","https://openalex.org/W4402727858","https://openalex.org/W4402753888"],"related_works":[],"abstract_inverted_index":{"Monocular":[0],"depth":[1,27,65,155],"estimation":[2,28,66],"stands":[3],"as":[4,40],"a":[5,62,152,160,206,235],"fundamental":[6],"pursuit":[7],"in":[8,51,119],"computer":[9],"vision.":[10],"Recently,":[11],"some":[12],"methods":[13,35],"have":[14],"attempted":[15],"to":[16,43,91,131,139,165,240],"introduce":[17],"the":[18,23,45,76,81,93,105,114,137,145,149,167,195,214,221,241],"text-to-image":[19],"diffusion":[20],"model":[21,138,199],"into":[22,97],"domain":[24],"of":[25,47,75,151,170,203,208,232],"monocular":[26,64],"and":[29,54,116,133,205,210],"achieved":[30],"impressive":[31],"results.":[32],"However,":[33],"these":[34],"typically":[36],"employ":[37,124],"pre-defined":[38],"templates":[39],"text":[41],"prompts":[42,70,98],"guide":[44],"learning":[46],"denoising":[48],"networks,":[49],"resulting":[50],"limited":[52],"flexibility":[53],"scalability.":[55],"In":[56],"this":[57],"paper,":[58],"we":[59,84,123,158],"propose":[60],"OGDepth,":[61],"diffusion-based":[63],"network":[67],"with":[68,104,187],"object":[69,77,94,130],"generated":[71],"by":[72],"taking":[73],"advantage":[74],"detection":[78,95],"information":[79,96,111,127,143],"from":[80],"scene.":[82,146],"Specifically,":[83],"design":[85,159],"an":[86,201,229],"Object":[87],"Prompt":[88],"Module":[89],"(OPM)":[90],"encode":[92],"that":[99],"are":[100,213],"more":[101,153],"closely":[102],"aligned":[103],"image":[106],"content,":[107],"offering":[108],"richer":[109],"contextual":[110],"while":[112],"circumventing":[113],"monotony":[115],"redundancy":[117],"inherent":[118],"template-generated":[120],"prompts.":[121],"Moreover,":[122],"bounding":[125],"box":[126],"for":[128],"each":[129],"filter":[132],"localize":[134],"objects,":[135],"enabling":[136,175],"grasp":[140],"relative":[141],"positional":[142],"within":[144],"This":[147],"facilitates":[148],"creation":[150],"precise":[154],"map.":[156],"Additionally,":[157],"Global-Local":[161],"Interaction":[162],"Decoder":[163],"(GLID)":[164],"facilitate":[166],"mutual":[168],"exchange":[169],"features":[171],"at":[172],"different":[173],"scales,":[174],"efficient":[176],"feature":[177],"fusion.":[178],"Our":[179],"approach":[180],"underwent":[181],"rigorous":[182],"experiments":[183],"across":[184],"multiple":[185],"datasets,":[186],"results":[188],"showcasing":[189],"its":[190],"state-of-the-art":[191],"performance.":[192],"Notably,":[193],"on":[194],"KITTI":[196],"dataset,":[197,225],"our":[198,226],"achieves":[200,228],"RMSE":[202,230],"1.967":[204],"REL":[207],"0.047,":[209],"both":[211],"metrics":[212],"best":[215],"among":[216],"all":[217],"compared":[218,239],"methods.":[219],"On":[220],"NYU":[222],"Depth":[223],"V2":[224],"method":[227,243],"score":[231],"0.221,":[233],"representing":[234],"notable":[236],"12.9%":[237],"enhancement":[238],"baseline":[242],"(VPD).":[244]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-10-25T00:00:00"}
