{"id":"https://openalex.org/W4414360108","doi":"https://doi.org/10.24963/ijcai.2025/85","title":"MonoMixer: Marrying Convolution and Vision Transformer for Efficient Self-Supervised Monocular Depth Estimation","display_name":"MonoMixer: Marrying Convolution and Vision Transformer for Efficient Self-Supervised Monocular Depth Estimation","publication_year":2025,"publication_date":"2025-09-01","ids":{"openalex":"https://openalex.org/W4414360108","doi":"https://doi.org/10.24963/ijcai.2025/85"},"language":"en","primary_location":{"id":"doi:10.24963/ijcai.2025/85","is_oa":false,"landing_page_url":"https://doi.org/10.24963/ijcai.2025/85","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Thirty-Fourth International Joint Conference on Artificial Intelligence","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5103167168","display_name":"Zhiyong Chang","orcid":"https://orcid.org/0000-0003-2439-406X"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Zhiyong Chang","raw_affiliation_strings":["Peking University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Peking University","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100322682","display_name":"Yan Wang","orcid":"https://orcid.org/0000-0002-3984-6973"},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yan Wang","raw_affiliation_strings":["Zhejiang University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Zhejiang University","institution_ids":["https://openalex.org/I76130692"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5103167168"],"corresponding_institution_ids":["https://openalex.org/I20231570"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.34436611,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"756","last_page":"764"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T13114","display_name":"Image Processing Techniques and Applications","score":0.9896000027656555,"subfield":{"id":"https://openalex.org/subfields/2214","display_name":"Media Technology"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T13114","display_name":"Image Processing Techniques and Applications","score":0.9896000027656555,"subfield":{"id":"https://openalex.org/subfields/2214","display_name":"Media Technology"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12111","display_name":"Industrial Vision Systems and Defect Detection","score":0.9868999719619751,"subfield":{"id":"https://openalex.org/subfields/2209","display_name":"Industrial and Manufacturing Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10638","display_name":"Optical measurement and interference techniques","score":0.9837999939918518,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/monocular","display_name":"Monocular","score":0.7002999782562256},{"id":"https://openalex.org/keywords/convolutional-neural-network","display_name":"Convolutional neural network","score":0.5501000285148621},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.4812999963760376},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.47380000352859497},{"id":"https://openalex.org/keywords/margin","display_name":"Margin (machine learning)","score":0.43700000643730164},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.41850000619888306},{"id":"https://openalex.org/keywords/graph","display_name":"Graph","score":0.39590001106262207},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.39469999074935913}],"concepts":[{"id":"https://openalex.org/C65909025","wikidata":"https://www.wikidata.org/wiki/Q1945033","display_name":"Monocular","level":2,"score":0.7002999782562256},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6743000149726868},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6449000239372253},{"id":"https://openalex.org/C81363708","wikidata":"https://www.wikidata.org/wiki/Q17084460","display_name":"Convolutional neural network","level":2,"score":0.5501000285148621},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.4812999963760376},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.47380000352859497},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.44200000166893005},{"id":"https://openalex.org/C774472","wikidata":"https://www.wikidata.org/wiki/Q6760393","display_name":"Margin (machine learning)","level":2,"score":0.43700000643730164},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.41850000619888306},{"id":"https://openalex.org/C132525143","wikidata":"https://www.wikidata.org/wiki/Q141488","display_name":"Graph","level":2,"score":0.39590001106262207},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.39469999074935913},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.37400001287460327},{"id":"https://openalex.org/C66746571","wikidata":"https://www.wikidata.org/wiki/Q1134833","display_name":"ENCODE","level":3,"score":0.34470000863075256},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.32749998569488525},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.32199999690055847},{"id":"https://openalex.org/C2777210771","wikidata":"https://www.wikidata.org/wiki/Q4927124","display_name":"Block (permutation group theory)","level":2,"score":0.3140000104904175},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.30469998717308044},{"id":"https://openalex.org/C158829959","wikidata":"https://www.wikidata.org/wiki/Q1640606","display_name":"Monocular vision","level":2,"score":0.28209999203681946},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.27649998664855957},{"id":"https://openalex.org/C5339829","wikidata":"https://www.wikidata.org/wiki/Q1425977","display_name":"Machine vision","level":2,"score":0.27570000290870667},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.25290000438690186}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.24963/ijcai.2025/85","is_oa":false,"landing_page_url":"https://doi.org/10.24963/ijcai.2025/85","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Thirty-Fourth International Joint Conference on Artificial Intelligence","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Self-supervised":[0],"monocular":[1,48,81],"depth":[2,9,49,78,125],"estimation":[3],"that":[4],"does":[5],"not":[6],"require":[7],"hard-to-source":[8],"labels":[10],"for":[11,35],"training":[12],"has":[13,72],"been":[14,33],"widely":[15],"studied":[16],"in":[17,47,76,86,222],"recent":[18],"years.":[19],"Due":[20],"to":[21,59,92,119,149,179],"its":[22,44],"significant":[23],"and":[24,154,176,198,201,210],"growing":[25],"needs,":[26],"many":[27],"lightweight":[28,103],"but":[29],"effective":[30],"architectures":[31],"have":[32,42],"designed":[34],"edge":[36,93],"devices.":[37,94],"Convolutional":[38],"Neural":[39],"Networks":[40],"(CNNs)":[41],"shown":[43],"extraordinary":[45],"ability":[46],"estimation.":[50],"However,":[51],"their":[52],"limited":[53],"receptive":[54],"field":[55],"stints":[56],"existing":[57],"methods":[58],"reason":[60],"only":[61],"locally,":[62],"inhibiting":[63],"the":[64,67,87,90,142,151,188],"effectiveness":[65],"of":[66,131,145,159,190],"self-supervised":[68],"paradigm.":[69],"Recently,":[70],"Transformers":[71,88],"achieved":[73],"great":[74],"success":[75],"estimating":[77],"maps":[79,158],"from":[80],"images.":[82],"Nevertheless,":[83],"massive":[84],"parameters":[85],"hinder":[89],"deployment":[91],"In":[95],"this":[96],"paper,":[97],"we":[98],"propose":[99],"MonoMixer,":[100],"a":[101,128,219],"brand-new":[102],"CNN-Transformer":[104],"architecture":[105],"with":[106,224],"three":[107,206],"main":[108],"contributions:":[109],"1)":[110],"The":[111,134,163],"details-augmented":[112],"(DA)":[113],"block":[114,139,167],"employs":[115],"graph":[116],"reasoning":[117],"unit":[118],"capture":[120],"abundant":[121],"local":[122,174],"details,":[123],"resulting":[124],"prediction":[126],"at":[127,195],"higher":[129],"level":[130],"precision.":[132],"2)":[133],"self-modulate":[135],"channel":[136,143],"attention":[137],"(SMCA)":[138],"adaptively":[140],"adjust":[141],"weights":[144],"feature":[146,157],"maps,":[147],"aiming":[148],"emphasize":[150],"crucial":[152],"features":[153,175],"aggregate":[155],"channel-wise":[156],"different":[160],"patterns.":[161],"3)":[162],"global-guided":[164],"Transformer":[165],"(G2T)":[166],"integrates":[168],"global":[169],"semantic":[170],"token":[171],"into":[172],"multi-scale":[173],"exploit":[177],"cross-attention":[178],"encode":[180],"long":[181],"range":[182],"dependencies.":[183],"Furthermore,":[184],"experimental":[185],"results":[186],"demonstrate":[187],"superiority":[189],"our":[191,213],"proposed":[192,214],"MonoMixer":[193,215],"both":[194],"model":[196],"size":[197],"inference":[199],"speed,":[200],"achieve":[202],"state-of-the-art":[203],"performance":[204],"on":[205],"datasets:":[207],"KITTI,":[208],"Make3D":[209],"Cityscapes.":[211],"Specifically,":[212],"outperforms":[216],"MonoFormer":[217],"by":[218],"large":[220],"margin":[221],"accuracy,":[223],"about":[225],"95":[226],"%":[227],"fewer":[228],"parameters.":[229]},"counts_by_year":[],"updated_date":"2026-05-21T06:26:12.895304","created_date":"2025-10-10T00:00:00"}
