{"id":"https://openalex.org/W4401307285","doi":"https://doi.org/10.48550/arxiv.2408.00438","title":"MonoMM: A Multi-scale Mamba-Enhanced Network for Real-time Monocular 3D Object Detection","display_name":"MonoMM: A Multi-scale Mamba-Enhanced Network for Real-time Monocular 3D Object Detection","publication_year":2024,"publication_date":"2024-08-01","ids":{"openalex":"https://openalex.org/W4401307285","doi":"https://doi.org/10.48550/arxiv.2408.00438"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2408.00438","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2408.00438","pdf_url":"https://arxiv.org/pdf/2408.00438","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2408.00438","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5074122651","display_name":"Youjia Fu","orcid":"https://orcid.org/0000-0003-4345-0370"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Fu, Youjia","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5090810000","display_name":"Zihao Xu","orcid":"https://orcid.org/0000-0003-0805-8533"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Zihao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5011916209","display_name":"Junsong Fu","orcid":"https://orcid.org/0000-0002-2445-1987"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fu, Junsong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5006395921","display_name":"Huixia Xue","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xue, Huixia","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5022408219","display_name":"Shuqiu Tan","orcid":"https://orcid.org/0000-0001-5336-7453"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tan, Shuqiu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5100440266","display_name":"Lei Li","orcid":"https://orcid.org/0000-0001-9703-2102"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Lei","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5074122651"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":1,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9976999759674072,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9976999759674072,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12111","display_name":"Industrial Vision Systems and Defect Detection","score":0.992900013923645,"subfield":{"id":"https://openalex.org/subfields/2209","display_name":"Industrial and Manufacturing Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.983299970626831,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.6166768670082092},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5990777015686035},{"id":"https://openalex.org/keywords/monocular","display_name":"Monocular","score":0.5503055453300476},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.48642706871032715},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.34405767917633057},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.32592710852622986},{"id":"https://openalex.org/keywords/geography","display_name":"Geography","score":0.20354855060577393},{"id":"https://openalex.org/keywords/cartography","display_name":"Cartography","score":0.10267698764801025}],"concepts":[{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.6166768670082092},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5990777015686035},{"id":"https://openalex.org/C65909025","wikidata":"https://www.wikidata.org/wiki/Q1945033","display_name":"Monocular","level":2,"score":0.5503055453300476},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.48642706871032715},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.34405767917633057},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.32592710852622986},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.20354855060577393},{"id":"https://openalex.org/C58640448","wikidata":"https://www.wikidata.org/wiki/Q42515","display_name":"Cartography","level":1,"score":0.10267698764801025}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2408.00438","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2408.00438","pdf_url":"https://arxiv.org/pdf/2408.00438","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2408.00438","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2408.00438","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2408.00438","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2408.00438","pdf_url":"https://arxiv.org/pdf/2408.00438","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320327639","display_name":"Centre Scientifique et Technique du B\u00e2timent","ror":"https://ror.org/02fsd1928"}],"has_content":{"pdf":true,"grobid_xml":false},"content_urls":{"pdf":"https://content.openalex.org/works/W4401307285.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2058170566","https://openalex.org/W2755342338","https://openalex.org/W2772917594","https://openalex.org/W2775347418","https://openalex.org/W2166024367","https://openalex.org/W3116076068","https://openalex.org/W2229312674","https://openalex.org/W2951359407","https://openalex.org/W2079911747","https://openalex.org/W1969923398"],"abstract_inverted_index":{"Recent":[0],"advancements":[1],"in":[2,13,36],"transformer-based":[3],"monocular":[4,52,216],"3D":[5,15,53,71],"object":[6,54,72],"detection":[7,55],"techniques":[8],"have":[9],"exhibited":[10],"exceptional":[11],"performance":[12,40,175],"inferring":[14],"attributes":[16],"from":[17,98,139],"single":[18],"2D":[19],"images.":[20],"However,":[21],"most":[22],"existing":[23],"methods":[24,217],"rely":[25],"on":[26,91,206],"resource-intensive":[27],"transformer":[28],"architectures,":[29],"which":[30,89],"often":[31],"lead":[32],"to":[33,121,150,199],"significant":[34],"drops":[35],"computational":[37,103],"efficiency":[38],"and":[39,50,94,119,144,155,180,192,196,218],"when":[41],"handling":[42],"long":[43],"sequence":[44],"data.":[45],"To":[46],"address":[47],"these":[48],"challenges":[49],"advance":[51],"technology,":[56],"we":[57],"propose":[58],"an":[59],"innovative":[60],"network":[61,67],"architecture,":[62],"MonoMM,":[63],"a":[64,146],"Multi-scale":[65],"\\textbf{M}amba-Enhanced":[66],"for":[68],"real-time":[69,220],"Monocular":[70],"detection.":[73,221],"This":[74,158],"well-designed":[75],"architecture":[76],"primarily":[77],"includes":[78],"the":[79,109,112,116,136,165,173,184,207],"following":[80],"two":[81],"core":[82],"modules:":[83],"Focused":[84],"Multi-Scale":[85],"Fusion":[86],"(FMF)":[87],"Module,":[88],"focuses":[90],"effectively":[92],"preserving":[93],"fusing":[95],"image":[96,126,140],"information":[97,110,154],"different":[99,177],"scales":[100],"with":[101],"lower":[102],"resource":[104],"consumption.":[105],"By":[106],"precisely":[107],"regulating":[108],"flow,":[111],"FMF":[113],"module":[114],"enhances":[115,172],"model":[117,174],"adaptability":[118],"robustness":[120],"scale":[122],"variations":[123],"while":[124],"maintaining":[125],"details.":[127],"Depth-Aware":[128],"Feature":[129],"Enhancement":[130],"Mamba":[131],"(DMB)":[132],"Module:":[133],"It":[134],"utilizes":[135],"fused":[137],"features":[138],"characteristics":[141],"as":[142],"input":[143],"employs":[145],"novel":[147],"adaptive":[148],"strategy":[149,161],"globally":[151],"integrate":[152],"depth":[153,159,168],"visual":[156],"information.":[157],"fusion":[160],"not":[162],"only":[163],"improves":[164],"accuracy":[166],"of":[167,187],"estimation":[169],"but":[170],"also":[171],"under":[176],"viewing":[178],"angles":[179],"environmental":[181],"conditions.":[182],"Moreover,":[183],"modular":[185],"design":[186],"MonoMM":[188],"provides":[189],"high":[190],"flexibility":[191],"scalability,":[193],"facilitating":[194],"adjustments":[195],"optimizations":[197],"according":[198],"specific":[200],"application":[201],"needs.":[202],"Extensive":[203],"experiments":[204],"conducted":[205],"KITTI":[208],"dataset":[209],"show":[210],"that":[211],"our":[212],"method":[213],"outperforms":[214],"previous":[215],"achieves":[219]},"counts_by_year":[{"year":2024,"cited_by_count":1}],"updated_date":"2026-03-11T14:59:36.786465","created_date":"2024-08-04T00:00:00"}
