{"id":"https://openalex.org/W4416214895","doi":"https://doi.org/10.1109/tmm.2025.3632649","title":"Cas-OVD: Cascaded Open-Vocabulary Detection of Small Objects Using Multi-Refined Region Proposal Network in Autonomous Driving","display_name":"Cas-OVD: Cascaded Open-Vocabulary Detection of Small Objects Using Multi-Refined Region Proposal Network in Autonomous Driving","publication_year":2025,"publication_date":"2025-11-14","ids":{"openalex":"https://openalex.org/W4416214895","doi":"https://doi.org/10.1109/tmm.2025.3632649"},"language":null,"primary_location":{"id":"doi:10.1109/tmm.2025.3632649","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tmm.2025.3632649","pdf_url":null,"source":{"id":"https://openalex.org/S137030581","display_name":"IEEE Transactions on Multimedia","issn_l":"1520-9210","issn":["1520-9210","1941-0077"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Multimedia","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5003049136","display_name":"Zhenyu Fang","orcid":"https://orcid.org/0000-0003-0120-4585"},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Zhenyu Fang","raw_affiliation_strings":["School of Software, Northwestern Polytechnical University, Xi&#x0027;an, China"],"affiliations":[{"raw_affiliation_string":"School of Software, Northwestern Polytechnical University, Xi&#x0027;an, China","institution_ids":["https://openalex.org/I17145004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102861080","display_name":"Yulong Wu","orcid":"https://orcid.org/0009-0006-3249-1811"},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yulong Wu","raw_affiliation_strings":["School of Software, Northwestern Polytechnical University, Xi&#x0027;an, China"],"affiliations":[{"raw_affiliation_string":"School of Software, Northwestern Polytechnical University, Xi&#x0027;an, China","institution_ids":["https://openalex.org/I17145004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5084316677","display_name":"Jinchang Ren","orcid":"https://orcid.org/0000-0001-6116-3194"},"institutions":[{"id":"https://openalex.org/I4210122543","display_name":"Guangdong Polytechnic Normal University","ror":"https://ror.org/02pcb5m77","country_code":"CN","type":"education","lineage":["https://openalex.org/I4210122543"]},{"id":"https://openalex.org/I522815984","display_name":"Robert Gordon University","ror":"https://ror.org/04f0qj703","country_code":"GB","type":"education","lineage":["https://openalex.org/I522815984"]}],"countries":["CN","GB"],"is_corresponding":false,"raw_author_name":"Jinchang Ren","raw_affiliation_strings":["School of Computer Science, Guangdong Polytechnic Normal University, Guangzhou, China","National Subsea Centre, Robert Gordon University, Aberdeen, U.K"],"affiliations":[{"raw_affiliation_string":"School of Computer Science, Guangdong Polytechnic Normal University, Guangzhou, China","institution_ids":["https://openalex.org/I4210122543"]},{"raw_affiliation_string":"National Subsea Centre, Robert Gordon University, Aberdeen, U.K","institution_ids":["https://openalex.org/I522815984"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Jiangbin Zheng","orcid":"https://orcid.org/0000-0001-5249-2148"},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jiangbin Zheng","raw_affiliation_strings":["School of Software, Northwestern Polytechnical University, Xi&#x0027;an, China"],"affiliations":[{"raw_affiliation_string":"School of Software, Northwestern Polytechnical University, Xi&#x0027;an, China","institution_ids":["https://openalex.org/I17145004"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Yijun Yan","orcid":"https://orcid.org/0000-0003-0224-0078"},"institutions":[{"id":"https://openalex.org/I74872605","display_name":"China Southern Power Grid (China)","ror":"https://ror.org/03hkh9419","country_code":"CN","type":"company","lineage":["https://openalex.org/I74872605"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yijun Yan","raw_affiliation_strings":["Joint Laboratory of Power Remote Sensing Technology, Electric Power Research Institute, Yunnan Power Grid Company Ltd., Kunming, China"],"affiliations":[{"raw_affiliation_string":"Joint Laboratory of Power Remote Sensing Technology, Electric Power Research Institute, Yunnan Power Grid Company Ltd., Kunming, China","institution_ids":["https://openalex.org/I74872605"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100633239","display_name":"Lixiang Zhang","orcid":"https://orcid.org/0000-0001-5041-4453"},"institutions":[{"id":"https://openalex.org/I17145004","display_name":"Northwestern Polytechnical University","ror":"https://ror.org/01y0j0j86","country_code":"CN","type":"education","lineage":["https://openalex.org/I17145004"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Lixiang Zhang","raw_affiliation_strings":["School of Computer Science, Northwestern Polytechnical University, Xi&#x0027;an, China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science, Northwestern Polytechnical University, Xi&#x0027;an, China","institution_ids":["https://openalex.org/I17145004"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5003049136"],"corresponding_institution_ids":["https://openalex.org/I17145004"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.34065264,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"28","issue":null,"first_page":"757","last_page":"771"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.6128000020980835,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.6128000020980835,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.32120001316070557,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.016300000250339508,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.6291000247001648},{"id":"https://openalex.org/keywords/object-detection","display_name":"Object detection","score":0.6141999959945679},{"id":"https://openalex.org/keywords/convolution","display_name":"Convolution (computer science)","score":0.5730999708175659},{"id":"https://openalex.org/keywords/vocabulary","display_name":"Vocabulary","score":0.5597000122070312},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.5450000166893005},{"id":"https://openalex.org/keywords/semantic-feature","display_name":"Semantic feature","score":0.5418000221252441},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.4677000045776367},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.4530999958515167},{"id":"https://openalex.org/keywords/feature-extraction","display_name":"Feature extraction","score":0.45260000228881836},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.4438000023365021}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8716999888420105},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.6291000247001648},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.6141999959945679},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6074000000953674},{"id":"https://openalex.org/C45347329","wikidata":"https://www.wikidata.org/wiki/Q5166604","display_name":"Convolution (computer science)","level":3,"score":0.5730999708175659},{"id":"https://openalex.org/C2777601683","wikidata":"https://www.wikidata.org/wiki/Q6499736","display_name":"Vocabulary","level":2,"score":0.5597000122070312},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.5450000166893005},{"id":"https://openalex.org/C2781122975","wikidata":"https://www.wikidata.org/wiki/Q16928266","display_name":"Semantic feature","level":2,"score":0.5418000221252441},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.4677000045776367},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.4593999981880188},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.4530999958515167},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.45260000228881836},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.4438000023365021},{"id":"https://openalex.org/C34146451","wikidata":"https://www.wikidata.org/wiki/Q5048094","display_name":"Cascade","level":2,"score":0.4375999867916107},{"id":"https://openalex.org/C94915269","wikidata":"https://www.wikidata.org/wiki/Q1834857","display_name":"Detector","level":2,"score":0.43160000443458557},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.4113999903202057},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.38089999556541443},{"id":"https://openalex.org/C18555067","wikidata":"https://www.wikidata.org/wiki/Q8375051","display_name":"Joint (building)","level":2,"score":0.3628999888896942},{"id":"https://openalex.org/C19768560","wikidata":"https://www.wikidata.org/wiki/Q320727","display_name":"Dependency (UML)","level":2,"score":0.34549999237060547},{"id":"https://openalex.org/C160633673","wikidata":"https://www.wikidata.org/wiki/Q355198","display_name":"Pixel","level":2,"score":0.3398999869823456},{"id":"https://openalex.org/C2778493491","wikidata":"https://www.wikidata.org/wiki/Q7449072","display_name":"Semantic matching","level":3,"score":0.33480000495910645},{"id":"https://openalex.org/C86034646","wikidata":"https://www.wikidata.org/wiki/Q474311","display_name":"Semantic gap","level":4,"score":0.3328000009059906},{"id":"https://openalex.org/C81363708","wikidata":"https://www.wikidata.org/wiki/Q17084460","display_name":"Convolutional neural network","level":2,"score":0.32179999351501465},{"id":"https://openalex.org/C2993807640","wikidata":"https://www.wikidata.org/wiki/Q103709453","display_name":"Attention network","level":2,"score":0.31779998540878296},{"id":"https://openalex.org/C68859911","wikidata":"https://www.wikidata.org/wiki/Q1503724","display_name":"Pattern matching","level":2,"score":0.31369999051094055},{"id":"https://openalex.org/C124504099","wikidata":"https://www.wikidata.org/wiki/Q56933","display_name":"Image segmentation","level":3,"score":0.2955000102519989},{"id":"https://openalex.org/C2777735758","wikidata":"https://www.wikidata.org/wiki/Q817765","display_name":"Path (computing)","level":2,"score":0.2768000066280365},{"id":"https://openalex.org/C175309249","wikidata":"https://www.wikidata.org/wiki/Q725864","display_name":"Pipeline transport","level":2,"score":0.27649998664855957},{"id":"https://openalex.org/C16345878","wikidata":"https://www.wikidata.org/wiki/Q107472979","display_name":"Orientation (vector space)","level":2,"score":0.27079999446868896},{"id":"https://openalex.org/C126422989","wikidata":"https://www.wikidata.org/wiki/Q93586","display_name":"Feature detection (computer vision)","level":4,"score":0.27070000767707825},{"id":"https://openalex.org/C1667742","wikidata":"https://www.wikidata.org/wiki/Q10927554","display_name":"Image retrieval","level":3,"score":0.2700999975204468},{"id":"https://openalex.org/C75294576","wikidata":"https://www.wikidata.org/wiki/Q5165192","display_name":"Contextual image classification","level":3,"score":0.26170000433921814},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.26019999384880066},{"id":"https://openalex.org/C2983787585","wikidata":"https://www.wikidata.org/wiki/Q93586","display_name":"Feature matching","level":3,"score":0.25870001316070557},{"id":"https://openalex.org/C9417928","wikidata":"https://www.wikidata.org/wiki/Q1070689","display_name":"Image processing","level":3,"score":0.25450000166893005},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.2531999945640564}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tmm.2025.3632649","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tmm.2025.3632649","pdf_url":null,"source":{"id":"https://openalex.org/S137030581","display_name":"IEEE Transactions on Multimedia","issn_l":"1520-9210","issn":["1520-9210","1941-0077"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Multimedia","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G8356228098","display_name":null,"funder_award_id":"62202385","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G946956678","display_name":null,"funder_award_id":"G2021KY05103","funder_id":"https://openalex.org/F4320335787","funder_display_name":"Fundamental Research Funds for the Central Universities"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320335787","display_name":"Fundamental Research Funds for the Central Universities","ror":null}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Although":[0],"text":[1],"information":[2,21,119],"has":[3,22,220],"aided":[4],"existing":[5,67,131],"models":[6],"to":[7,24,94,115,154],"achieve":[8],"promising":[9],"results":[10,158],"in":[11,27,46,73],"open":[12],"vocabulary":[13],"object":[14],"detection":[15,30,69],"(OVD),":[16],"the":[17,25,96,117,124,157,160,167,171,175,183,216,221,234],"lack":[18],"of":[19,101,120,159],"semantic":[20,34,118],"led":[23],"difficulty":[26],"small":[28,77,102,121],"objects":[29,122],"(SOD).":[31],"Moreover,":[32],"such":[33],"gap":[35],"also":[36],"causes":[37],"failure":[38],"when":[39],"matching":[40],"texts":[41],"and":[42,98,174,194,208,227,242],"image":[43,172],"features,":[44],"resulting":[45],"false":[47,99],"negative":[48],"instances":[49],"being":[50],"detected.":[51],"To":[52],"address":[53],"these":[54,144],"issues,":[55],"we":[56,81],"propose":[57],"a":[58,83,90,105,147],"Cascade":[59],"Open":[60],"Vocabulary":[61],"Detector":[62],"(Cas-OVD),":[63],"which":[64],"builds":[65],"upon":[66],"multi-stage":[68],"pipelines":[70],"but":[71],"specializes":[72],"text-vision":[74],"alignment":[75,149],"for":[76,139],"objects.":[78,103],"In":[79],"particular,":[80],"adapt":[82],"multi-refined":[84],"region":[85],"proposal":[86],"network,":[87],"guided":[88],"by":[89,202,236],"non-sampled":[91],"anchor":[92],"strategy,":[93],"reduce":[95],"missing":[97],"detections":[100],"Meanwhile,":[104],"deformable":[106],"convolution":[107],"network":[108],"based":[109],"feature":[110,168],"conversion":[111],"module":[112],"is":[113],"proposed":[114],"enhance":[116,166],"even":[123],"potential":[125],"ones":[126],"with":[127],"low":[128],"confidence.":[129],"Unlike":[130],"methods":[132],"that":[133],"rely":[134],"on":[135,156],"coarse-grained":[136],"image-based":[137],"features":[138,145],"image-text":[140],"matching,":[141],"Cas-OVD":[142,187,219],"refines":[143],"through":[146,178],"cascade":[148],"process,":[150],"allowing":[151],"each":[152],"stage":[153],"build":[155],"previous":[161],"one.":[162],"This":[163],"can":[164],"progressively":[165],"correlation":[169],"between":[170],"regions":[173],"textual":[176],"descriptions":[177],"successive":[179],"error":[180],"correction.":[181],"On":[182,215],"joint":[184],"BDD100K-SODA-D":[185],"dataset,":[186,218],"achieved":[188],"17.95%":[189],"AP<inline-formula":[190,196,204,210,223,229,238,244],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[191,197,205,211,224,230,239,245],"xmlns:xlink=\"http://www.w3.org/1999/xlink\"><tex-math":[192,198,206,212,225,231,240,246],"notation=\"LaTeX\">$_{\\mathrm{all}}$</tex-math></inline-formula>":[193,207,226,241],"14.6%":[195],"notation=\"LaTeX\">$_{\\mathrm{s}}$</tex-math></inline-formula>,":[199,213,232,247],"outperforming":[200],"RegionCLIP":[201,235],"3.5%":[203],"3.0%":[209],"respectively.":[214,248],"OV_COCO":[217],"32.71%":[222],"17.26%":[228],"surpassing":[233],"6.6%":[237],"6.1%":[243]},"counts_by_year":[],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-11-14T00:00:00"}
