{"id":"https://openalex.org/W4401931559","doi":"https://doi.org/10.1007/s41095-024-0430-4","title":"CLIP-SP: Vision-language model with adaptive prompting for scene parsing","display_name":"CLIP-SP: Vision-language model with adaptive prompting for scene parsing","publication_year":2024,"publication_date":"2024-08-01","ids":{"openalex":"https://openalex.org/W4401931559","doi":"https://doi.org/10.1007/s41095-024-0430-4"},"language":"en","primary_location":{"id":"doi:10.1007/s41095-024-0430-4","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s41095-024-0430-4","pdf_url":"https://link.springer.com/content/pdf/10.1007/s41095-024-0430-4.pdf","source":{"id":"https://openalex.org/S2487656537","display_name":"Computational Visual Media","issn_l":"2096-0433","issn":["2096-0433","2096-0662"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310319965","host_organization_name":"Springer Nature","host_organization_lineage":["https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Computational Visual Media","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","doaj"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://link.springer.com/content/pdf/10.1007/s41095-024-0430-4.pdf","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5004276261","display_name":"J.-L. F. Li","orcid":"https://orcid.org/0009-0007-3818-8554"},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Jiaao Li","raw_affiliation_strings":["School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing 100876, China","School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing, 100876, China"],"affiliations":[{"raw_affiliation_string":"School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing 100876, China","institution_ids":["https://openalex.org/I139759216"]},{"raw_affiliation_string":"School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing, 100876, China","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100755268","display_name":"Yixiang Huang","orcid":"https://orcid.org/0000-0003-0809-7890"},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yixiang Huang","raw_affiliation_strings":["School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing 100876, China","School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing, 100876, China"],"affiliations":[{"raw_affiliation_string":"School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing 100876, China","institution_ids":["https://openalex.org/I139759216"]},{"raw_affiliation_string":"School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing, 100876, China","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5077902436","display_name":"Ming Wu","orcid":"https://orcid.org/0000-0001-8390-5398"},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ming Wu","raw_affiliation_strings":["School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing 100876, China","School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing, 100876, China"],"affiliations":[{"raw_affiliation_string":"School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing 100876, China","institution_ids":["https://openalex.org/I139759216"]},{"raw_affiliation_string":"School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing, 100876, China","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100392947","display_name":"Bin Zhang","orcid":"https://orcid.org/0009-0008-7206-144X"},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Bin Zhang","raw_affiliation_strings":["School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing 100876, China","School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing, 100876, China"],"affiliations":[{"raw_affiliation_string":"School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing 100876, China","institution_ids":["https://openalex.org/I139759216"]},{"raw_affiliation_string":"School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing, 100876, China","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103050510","display_name":"Xu Ji","orcid":null},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xu Ji","raw_affiliation_strings":["School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing 100876, China","School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing, 100876, China"],"affiliations":[{"raw_affiliation_string":"School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing 100876, China","institution_ids":["https://openalex.org/I139759216"]},{"raw_affiliation_string":"School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing, 100876, China","institution_ids":["https://openalex.org/I139759216"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5072209676","display_name":"Chuang Zhang","orcid":"https://orcid.org/0000-0001-6499-8509"},"institutions":[{"id":"https://openalex.org/I139759216","display_name":"Beijing University of Posts and Telecommunications","ror":"https://ror.org/04w9fbh59","country_code":"CN","type":"education","lineage":["https://openalex.org/I139759216"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chuang Zhang","raw_affiliation_strings":["School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing 100876, China","School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing, 100876, China"],"affiliations":[{"raw_affiliation_string":"School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing 100876, China","institution_ids":["https://openalex.org/I139759216"]},{"raw_affiliation_string":"School of Artificial Intelligence, Beijing University of Posts and Telecommunications, Beijing, 100876, China","institution_ids":["https://openalex.org/I139759216"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5004276261"],"corresponding_institution_ids":["https://openalex.org/I139759216"],"apc_list":null,"apc_paid":null,"fwci":1.2682,"has_fulltext":true,"cited_by_count":5,"citation_normalized_percentile":{"value":0.81064438,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":97,"max":99},"biblio":{"volume":"10","issue":"4","first_page":"741","last_page":"752"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.9951000213623047,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9926000237464905,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/parsing","display_name":"Parsing","score":0.8566087484359741},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8451125025749207},{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.7499487400054932},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.7256537675857544},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.6854804754257202},{"id":"https://openalex.org/keywords/pixel","display_name":"Pixel","score":0.566068172454834},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.4668600857257843},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.44675469398498535},{"id":"https://openalex.org/keywords/constraint","display_name":"Constraint (computer-aided design)","score":0.429951936006546},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.4144335985183716},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.37619221210479736},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.33430013060569763}],"concepts":[{"id":"https://openalex.org/C186644900","wikidata":"https://www.wikidata.org/wiki/Q194152","display_name":"Parsing","level":2,"score":0.8566087484359741},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8451125025749207},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.7499487400054932},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.7256537675857544},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.6854804754257202},{"id":"https://openalex.org/C160633673","wikidata":"https://www.wikidata.org/wiki/Q355198","display_name":"Pixel","level":2,"score":0.566068172454834},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.4668600857257843},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.44675469398498535},{"id":"https://openalex.org/C2776036281","wikidata":"https://www.wikidata.org/wiki/Q48769818","display_name":"Constraint (computer-aided design)","level":2,"score":0.429951936006546},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4144335985183716},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.37619221210479736},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.33430013060569763},{"id":"https://openalex.org/C78519656","wikidata":"https://www.wikidata.org/wiki/Q101333","display_name":"Mechanical engineering","level":1,"score":0.0},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1007/s41095-024-0430-4","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s41095-024-0430-4","pdf_url":"https://link.springer.com/content/pdf/10.1007/s41095-024-0430-4.pdf","source":{"id":"https://openalex.org/S2487656537","display_name":"Computational Visual Media","issn_l":"2096-0433","issn":["2096-0433","2096-0662"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310319965","host_organization_name":"Springer Nature","host_organization_lineage":["https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Computational Visual Media","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1007/s41095-024-0430-4","is_oa":true,"landing_page_url":"https://doi.org/10.1007/s41095-024-0430-4","pdf_url":"https://link.springer.com/content/pdf/10.1007/s41095-024-0430-4.pdf","source":{"id":"https://openalex.org/S2487656537","display_name":"Computational Visual Media","issn_l":"2096-0433","issn":["2096-0433","2096-0662"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310319965","host_organization_name":"Springer Nature","host_organization_lineage":["https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Nature"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Computational Visual Media","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4401931559.pdf","grobid_xml":"https://content.openalex.org/works/W4401931559.grobid-xml"},"referenced_works_count":16,"referenced_works":["https://openalex.org/W1583837637","https://openalex.org/W2507296351","https://openalex.org/W2884822772","https://openalex.org/W2912083425","https://openalex.org/W2964309882","https://openalex.org/W3095707208","https://openalex.org/W3107634219","https://openalex.org/W3109301572","https://openalex.org/W3135367836","https://openalex.org/W4226021361","https://openalex.org/W4234552385","https://openalex.org/W4236965008","https://openalex.org/W4312744505","https://openalex.org/W4312960937","https://openalex.org/W4386076681","https://openalex.org/W6600855501"],"related_works":["https://openalex.org/W3135697610","https://openalex.org/W579810227","https://openalex.org/W2085033728","https://openalex.org/W4285411112","https://openalex.org/W2171299904","https://openalex.org/W2952780262","https://openalex.org/W2979495269","https://openalex.org/W2392917763","https://openalex.org/W1647606319","https://openalex.org/W2922442631"],"abstract_inverted_index":{"We":[0,54],"present":[1],"a":[2,7,64,84,150,193],"novel":[3,8],"framework,":[4],"CLIP-SP,":[5],"and":[6,91,94,114,134,154],"adaptive":[9,127],"prompt":[10],"method":[11,118,148,168],"to":[12,98],"leverage":[13],"pre-trained":[14,37,41,177],"knowledge":[15],"from":[16],"CLIP":[17,36],"for":[18,50,129,145],"scene":[19,52,120,164],"parsing.":[20,53],"Our":[21,147,167,179],"approach":[22],"addresses":[23],"the":[24,30,66,100,105,111,138,143,156,163,174],"limitations":[25],"of":[26,132,158,186],"DenseCLIP,":[27],"which":[28,122],"demonstrates":[29],"superior":[31],"image":[32],"segmentation":[33],"provided":[34],"by":[35,173],"models":[38],"over":[39],"ImageNet":[40],"models,":[42],"but":[43],"struggles":[44],"with":[45,76],"rough":[46],"pixel-text":[47,67],"score":[48,68],"maps":[49],"complex":[51],"argue":[55],"that,":[56],"as":[57,126],"they":[58],"contain":[59],"all":[60],"textual":[61],"information":[62],"in":[63,104,162],"dataset,":[65],"maps,":[69],"i.e.,":[70],"dense":[71],"prompts,":[72],"are":[73],"inevitably":[74],"mixed":[75],"noise.":[77],"To":[78],"overcome":[79],"this":[80],"challenge,":[81],"we":[82,88],"propose":[83],"two-step":[85],"method.":[86],"Firstly,":[87],"extract":[89],"visual":[90,139],"language":[92],"features":[93,140],"perform":[95],"multi-label":[96],"classification":[97],"identify":[99],"most":[101],"likely":[102],"categories":[103,113,160],"input":[106],"images.":[107],"Secondly,":[108],"based":[109],"on":[110,152,190],"top-k":[112],"confidence":[115],"scores,":[116],"our":[117],"generates":[119],"tokens":[121],"can":[123],"be":[124],"treated":[125],"prompts":[128,153],"implicit":[130],"modeling":[131],"scenes,":[133],"incorporates":[135],"them":[136],"into":[137,142],"fed":[141],"decoder":[144],"segmentation.":[146],"imposes":[149],"constraint":[151],"suppresses":[155],"probability":[157],"irrelevant":[159],"appearing":[161],"parsing":[165],"results.":[166],"achieves":[169],"competitive":[170],"performance,":[171],"limited":[172],"available":[175],"visual-language":[176],"models.":[178],"CLIP-SP":[180],"performs":[181],"1.14%":[182],"better":[183],"(in":[184],"terms":[185],"mIoU)":[187],"than":[188],"DenseCLIP":[189],"ADE20K,":[191],"using":[192],"ResNet-50":[194],"backbone.":[195]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":4}],"updated_date":"2026-03-20T23:20:44.827607","created_date":"2025-10-10T00:00:00"}
