{"id":"https://openalex.org/W7123339144","doi":"https://doi.org/10.1109/tcsvt.2026.3652517","title":"Aligning Vision-Language Model With Fine-Grained Semantics for Open-Vocabulary Segmentation","display_name":"Aligning Vision-Language Model With Fine-Grained Semantics for Open-Vocabulary Segmentation","publication_year":2026,"publication_date":"2026-01-12","ids":{"openalex":"https://openalex.org/W7123339144","doi":"https://doi.org/10.1109/tcsvt.2026.3652517"},"language":null,"primary_location":{"id":"doi:10.1109/tcsvt.2026.3652517","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tcsvt.2026.3652517","pdf_url":null,"source":{"id":"https://openalex.org/S115173108","display_name":"IEEE Transactions on Circuits and Systems for Video Technology","issn_l":"1051-8215","issn":["1051-8215","1558-2205"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Circuits and Systems for Video Technology","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5093549325","display_name":"Yong Xien Chng","orcid":null},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yong Xien Chng","raw_affiliation_strings":["Department of Automation, BNRist, Tsinghua University, Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Automation, BNRist, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5044682489","display_name":"Xuchong Qiu","orcid":null},"institutions":[{"id":"https://openalex.org/I4210091347","display_name":"Robert Bosch (China)","ror":"https://ror.org/00cedkn40","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210091347","https://openalex.org/I889804353"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xuchong Qiu","raw_affiliation_strings":["Bosch Corporate Research, Shanghai, China"],"raw_orcid":"https://orcid.org/0009-0000-9191-5588","affiliations":[{"raw_affiliation_string":"Bosch Corporate Research, Shanghai, China","institution_ids":["https://openalex.org/I4210091347"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5045569962","display_name":"Yizeng Han","orcid":"https://orcid.org/0000-0001-5706-8784"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yizeng Han","raw_affiliation_strings":["Department of Automation, BNRist, Tsinghua University, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0001-5706-8784","affiliations":[{"raw_affiliation_string":"Department of Automation, BNRist, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5061541632","display_name":"Kai Ding","orcid":null},"institutions":[{"id":"https://openalex.org/I4210091347","display_name":"Robert Bosch (China)","ror":"https://ror.org/00cedkn40","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210091347","https://openalex.org/I889804353"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Kai Ding","raw_affiliation_strings":["Bosch Corporate Research, Shanghai, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Bosch Corporate Research, Shanghai, China","institution_ids":["https://openalex.org/I4210091347"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100543848","display_name":"Wan Ding","orcid":null},"institutions":[{"id":"https://openalex.org/I4210091347","display_name":"Robert Bosch (China)","ror":"https://ror.org/00cedkn40","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210091347","https://openalex.org/I889804353"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wan Ding","raw_affiliation_strings":["Bosch Corporate Research, Shanghai, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Bosch Corporate Research, Shanghai, China","institution_ids":["https://openalex.org/I4210091347"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5122884468","display_name":"Gao Huang","orcid":null},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Gao Huang","raw_affiliation_strings":["Department of Automation, BNRist, Tsinghua University, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0002-7251-0988","affiliations":[{"raw_affiliation_string":"Department of Automation, BNRist, Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.04669338,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"36","issue":"5","first_page":"7368","last_page":"7381"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9617000222206116,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9617000222206116,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.004000000189989805,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.0031999999191612005,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.7817000150680542},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.6751000285148621},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5698999762535095},{"id":"https://openalex.org/keywords/image-segmentation","display_name":"Image segmentation","score":0.5200999975204468},{"id":"https://openalex.org/keywords/encoding","display_name":"Encoding (memory)","score":0.5020999908447266},{"id":"https://openalex.org/keywords/class","display_name":"Class (philosophy)","score":0.5015000104904175},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.46309998631477356},{"id":"https://openalex.org/keywords/task-analysis","display_name":"Task analysis","score":0.413100004196167}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8543000221252441},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.7817000150680542},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.6751000285148621},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6330999732017517},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5698999762535095},{"id":"https://openalex.org/C124504099","wikidata":"https://www.wikidata.org/wiki/Q56933","display_name":"Image segmentation","level":3,"score":0.5200999975204468},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.5020999908447266},{"id":"https://openalex.org/C2777212361","wikidata":"https://www.wikidata.org/wiki/Q5127848","display_name":"Class (philosophy)","level":2,"score":0.5015000104904175},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.46309998631477356},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.413100004196167},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.4115000069141388},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.40070000290870667},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3937000036239624},{"id":"https://openalex.org/C9417928","wikidata":"https://www.wikidata.org/wiki/Q1070689","display_name":"Image processing","level":3,"score":0.3610999882221222},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.33059999346733093},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.326200008392334},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2935999929904938},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.2761000096797943},{"id":"https://openalex.org/C90312973","wikidata":"https://www.wikidata.org/wiki/Q7449052","display_name":"Semantic data model","level":2,"score":0.26910001039505005},{"id":"https://openalex.org/C27511587","wikidata":"https://www.wikidata.org/wiki/Q2178623","display_name":"Spatial relation","level":2,"score":0.257999986410141},{"id":"https://openalex.org/C195807954","wikidata":"https://www.wikidata.org/wiki/Q1662562","display_name":"Information extraction","level":2,"score":0.2533999979496002},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.2513999938964844},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.2506999969482422}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tcsvt.2026.3652517","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tcsvt.2026.3652517","pdf_url":null,"source":{"id":"https://openalex.org/S115173108","display_name":"IEEE Transactions on Circuits and Systems for Video Technology","issn_l":"1051-8215","issn":["1051-8215","1558-2205"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Circuits and Systems for Video Technology","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.7483156323432922}],"awards":[{"id":"https://openalex.org/G2438210826","display_name":null,"funder_award_id":"62321005","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G4010507581","display_name":null,"funder_award_id":"U2541227","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G5313418749","display_name":null,"funder_award_id":"42327901","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G7410564107","display_name":null,"funder_award_id":"62276150","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G7612548970","display_name":null,"funder_award_id":"U24B20173","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G8215096727","display_name":null,"funder_award_id":"2021ZD0140407","funder_id":"https://openalex.org/F4320335777","funder_display_name":"National Key Research and Development Program of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320335777","display_name":"National Key Research and Development Program of China","ror":null}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Pre-trained":[0],"Vision-Language":[1],"Models":[2],"(VLMs)":[3],"are":[4],"often":[5,61],"used":[6],"to":[7,63,76,107,120,141,146,154,176,211,227,266],"tackle":[8],"the":[9,18,43,78,97,109,118,129,133,139,143,147,160,183,198,203,209,217,222,231],"challenging":[10],"task":[11,105],"of":[12,22,45,113],"Open-vocabulary":[13],"Segmentation":[14],"(OVS).":[15],"To":[16,92],"preserve":[17],"valuable":[19],"pre-trained":[20,55],"knowledge":[21],"VLM-based":[23,173],"mask":[24,51],"classifiers,":[25],"most":[26],"existing":[27],"approaches":[28],"freeze":[29],"their":[30,178],"parameters":[31],"during":[32],"training.":[33],"However,":[34],"our":[35,240],"comprehensive":[36],"analysis":[37],"identifies":[38],"a":[39,102,122,188],"previously":[40],"overlooked":[41],"limitation:":[42],"performance":[44,256],"OVS":[46,253],"is":[47,164],"primarily":[48],"constrained":[49],"by":[50,90,262],"classification.":[52],"Specifically,":[53],"VLMs":[54],"using":[56,128],"globally":[57],"pooled":[58],"image-text":[59],"representations":[60],"fail":[62],"capture":[64],"localized,":[65],"region-specific":[66],"semantics":[67,196],"necessary":[68],"for":[69],"accurate":[70],"segmentation.":[71],"This":[72,137,207],"discovery":[73],"motivates":[74],"us":[75],"improve":[77,177],"fine-grained":[79,195],"alignment":[80],"between":[81],"word-level":[82],"text":[83,199,219,229],"features":[84,88,131],"and":[85,132,156,167,258,272],"pixel-level":[86],"image":[87,130,149],"extracted":[89],"VLMs.":[91],"this":[93],"end,":[94],"we":[95,181],"propose":[96],"Fine-grained":[98],"Semantic":[99],"Reconstruction":[100],"(FiSeR),":[101],"novel":[103],"auxiliary":[104],"designed":[106],"enrich":[108],"spatial":[110,233],"semantic":[111],"detail":[112],"visual":[114,204,214],"features.":[115],"FiSeR":[116,163],"trains":[117],"model":[119,140,210],"predict":[121],"randomly":[123],"masked":[124],"target":[125,218],"class":[126],"label":[127],"remaining":[134],"unmasked":[135],"text.":[136],"encourages":[138],"link":[142],"specific":[144],"words":[145],"corresponding":[148],"regions,":[150],"improving":[151,224,255],"its":[152,213,225],"ability":[153,226],"recognize":[155],"segment":[157],"objects":[158],"at":[159],"region":[161],"level.":[162],"broadly":[165],"applicable":[166],"can":[168],"be":[169,275],"incorporated":[170],"into":[171],"various":[172],"segmentation":[174],"models":[175],"performance.":[179],"Additionally,":[180],"introduce":[182],"Text-guided":[184],"Visual":[185],"Aligner":[186],"(TeVA),":[187],"lightweight":[189],"network":[190],"module":[191],"that":[192],"injects":[193],"relevant":[194],"from":[197,221],"information":[200],"early":[201],"in":[202,239],"encoding":[205],"process.":[206],"enables":[208],"condition":[212],"processing":[215],"on":[216],"categories":[220],"beginning,":[223],"associate":[228],"with":[230],"correct":[232],"regions.":[234],"Collectively,":[235],"these":[236],"innovations":[237],"culminate":[238],"proposed":[241],"framework":[242],"FOV-Seg.":[243],"Notably,":[244],"FOV-Seg":[245],"achieves":[246],"new":[247],"state-of-the-art":[248],"results":[249],"across":[250],"multiple":[251],"representative":[252],"benchmarks,":[254],"consistently":[257],"reducing":[259],"training":[260],"costs":[261],"nearly":[263],"5\u00d7":[264],"compared":[265],"previous":[267],"best":[268],"methods.":[269],"Our":[270],"code":[271],"data":[273],"will":[274],"released.":[276]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-01-14T00:00:00"}
