{"id":"https://openalex.org/W4405626818","doi":"https://doi.org/10.1109/iccv51701.2025.01878","title":"Instructseg: Unifying Instructed Visual Segmentation with Multi-Modal Large Language Models","display_name":"Instructseg: Unifying Instructed Visual Segmentation with Multi-Modal Large Language Models","publication_year":2025,"publication_date":"2025-10-19","ids":{"openalex":"https://openalex.org/W4405626818","doi":"https://doi.org/10.1109/iccv51701.2025.01878"},"language":"en","primary_location":{"id":"doi:10.1109/iccv51701.2025.01878","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iccv51701.2025.01878","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/CVF International Conference on Computer Vision (ICCV)","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["arxiv","crossref","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2412.14006","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100390457","display_name":"Cong Wei","orcid":"https://orcid.org/0000-0002-1492-7240"},"institutions":[{"id":"https://openalex.org/I4210114105","display_name":"Tsinghua\u2013Berkeley Shenzhen Institute","ror":"https://ror.org/02hhwwz98","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210114105","https://openalex.org/I95457486","https://openalex.org/I99065089"]},{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Cong Wei","raw_affiliation_strings":["Tsinghua University,Tsinghua Shenzhen International Graduate School,Shenzhen,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tsinghua University,Tsinghua Shenzhen International Graduate School,Shenzhen,China","institution_ids":["https://openalex.org/I4210114105","https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5087137290","display_name":"Yujie Zhong","orcid":"https://orcid.org/0009-0003-6481-3916"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yujie Zhong","raw_affiliation_strings":["Meituan Inc.,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Meituan Inc.,China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5071239924","display_name":"Haoxian Tan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Haoxian Tan","raw_affiliation_strings":["Meituan Inc.,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Meituan Inc.,China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5072520602","display_name":"Yingsen Zeng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yingsen Zeng","raw_affiliation_strings":["Meituan Inc.,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Meituan Inc.,China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100705504","display_name":"Yong Liu","orcid":"https://orcid.org/0000-0002-2510-9470"},"institutions":[{"id":"https://openalex.org/I4210114105","display_name":"Tsinghua\u2013Berkeley Shenzhen Institute","ror":"https://ror.org/02hhwwz98","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210114105","https://openalex.org/I95457486","https://openalex.org/I99065089"]},{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yong Liu","raw_affiliation_strings":["Tsinghua University,Tsinghua Shenzhen International Graduate School,Shenzhen,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tsinghua University,Tsinghua Shenzhen International Graduate School,Shenzhen,China","institution_ids":["https://openalex.org/I4210114105","https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5072745289","display_name":"Zheng Zhao","orcid":"https://orcid.org/0000-0002-0916-0444"},"institutions":[{"id":"https://openalex.org/I4210114105","display_name":"Tsinghua\u2013Berkeley Shenzhen Institute","ror":"https://ror.org/02hhwwz98","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210114105","https://openalex.org/I95457486","https://openalex.org/I99065089"]},{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hongfa Wang","raw_affiliation_strings":["Tsinghua University,Tsinghua Shenzhen International Graduate School,Shenzhen,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tsinghua University,Tsinghua Shenzhen International Graduate School,Shenzhen,China","institution_ids":["https://openalex.org/I4210114105","https://openalex.org/I99065089"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5020953714","display_name":"Yujiu Yang","orcid":"https://orcid.org/0000-0002-6427-1024"},"institutions":[{"id":"https://openalex.org/I4210114105","display_name":"Tsinghua\u2013Berkeley Shenzhen Institute","ror":"https://ror.org/02hhwwz98","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210114105","https://openalex.org/I95457486","https://openalex.org/I99065089"]},{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yujiu Yang","raw_affiliation_strings":["Tsinghua University,Tsinghua Shenzhen International Graduate School,Shenzhen,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tsinghua University,Tsinghua Shenzhen International Graduate School,Shenzhen,China","institution_ids":["https://openalex.org/I4210114105","https://openalex.org/I99065089"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5100390457"],"corresponding_institution_ids":["https://openalex.org/I4210114105","https://openalex.org/I99065089"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.00134224,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"20193","last_page":"20203"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9879000186920166,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.9811999797821045,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/modal","display_name":"Modal","score":0.676762580871582},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.60245680809021},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.5983800292015076},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.46066609025001526},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.41898152232170105},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.3485472798347473},{"id":"https://openalex.org/keywords/philosophy","display_name":"Philosophy","score":0.09274712204933167}],"concepts":[{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.676762580871582},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.60245680809021},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.5983800292015076},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.46066609025001526},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.41898152232170105},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.3485472798347473},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.09274712204933167},{"id":"https://openalex.org/C188027245","wikidata":"https://www.wikidata.org/wiki/Q750446","display_name":"Polymer chemistry","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1109/iccv51701.2025.01878","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iccv51701.2025.01878","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/CVF International Conference on Computer Vision (ICCV)","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2412.14006","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2412.14006","pdf_url":"https://arxiv.org/pdf/2412.14006","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2412.14006","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2412.14006","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2412.14006","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2412.14006","pdf_url":"https://arxiv.org/pdf/2412.14006","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G5436302066","display_name":null,"funder_award_id":"2024YFB2808903","funder_id":"https://openalex.org/F4320335777","funder_display_name":"National Key Research and Development Program of China"}],"funders":[{"id":"https://openalex.org/F4320335777","display_name":"National Key Research and Development Program of China","ror":null}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4405626818.pdf","grobid_xml":"https://content.openalex.org/works/W4405626818.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2379392295","https://openalex.org/W3160965418","https://openalex.org/W4379231730","https://openalex.org/W613940353","https://openalex.org/W2320915480","https://openalex.org/W4389858081","https://openalex.org/W2362990116","https://openalex.org/W2381300099","https://openalex.org/W2501551404","https://openalex.org/W3204019825"],"abstract_inverted_index":{"Boosted":[0],"by":[1],"Multi-modal":[2],"Large":[3],"Language":[4],"Models":[5],"(MLLMs),":[6],"text-guided":[7],"universal":[8],"segmentation":[9,53,56,75,136,140],"models":[10],"for":[11,29,80],"the":[12,33,49,59],"image":[13,60,133],"and":[14,38,54,61,92,113,124,134,142],"video":[15,62,87,100,135],"domains":[16],"have":[17],"made":[18],"rapid":[19],"progress":[20],"recently.":[21],"However,":[22],"these":[23,41],"methods":[24,144],"are":[25],"often":[26],"developed":[27],"separately":[28],"specific":[30],"domains,":[31],"overlooking":[32],"similarities":[34],"in":[35],"task":[36],"settings":[37],"solutions":[39],"across":[40,131],"two":[42],"areas.":[43],"In":[44],"this":[45],"paper,":[46],"we":[47,70,83,103],"define":[48],"union":[50],"of":[51],"referring":[52],"reasoning":[55],"at":[57,153],"both":[58,139],"levels":[63],"as":[64],"Instructed":[65],"Visual":[66],"Segmentation":[67],"(IVS).":[68],"Correspondingly,":[69],"propose":[71],"InstructSeg,":[72],"an":[73,85],"end-to-end":[74,125],"pipeline":[76],"equipped":[77],"with":[78,117,145],"MLLMs":[79],"IVS.":[81],"Specifically,":[82],"employ":[84],"object-aware":[86],"perceiver":[88],"to":[89,109],"extract":[90],"temporal":[91],"object":[93],"information":[94,116],"from":[95],"reference":[96],"frames,":[97],"facilitating":[98],"comprehensive":[99],"understanding.":[101],"Additionally,":[102],"introduce":[104],"vision-guided":[105],"multi-granularity":[106],"text":[107,115],"fusion":[108],"better":[110],"integrate":[111],"global":[112],"detailed":[114],"fine-grained":[118],"visual":[119],"guidance.":[120],"By":[121],"leveraging":[122],"multi-task":[123],"training,":[126],"InstructSeg":[127],"demonstrates":[128],"superior":[129],"performance":[130],"diverse":[132],"tasks,":[137],"surpassing":[138],"specialists":[141],"MLLM-based":[143],"a":[146],"single":[147],"model.":[148],"Our":[149],"code":[150],"is":[151],"available":[152],"https://github.com/congvvc/InstructSeg.":[154]},"counts_by_year":[],"updated_date":"2026-05-06T06:03:25.996018","created_date":"2025-10-10T00:00:00"}
