{"id":"https://openalex.org/W4414054289","doi":"https://doi.org/10.1109/tpami.2025.3607387","title":"Improving Generalized Visual Grounding With Instance-Aware Joint Learning","display_name":"Improving Generalized Visual Grounding With Instance-Aware Joint Learning","publication_year":2025,"publication_date":"2025-09-08","ids":{"openalex":"https://openalex.org/W4414054289","doi":"https://doi.org/10.1109/tpami.2025.3607387","pmid":"https://pubmed.ncbi.nlm.nih.gov/40920536"},"language":"en","primary_location":{"id":"doi:10.1109/tpami.2025.3607387","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpami.2025.3607387","pdf_url":null,"source":{"id":"https://openalex.org/S199944782","display_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","issn_l":"0162-8828","issn":["0162-8828","1939-3539","2160-9292"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320439","host_organization_name":"IEEE Computer Society","host_organization_lineage":["https://openalex.org/P4310320439","https://openalex.org/P4310319808"],"host_organization_lineage_names":["IEEE Computer Society","Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["arxiv","crossref","pubmed"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2509.13747","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Ming Dai","orcid":"https://orcid.org/0009-0004-6133-0035"},"institutions":[{"id":"https://openalex.org/I76569877","display_name":"Southeast University","ror":"https://ror.org/04ct4d772","country_code":"CN","type":"education","lineage":["https://openalex.org/I76569877"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Ming Dai","raw_affiliation_strings":["School of Automation, Southeast University, Nanjing, China","School of Automation, Southeast University, China"],"affiliations":[{"raw_affiliation_string":"School of Automation, Southeast University, Nanjing, China","institution_ids":["https://openalex.org/I76569877"]},{"raw_affiliation_string":"School of Automation, Southeast University, China","institution_ids":["https://openalex.org/I76569877"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5068926305","display_name":"Wenxuan Cheng","orcid":"https://orcid.org/0000-0003-1734-5139"},"institutions":[{"id":"https://openalex.org/I76569877","display_name":"Southeast University","ror":"https://ror.org/04ct4d772","country_code":"CN","type":"education","lineage":["https://openalex.org/I76569877"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wenxuan Cheng","raw_affiliation_strings":["School of Automation, Southeast University, Nanjing, China","School of Automation, Southeast University, China"],"affiliations":[{"raw_affiliation_string":"School of Automation, Southeast University, Nanjing, China","institution_ids":["https://openalex.org/I76569877"]},{"raw_affiliation_string":"School of Automation, Southeast University, China","institution_ids":["https://openalex.org/I76569877"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059524537","display_name":"Jiangjiang Liu","orcid":"https://orcid.org/0000-0002-1341-2763"},"institutions":[{"id":"https://openalex.org/I98301712","display_name":"Baidu (China)","ror":"https://ror.org/03vs3wt56","country_code":"CN","type":"company","lineage":["https://openalex.org/I98301712"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jiang-Jiang Liu","raw_affiliation_strings":["Baidu Inc., Beijing, China","Baidu Inc., China"],"affiliations":[{"raw_affiliation_string":"Baidu Inc., Beijing, China","institution_ids":["https://openalex.org/I98301712"]},{"raw_affiliation_string":"Baidu Inc., China","institution_ids":["https://openalex.org/I98301712"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100604251","display_name":"Lingfeng Yang","orcid":"https://orcid.org/0000-0002-2725-8947"},"institutions":[{"id":"https://openalex.org/I36399199","display_name":"Nanjing University of Science and Technology","ror":"https://ror.org/00xp9wg62","country_code":"CN","type":"education","lineage":["https://openalex.org/I36399199"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Lingfeng Yang","raw_affiliation_strings":["Nanjing University of Science and Technology, Nanjing, China","Nanjing University of Science and Technology, China"],"affiliations":[{"raw_affiliation_string":"Nanjing University of Science and Technology, Nanjing, China","institution_ids":["https://openalex.org/I36399199"]},{"raw_affiliation_string":"Nanjing University of Science and Technology, China","institution_ids":["https://openalex.org/I36399199"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5025299678","display_name":"Zhenhua Feng","orcid":"https://orcid.org/0000-0002-4485-4249"},"institutions":[{"id":"https://openalex.org/I111599522","display_name":"Jiangnan University","ror":"https://ror.org/04mkzax54","country_code":"CN","type":"education","lineage":["https://openalex.org/I111599522"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhenhua Feng","raw_affiliation_strings":["JiangNan University, Wuxi, China","JiangNan University, China"],"affiliations":[{"raw_affiliation_string":"JiangNan University, Wuxi, China","institution_ids":["https://openalex.org/I111599522"]},{"raw_affiliation_string":"JiangNan University, China","institution_ids":["https://openalex.org/I111599522"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5074121246","display_name":"Wankou Yang","orcid":null},"institutions":[{"id":"https://openalex.org/I76569877","display_name":"Southeast University","ror":"https://ror.org/04ct4d772","country_code":"CN","type":"education","lineage":["https://openalex.org/I76569877"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wankou Yang","raw_affiliation_strings":["School of Automation, Southeast University, Nanjing, China","School of Automation, Southeast University, China"],"affiliations":[{"raw_affiliation_string":"School of Automation, Southeast University, Nanjing, China","institution_ids":["https://openalex.org/I76569877"]},{"raw_affiliation_string":"School of Automation, Southeast University, China","institution_ids":["https://openalex.org/I76569877"]}]},{"author_position":"last","author":{"id":null,"display_name":"Jingdong Wang","orcid":"https://orcid.org/0000-0002-4888-4445"},"institutions":[{"id":"https://openalex.org/I98301712","display_name":"Baidu (China)","ror":"https://ror.org/03vs3wt56","country_code":"CN","type":"company","lineage":["https://openalex.org/I98301712"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jingdong Wang","raw_affiliation_strings":["Baidu Inc., Beijing, China","Baidu Inc., China"],"affiliations":[{"raw_affiliation_string":"Baidu Inc., Beijing, China","institution_ids":["https://openalex.org/I98301712"]},{"raw_affiliation_string":"Baidu Inc., China","institution_ids":["https://openalex.org/I98301712"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I76569877"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.22540905,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"48","issue":"1","first_page":"448","last_page":"465"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9940000176429749,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9940000176429749,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9907000064849854,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.983299970626831,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/bounding-overwatch","display_name":"Bounding overwatch","score":0.6773999929428101},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.6072999835014343},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.600600004196167},{"id":"https://openalex.org/keywords/joint","display_name":"Joint (building)","score":0.5223000049591064},{"id":"https://openalex.org/keywords/visual-reasoning","display_name":"Visual reasoning","score":0.44029998779296875},{"id":"https://openalex.org/keywords/minimum-bounding-box","display_name":"Minimum bounding box","score":0.43209999799728394},{"id":"https://openalex.org/keywords/ground","display_name":"Ground","score":0.4221000075340271},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.3837999999523163}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8163999915122986},{"id":"https://openalex.org/C63584917","wikidata":"https://www.wikidata.org/wiki/Q333286","display_name":"Bounding overwatch","level":2,"score":0.6773999929428101},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6183000206947327},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.6072999835014343},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.600600004196167},{"id":"https://openalex.org/C18555067","wikidata":"https://www.wikidata.org/wiki/Q8375051","display_name":"Joint (building)","level":2,"score":0.5223000049591064},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.44029998779296875},{"id":"https://openalex.org/C147037132","wikidata":"https://www.wikidata.org/wiki/Q6865426","display_name":"Minimum bounding box","level":3,"score":0.43209999799728394},{"id":"https://openalex.org/C168993435","wikidata":"https://www.wikidata.org/wiki/Q6501125","display_name":"Ground","level":2,"score":0.4221000075340271},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.3837999999523163},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3799999952316284},{"id":"https://openalex.org/C12426560","wikidata":"https://www.wikidata.org/wiki/Q189569","display_name":"Basis (linear algebra)","level":2,"score":0.35670000314712524},{"id":"https://openalex.org/C178253425","wikidata":"https://www.wikidata.org/wiki/Q162668","display_name":"Visual perception","level":3,"score":0.313400000333786},{"id":"https://openalex.org/C2777055276","wikidata":"https://www.wikidata.org/wiki/Q7936580","display_name":"Visual approach","level":2,"score":0.2833999991416931},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.28279998898506165},{"id":"https://openalex.org/C22367795","wikidata":"https://www.wikidata.org/wiki/Q7625208","display_name":"Structured prediction","level":2,"score":0.27649998664855957},{"id":"https://openalex.org/C124504099","wikidata":"https://www.wikidata.org/wiki/Q56933","display_name":"Image segmentation","level":3,"score":0.274399995803833},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.2736000120639801},{"id":"https://openalex.org/C168820333","wikidata":"https://www.wikidata.org/wiki/Q448889","display_name":"Visual inspection","level":2,"score":0.26840001344680786},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.26750001311302185},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.259799987077713},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.2563999891281128}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1109/tpami.2025.3607387","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpami.2025.3607387","pdf_url":null,"source":{"id":"https://openalex.org/S199944782","display_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","issn_l":"0162-8828","issn":["0162-8828","1939-3539","2160-9292"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320439","host_organization_name":"IEEE Computer Society","host_organization_lineage":["https://openalex.org/P4310320439","https://openalex.org/P4310319808"],"host_organization_lineage_names":["IEEE Computer Society","Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","raw_type":"journal-article"},{"id":"pmid:40920536","is_oa":false,"landing_page_url":"https://pubmed.ncbi.nlm.nih.gov/40920536","pdf_url":null,"source":{"id":"https://openalex.org/S4306525036","display_name":"PubMed","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE transactions on pattern analysis and machine intelligence","raw_type":null},{"id":"pmh:oai:arXiv.org:2509.13747","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2509.13747","pdf_url":"https://arxiv.org/pdf/2509.13747","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2509.13747","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2509.13747","pdf_url":"https://arxiv.org/pdf/2509.13747","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G7077580569","display_name":null,"funder_award_id":"62436002","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G782967413","display_name":null,"funder_award_id":"62276061","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2961085424","https://openalex.org/W4306674287","https://openalex.org/W2772917594","https://openalex.org/W2036807459","https://openalex.org/W2058170566","https://openalex.org/W2755342338","https://openalex.org/W2166024367","https://openalex.org/W3116076068","https://openalex.org/W2229312674","https://openalex.org/W2951359407"],"abstract_inverted_index":{"Generalized":[0,5],"visual":[1,16,115,162],"grounding":[2,17,116],"tasks,":[3],"including":[4],"Referring":[6],"Expression":[7],"Comprehension":[8],"(GREC)":[9],"and":[10,22,63,70,93,103,130,136,154,195,227],"Segmentation":[11],"(GRES),":[12],"extend":[13],"the":[14,35,57,72,87,94,128,139,146,166,198,218],"classical":[15],"paradigm":[18],"by":[19],"accommodating":[20],"multi-target":[21],"non-target":[23],"scenarios.":[24],"Specifically,":[25],"GREC":[26,62,153],"focuses":[27],"on":[28,204],"accurately":[29],"identifying":[30],"all":[31],"referential":[32],"objects":[33],"at":[34,233],"coarse":[36],"bounding":[37],"box":[38],"level,":[39],"while":[40,156],"GRES":[41,64,80,155],"aims":[42],"for":[43,184,197],"achieve":[44],"fine-grained":[45],"pixel-level":[46],"perception.":[47],"However,":[48],"existing":[49,219],"approaches":[50],"typically":[51],"treat":[52,79],"these":[53,107],"tasks":[54,209],"independently,":[55],"overlooking":[56],"benefits":[58],"of":[59,90,96,133,141,192],"jointly":[60],"training":[61],"to":[65,126,149],"ensure":[66],"consistent":[67,98,190],"multi-granularity":[68],"predictions":[69,99,132,191],"streamline":[71],"overall":[73],"process.":[74],"Moreover,":[75],"current":[76],"methods":[77,220],"often":[78],"as":[81,180],"a":[82,112,173],"semantic":[83],"segmentation":[84],"task,":[85],"neglecting":[86],"crucial":[88],"role":[89],"instance-aware":[91,120,158],"capabilities":[92,159],"necessity":[95],"ensuring":[97],"between":[100],"instance-level":[101,134],"boxes":[102,135],"masks.":[104,137],"To":[105,138,164],"address":[106],"limitations,":[108],"we":[109,168],"propose":[110],"InstanceVG,":[111],"multi-task":[113],"generalized":[114,161],"framework":[117,148],"equipped":[118],"with":[119],"capabilities,":[121],"which":[122,177],"leverages":[123],"instance":[124,171],"queries":[125],"unify":[127],"joint":[129],"consistency":[131],"best":[140],"our":[142],"knowledge,":[143],"InstanceVG":[144,212],"is":[145],"first":[147],"simultaneously":[150],"tackle":[151],"both":[152],"incorporating":[157],"into":[160],"grounding.":[163],"instantiate":[165],"framework,":[167],"assign":[169],"each":[170],"query":[172],"prior":[174],"reference":[175],"point,":[176],"also":[178],"serves":[179],"an":[181],"additional":[182],"basis":[183],"target":[185],"matching.":[186],"This":[187],"design":[188],"facilitates":[189],"points,":[193],"boxes,":[194],"masks":[196],"same":[199],"instance.":[200],"Extensive":[201],"experiments":[202],"obtained":[203],"ten":[205],"datasets":[206],"across":[207],"four":[208],"demonstrate":[210],"that":[211],"achieves":[213],"state-of-the-art":[214],"performance,":[215],"significantly":[216],"surpassing":[217],"in":[221],"various":[222],"evaluation":[223],"metrics.":[224],"The":[225],"code":[226],"model":[228],"will":[229],"be":[230],"publicly":[231],"available":[232],"https://github.com/Dmmm1997/InstanceVG.":[234]},"counts_by_year":[],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-10-10T00:00:00"}
