{"id":"https://openalex.org/W7084957759","doi":"https://doi.org/10.1109/tmm.2025.3618539","title":"P$^{2}$M: Progressive Perspective Mining for Referring Video Object Segmentation","display_name":"P$^{2}$M: Progressive Perspective Mining for Referring Video Object Segmentation","publication_year":2025,"publication_date":"2025-01-01","ids":{"openalex":"https://openalex.org/W7084957759","doi":"https://doi.org/10.1109/tmm.2025.3618539"},"language":"en","primary_location":{"id":"doi:10.1109/tmm.2025.3618539","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tmm.2025.3618539","pdf_url":null,"source":{"id":"https://openalex.org/S137030581","display_name":"IEEE Transactions on Multimedia","issn_l":"1520-9210","issn":["1520-9210","1941-0077"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Multimedia","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Yihan Wang","orcid":"https://orcid.org/0000-0002-5295-1508"},"institutions":[{"id":"https://openalex.org/I27357992","display_name":"Dalian University of Technology","ror":"https://ror.org/023hj5876","country_code":"CN","type":"education","lineage":["https://openalex.org/I27357992"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Yihan Wang","raw_affiliation_strings":["School of Computer Science and Technology, Dalian University of Technology, Dalian, China"],"raw_orcid":"https://orcid.org/0000-0002-5295-1508","affiliations":[{"raw_affiliation_string":"School of Computer Science and Technology, Dalian University of Technology, Dalian, China","institution_ids":["https://openalex.org/I27357992"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Baoli Sun","orcid":"https://orcid.org/0000-0002-2861-4288"},"institutions":[{"id":"https://openalex.org/I27357992","display_name":"Dalian University of Technology","ror":"https://ror.org/023hj5876","country_code":"CN","type":"education","lineage":["https://openalex.org/I27357992"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Baoli Sun","raw_affiliation_strings":["School of Computer Science and Technology, Dalian University of Technology, Dalian, China"],"raw_orcid":"https://orcid.org/0000-0002-2861-4288","affiliations":[{"raw_affiliation_string":"School of Computer Science and Technology, Dalian University of Technology, Dalian, China","institution_ids":["https://openalex.org/I27357992"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Xinzhu Ma","orcid":"https://orcid.org/0000-0003-0504-0186"},"institutions":[{"id":"https://openalex.org/I82880672","display_name":"Beihang University","ror":"https://ror.org/00wk2mp56","country_code":"CN","type":"education","lineage":["https://openalex.org/I82880672"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xinzhu Ma","raw_affiliation_strings":["Beihang University, Beijing, China"],"raw_orcid":"https://orcid.org/0000-0003-0504-0186","affiliations":[{"raw_affiliation_string":"Beihang University, Beijing, China","institution_ids":["https://openalex.org/I82880672"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Hongwei Ge","orcid":"https://orcid.org/0000-0002-8937-1515"},"institutions":[{"id":"https://openalex.org/I27357992","display_name":"Dalian University of Technology","ror":"https://ror.org/023hj5876","country_code":"CN","type":"education","lineage":["https://openalex.org/I27357992"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hongwei Ge","raw_affiliation_strings":["School of Computer Science and Technology, Dalian University of Technology, Dalian, China"],"raw_orcid":"https://orcid.org/0000-0002-8937-1515","affiliations":[{"raw_affiliation_string":"School of Computer Science and Technology, Dalian University of Technology, Dalian, China","institution_ids":["https://openalex.org/I27357992"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Jiulin Fan","orcid":null},"institutions":[{"id":"https://openalex.org/I4210140589","display_name":"China Huadian Corporation (China)","ror":"https://ror.org/03wj9h986","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210140589"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jiulin Fan","raw_affiliation_strings":["Huadian Coal Group Digital Intelligence Technology Company, Huadian Coal Industry Group Company Ltd., Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Huadian Coal Group Digital Intelligence Technology Company, Huadian Coal Industry Group Company Ltd., Beijing, China","institution_ids":["https://openalex.org/I4210140589"]}]},{"author_position":"last","author":{"id":null,"display_name":"Haojie Li","orcid":"https://orcid.org/0000-0003-3882-2205"},"institutions":[{"id":"https://openalex.org/I80143920","display_name":"Shandong University of Science and Technology","ror":"https://ror.org/04gtjhw98","country_code":"CN","type":"education","lineage":["https://openalex.org/I80143920"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Haojie Li","raw_affiliation_strings":["Shandong Science and Technology, Qingdao, China"],"raw_orcid":"https://orcid.org/0000-0003-3882-2205","affiliations":[{"raw_affiliation_string":"Shandong Science and Technology, Qingdao, China","institution_ids":["https://openalex.org/I80143920"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I27357992"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.63488632,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"27","issue":null,"first_page":"9749","last_page":"9760"},"is_retracted":false,"is_paratext":false,"is_xpac":true,"primary_topic":{"id":"https://openalex.org/T10120","display_name":"Bacterial Genetics and Biotechnology","score":0.07419999688863754,"subfield":{"id":"https://openalex.org/subfields/1311","display_name":"Genetics"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},"topics":[{"id":"https://openalex.org/T10120","display_name":"Bacterial Genetics and Biotechnology","score":0.07419999688863754,"subfield":{"id":"https://openalex.org/subfields/1311","display_name":"Genetics"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T12167","display_name":"Bacterial Identification and Susceptibility Testing","score":0.05770000070333481,"subfield":{"id":"https://openalex.org/subfields/1308","display_name":"Clinical Biochemistry"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T11288","display_name":"Clostridium difficile and Clostridium perfringens research","score":0.0560000017285347,"subfield":{"id":"https://openalex.org/subfields/2725","display_name":"Infectious Diseases"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.6693000197410583},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.6625000238418579},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.6482999920845032},{"id":"https://openalex.org/keywords/perspective","display_name":"Perspective (graphical)","score":0.6434999704360962},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.5985000133514404},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.5253999829292297},{"id":"https://openalex.org/keywords/video-tracking","display_name":"Video tracking","score":0.48730000853538513},{"id":"https://openalex.org/keywords/sentence","display_name":"Sentence","score":0.4650000035762787},{"id":"https://openalex.org/keywords/dimension","display_name":"Dimension (graph theory)","score":0.43650001287460327}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8822000026702881},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.6693000197410583},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.6625000238418579},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.6482999920845032},{"id":"https://openalex.org/C12713177","wikidata":"https://www.wikidata.org/wiki/Q1900281","display_name":"Perspective (graphical)","level":2,"score":0.6434999704360962},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.630299985408783},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.5985000133514404},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.5253999829292297},{"id":"https://openalex.org/C202474056","wikidata":"https://www.wikidata.org/wiki/Q1931635","display_name":"Video tracking","level":3,"score":0.48730000853538513},{"id":"https://openalex.org/C2777530160","wikidata":"https://www.wikidata.org/wiki/Q41796","display_name":"Sentence","level":2,"score":0.4650000035762787},{"id":"https://openalex.org/C33676613","wikidata":"https://www.wikidata.org/wiki/Q13415176","display_name":"Dimension (graph theory)","level":2,"score":0.43650001287460327},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.4156000018119812},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4016999900341034},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.367000013589859},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.3580999970436096},{"id":"https://openalex.org/C124504099","wikidata":"https://www.wikidata.org/wiki/Q56933","display_name":"Image segmentation","level":3,"score":0.35350000858306885},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.350600004196167},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.3450999855995178},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.34040001034736633},{"id":"https://openalex.org/C2778493491","wikidata":"https://www.wikidata.org/wiki/Q7449072","display_name":"Semantic matching","level":3,"score":0.329800009727478},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.3228999972343445},{"id":"https://openalex.org/C774472","wikidata":"https://www.wikidata.org/wiki/Q6760393","display_name":"Margin (machine learning)","level":2,"score":0.3142000138759613},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.30410000681877136},{"id":"https://openalex.org/C64876066","wikidata":"https://www.wikidata.org/wiki/Q5141226","display_name":"Cognitive neuroscience of visual object recognition","level":3,"score":0.29499998688697815},{"id":"https://openalex.org/C98501671","wikidata":"https://www.wikidata.org/wiki/Q1948408","display_name":"Text segmentation","level":3,"score":0.2784999907016754},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.27799999713897705},{"id":"https://openalex.org/C173414695","wikidata":"https://www.wikidata.org/wiki/Q5510276","display_name":"Fusion mechanism","level":4,"score":0.2761000096797943},{"id":"https://openalex.org/C2778572836","wikidata":"https://www.wikidata.org/wiki/Q380933","display_name":"Space (punctuation)","level":2,"score":0.26899999380111694},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.2689000070095062},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.26179999113082886},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.25450000166893005},{"id":"https://openalex.org/C2776502983","wikidata":"https://www.wikidata.org/wiki/Q690182","display_name":"Contrast (vision)","level":2,"score":0.25200000405311584}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tmm.2025.3618539","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tmm.2025.3618539","pdf_url":null,"source":{"id":"https://openalex.org/S137030581","display_name":"IEEE Transactions on Multimedia","issn_l":"1520-9210","issn":["1520-9210","1941-0077"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Multimedia","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G4378652050","display_name":null,"funder_award_id":"61976034","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G4386212477","display_name":null,"funder_award_id":"2022-YGJC20","funder_id":"https://openalex.org/F4320323086","funder_display_name":"Natural Science Foundation of Liaoning Province"},{"id":"https://openalex.org/G6064101977","display_name":null,"funder_award_id":"2022JJ12GX013","funder_id":"https://openalex.org/F4320336584","funder_display_name":"Dalian Science and Technology Innovation Fund"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320323086","display_name":"Natural Science Foundation of Liaoning Province","ror":null},{"id":"https://openalex.org/F4320336584","display_name":"Dalian Science and Technology Innovation Fund","ror":null}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":53,"referenced_works":["https://openalex.org/W1905722737","https://openalex.org/W2034014085","https://openalex.org/W2222512263","https://openalex.org/W2251512949","https://openalex.org/W2302548814","https://openalex.org/W2560474170","https://openalex.org/W2916797271","https://openalex.org/W2948429466","https://openalex.org/W2962942822","https://openalex.org/W2963253279","https://openalex.org/W2963354481","https://openalex.org/W2963626623","https://openalex.org/W2964284374","https://openalex.org/W2980088508","https://openalex.org/W2983693499","https://openalex.org/W2990205821","https://openalex.org/W2997487053","https://openalex.org/W3034578524","https://openalex.org/W3096609285","https://openalex.org/W3104844437","https://openalex.org/W3108432914","https://openalex.org/W3108748824","https://openalex.org/W3109908659","https://openalex.org/W3122784054","https://openalex.org/W3170511209","https://openalex.org/W3177892185","https://openalex.org/W3178075329","https://openalex.org/W3186378100","https://openalex.org/W3202858245","https://openalex.org/W3215899623","https://openalex.org/W4200631575","https://openalex.org/W4224988000","https://openalex.org/W4226024706","https://openalex.org/W4283796148","https://openalex.org/W4285200050","https://openalex.org/W4304084115","https://openalex.org/W4307504011","https://openalex.org/W4312560592","https://openalex.org/W4312690830","https://openalex.org/W4312960790","https://openalex.org/W4312981390","https://openalex.org/W4386076148","https://openalex.org/W4386108423","https://openalex.org/W4389371459","https://openalex.org/W4390872515","https://openalex.org/W4390872868","https://openalex.org/W4390873170","https://openalex.org/W4390873204","https://openalex.org/W4390873614","https://openalex.org/W4390874496","https://openalex.org/W4391164254","https://openalex.org/W4402727770","https://openalex.org/W4402754173"],"related_works":[],"abstract_inverted_index":{"Referring":[0],"video":[1,17,35,143],"object":[2,9,59,157,225],"segmentation":[3,99,160],"(RVOS)":[4],"aims":[5],"to":[6,12,55,89,134,166],"segment":[7],"the":[8,39,52,184,189,198,201,207,211,215,230,236,240,248,260],"instances":[10],"referred":[11,101],"by":[13,161],"linguistic":[14,174],"expressions":[15],"in":[16,58,64],"frames.":[18],"The":[19],"prevailing":[20],"approaches":[21],"mainly":[22],"rely":[23],"on":[24,153,247],"simplistic":[25],"fusion":[26,49],"strategies,":[27],"wherein":[28],"textual":[29,42,136,241],"features":[30,36,127,144,172],"are":[31],"directly":[32],"interacted":[33],"with":[34,142,173],"without":[37],"considering":[38],"impact":[40],"of":[41,100,110,200,239,262],"semantics":[43],"at":[44],"different":[45],"levels.":[46],"These":[47],"coarse-grained":[48],"strategies":[50],"hinder":[51],"model's":[53],"ability":[54],"perceive":[56],"changes":[57],"appearance":[60],"and":[61,95,118,131,168,194,227,234],"movement,":[62],"resulting":[63],"performance":[65],"degradation.":[66],"To":[67],"mitigate":[68],"this":[69],"issue,":[70],"we":[71,177],"propose":[72],"a":[73,86,139,221],"Progressive":[74,114],"Perspective":[75],"Mining":[76],"(P":[77],"<inline-formula":[78,104,264],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[79,105,265],"xmlns:xlink=\"http://www.w3.org/1999/xlink\"><tex-math":[80,106,266],"notation=\"LaTeX\">$^{2}$</tex-math></inline-formula>":[81,107,267],"M)":[82],"framework,":[83],"which":[84],"leverages":[85,125],"coarse-to-fine":[87],"perspective":[88],"mine":[90,135,167],"latent":[91],"information":[92],"from":[93,204],"text":[94],"video,":[96],"enabling":[97,138],"precise":[98],"objects.":[102],"P":[103,263],"M":[108],"consists":[109],"two":[111,179],"key":[112],"components:":[113],"Vision-Language":[115,119,216],"Interaction":[116],"(PVLI)":[117],"Synergistic":[120],"Fusion":[121],"(VLSF).":[122],"Specifically,":[123],"PVLI":[124],"language":[126],"across":[128,255],"subject,":[129],"word,":[130],"sentence":[132],"levels":[133],"information,":[137],"progressive":[140],"interaction":[141],"within":[145],"an":[146],"integrated":[147],"representational":[148],"space.":[149],"Concurrently,":[150],"VLSF":[151],"focuses":[152],"generating":[154],"semantically":[155],"rich":[156],"queries":[158,191,199,226],"for":[159],"employing":[162],"slot":[163],"attention":[164],"mechanisms":[165],"integrate":[169],"relevant":[170],"visual":[171],"semantics.":[175],"Furthermore,":[176],"introduce":[178],"query":[180],"optimization":[181],"losses:":[182],"(1)":[183],"Matching":[185],"Optimization":[186],"Loss":[187,219],"constrains":[188],"best":[190],"between":[192,224],"frame-level":[193],"video-level,":[195],"effectively":[196],"preventing":[197],"tracking":[202],"target":[203],"drifting":[205],"along":[206],"temporal":[208],"dimension":[209],"during":[210],"inference":[212],"phase;":[213],"(2)":[214],"Semantic":[217],"Alignment":[218],"performs":[220],"word-by-word":[222],"matching":[223],"expression,":[228],"aligning":[229],"multi-modal":[231],"joint":[232],"space":[233],"enhancing":[235],"framework's":[237],"understanding":[238],"description.":[242],"We":[243],"conducted":[244],"various":[245],"experiments":[246],"RVOS":[249],"task,":[250],"achieving":[251],"new":[252],"state-of-the-art":[253],"results":[254],"all":[256],"benchmarks,":[257],"thereby":[258],"demonstrating":[259],"effectiveness":[261],"M.":[268]},"counts_by_year":[],"updated_date":"2025-12-19T00:32:22.182498","created_date":"2025-10-10T00:00:00"}
