{"id":"https://openalex.org/W4409494545","doi":"https://doi.org/10.1109/tpami.2025.3561366","title":"End-to-End Open-Vocabulary Video Visual Relationship Detection Using Multi-Modal Prompting","display_name":"End-to-End Open-Vocabulary Video Visual Relationship Detection Using Multi-Modal Prompting","publication_year":2025,"publication_date":"2025-04-16","ids":{"openalex":"https://openalex.org/W4409494545","doi":"https://doi.org/10.1109/tpami.2025.3561366","pmid":"https://pubmed.ncbi.nlm.nih.gov/40238600"},"language":"en","primary_location":{"id":"doi:10.1109/tpami.2025.3561366","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpami.2025.3561366","pdf_url":null,"source":{"id":"https://openalex.org/S199944782","display_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","issn_l":"0162-8828","issn":["0162-8828","1939-3539","2160-9292"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320439","host_organization_name":"IEEE Computer Society","host_organization_lineage":["https://openalex.org/P4310320439","https://openalex.org/P4310319808"],"host_organization_lineage_names":["IEEE Computer Society","Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","pubmed"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101867694","display_name":"Yongqi Wang","orcid":"https://orcid.org/0000-0001-7889-2262"},"institutions":[{"id":"https://openalex.org/I125839683","display_name":"Beijing Institute of Technology","ror":"https://ror.org/01skt4w74","country_code":"CN","type":"education","lineage":["https://openalex.org/I125839683","https://openalex.org/I890469752"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Yongqi Wang","raw_affiliation_strings":["Beijing Key Laboratory of Intelligent Information Technology, School of Computer Science and Technology, Beijing Institute of Technology, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Beijing Key Laboratory of Intelligent Information Technology, School of Computer Science and Technology, Beijing Institute of Technology, Beijing, China","institution_ids":["https://openalex.org/I125839683"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5011070646","display_name":"Xinxiao Wu","orcid":"https://orcid.org/0000-0002-2056-6947"},"institutions":[{"id":"https://openalex.org/I125839683","display_name":"Beijing Institute of Technology","ror":"https://ror.org/01skt4w74","country_code":"CN","type":"education","lineage":["https://openalex.org/I125839683","https://openalex.org/I890469752"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xinxiao Wu","raw_affiliation_strings":["Beijing Key Laboratory of Intelligent Information Technology, School of Computer Science and Technology, Beijing Institute of Technology, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Beijing Key Laboratory of Intelligent Information Technology, School of Computer Science and Technology, Beijing Institute of Technology, Beijing, China","institution_ids":["https://openalex.org/I125839683"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5008524479","display_name":"Shuo Yang","orcid":"https://orcid.org/0000-0003-2868-7070"},"institutions":[{"id":"https://openalex.org/I4388482657","display_name":"Shenzhen MSU-BIT University","ror":"https://ror.org/02q963474","country_code":null,"type":"education","lineage":["https://openalex.org/I4388482657"]},{"id":"https://openalex.org/I180726961","display_name":"Shenzhen University","ror":"https://ror.org/01vy4gh70","country_code":"CN","type":"education","lineage":["https://openalex.org/I180726961"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shuo Yang","raw_affiliation_strings":["Guangdong Provincial Laboratory of Machine Perception and Intelligent Computing, Shenzhen MSU-BIT University, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"Guangdong Provincial Laboratory of Machine Perception and Intelligent Computing, Shenzhen MSU-BIT University, Shenzhen, China","institution_ids":["https://openalex.org/I180726961","https://openalex.org/I4388482657"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5055469774","display_name":"Jiebo Luo","orcid":"https://orcid.org/0000-0002-4516-9729"},"institutions":[{"id":"https://openalex.org/I5388228","display_name":"University of Rochester","ror":"https://ror.org/022kthw22","country_code":"US","type":"education","lineage":["https://openalex.org/I5388228"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jiebo Luo","raw_affiliation_strings":["Department of Computer Science, University of Rochester, Rochester, NY, USA","Department of Computer Science, University of Rochester, Rochester, USA"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science, University of Rochester, Rochester, NY, USA","institution_ids":["https://openalex.org/I5388228"]},{"raw_affiliation_string":"Department of Computer Science, University of Rochester, Rochester, USA","institution_ids":["https://openalex.org/I5388228"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5101867694"],"corresponding_institution_ids":["https://openalex.org/I125839683"],"apc_list":null,"apc_paid":null,"fwci":1.319,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.79379182,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":"47","issue":"8","first_page":"6599","last_page":"6615"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9987000226974487,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9987000226974487,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.996399998664856,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9958999752998352,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7338858246803284},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.6658955812454224},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.6538228988647461},{"id":"https://openalex.org/keywords/vocabulary","display_name":"Vocabulary","score":0.6410026550292969},{"id":"https://openalex.org/keywords/end-to-end-principle","display_name":"End-to-end principle","score":0.5656237602233887},{"id":"https://openalex.org/keywords/modal","display_name":"Modal","score":0.5107300281524658},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.3925146460533142},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.36383646726608276}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7338858246803284},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.6658955812454224},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6538228988647461},{"id":"https://openalex.org/C2777601683","wikidata":"https://www.wikidata.org/wiki/Q6499736","display_name":"Vocabulary","level":2,"score":0.6410026550292969},{"id":"https://openalex.org/C74296488","wikidata":"https://www.wikidata.org/wiki/Q2527392","display_name":"End-to-end principle","level":2,"score":0.5656237602233887},{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.5107300281524658},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.3925146460533142},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.36383646726608276},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0},{"id":"https://openalex.org/C188027245","wikidata":"https://www.wikidata.org/wiki/Q750446","display_name":"Polymer chemistry","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/tpami.2025.3561366","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpami.2025.3561366","pdf_url":null,"source":{"id":"https://openalex.org/S199944782","display_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","issn_l":"0162-8828","issn":["0162-8828","1939-3539","2160-9292"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320439","host_organization_name":"IEEE Computer Society","host_organization_lineage":["https://openalex.org/P4310320439","https://openalex.org/P4310319808"],"host_organization_lineage_names":["IEEE Computer Society","Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","raw_type":"journal-article"},{"id":"pmid:40238600","is_oa":false,"landing_page_url":"https://pubmed.ncbi.nlm.nih.gov/40238600","pdf_url":null,"source":{"id":"https://openalex.org/S4306525036","display_name":"PubMed","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE transactions on pattern analysis and machine intelligence","raw_type":null}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","score":0.4699999988079071,"id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":65,"referenced_works":["https://openalex.org/W1861492603","https://openalex.org/W2603203130","https://openalex.org/W2765137706","https://openalex.org/W2962766617","https://openalex.org/W2963351448","https://openalex.org/W2963536419","https://openalex.org/W2963691377","https://openalex.org/W2981385984","https://openalex.org/W2981665364","https://openalex.org/W2982515679","https://openalex.org/W3035503132","https://openalex.org/W3096609285","https://openalex.org/W3105232955","https://openalex.org/W3186621246","https://openalex.org/W3193302808","https://openalex.org/W3198377975","https://openalex.org/W3203711169","https://openalex.org/W3206633059","https://openalex.org/W3214803981","https://openalex.org/W4225868495","https://openalex.org/W4310557340","https://openalex.org/W4312310776","https://openalex.org/W4312317310","https://openalex.org/W4312465143","https://openalex.org/W4312574495","https://openalex.org/W4312614039","https://openalex.org/W4312651322","https://openalex.org/W4312873085","https://openalex.org/W4312890493","https://openalex.org/W4313186260","https://openalex.org/W4379619455","https://openalex.org/W4386047824","https://openalex.org/W4386071547","https://openalex.org/W4386076271","https://openalex.org/W4386076331","https://openalex.org/W4386076397","https://openalex.org/W4390872535","https://openalex.org/W4390873628","https://openalex.org/W4390873645","https://openalex.org/W4391128821","https://openalex.org/W4393148276","https://openalex.org/W4393154504","https://openalex.org/W4393159487","https://openalex.org/W4401367304","https://openalex.org/W4402727824","https://openalex.org/W6683161245","https://openalex.org/W6755207826","https://openalex.org/W6757817989","https://openalex.org/W6784333009","https://openalex.org/W6790019176","https://openalex.org/W6791353385","https://openalex.org/W6802517928","https://openalex.org/W6810334672","https://openalex.org/W6811013733","https://openalex.org/W6839546733","https://openalex.org/W6845484831","https://openalex.org/W6846867676","https://openalex.org/W6849177959","https://openalex.org/W6849600822","https://openalex.org/W6849829975","https://openalex.org/W6853406205","https://openalex.org/W6864544085","https://openalex.org/W6867851394","https://openalex.org/W6872866433","https://openalex.org/W6874673442"],"related_works":["https://openalex.org/W2151749779","https://openalex.org/W3179968364","https://openalex.org/W1999612375","https://openalex.org/W2938107654","https://openalex.org/W2349784553","https://openalex.org/W3022596247","https://openalex.org/W3196421258","https://openalex.org/W2601444686","https://openalex.org/W4307058054","https://openalex.org/W4404782863"],"abstract_inverted_index":{"Open-vocabulary":[0],"video":[1,8],"visual":[2,9,115,200,203],"relationship":[3,10,88,132,138,150,170,189],"detection":[4,11,86],"aims":[5],"to":[6,37,51,66,68,73,82,154,158,180,188,232,238],"expand":[7],"beyond":[12],"annotated":[13],"categories":[14],"by":[15],"detecting":[16],"unseen":[17,23],"relationships":[18,161],"between":[19,162],"both":[20],"seen":[21],"and":[22,41,87,126,146,205,219],"objects":[24,163],"in":[25],"videos.":[26],"Existing":[27],"methods":[28],"usually":[29],"use":[30],"trajectory":[31,61,85,103,128,135],"detectors":[32,62],"trained":[33],"on":[34,58,214],"closed":[35],"datasets":[36],"detect":[38],"object":[39,70,84,124],"trajectories,":[40],"then":[42],"feed":[43],"these":[44],"trajectories":[45],"into":[46,90,142],"large-scale":[47],"pre-trained":[48,60],"vision-language":[49],"models":[50],"achieve":[52],"open-vocabulary":[53,93,102,123,169],"classification.":[54],"Such":[55],"heavy":[56],"dependence":[57],"the":[59,114,143,156,160,174,222],"limits":[63],"their":[64],"ability":[65],"generalize":[67],"novel":[69,182],"categories,":[71],"leading":[72],"performance":[74],"degradation.":[75],"To":[76,130,184],"address":[77],"this":[78,96],"challenge,":[79],"we":[80,98,166,191],"propose":[81,99,167],"unify":[83],"classification":[89],"an":[91,148,168],"end-to-end":[92],"framework.":[94,226],"Under":[95],"framework,":[97],"a":[100,109,127,137,193,233],"relationship-aware":[101],"detector.":[104],"It":[105],"primarily":[106],"consists":[107],"of":[108,117,178,224],"query-based":[110],"Transformer":[111,144],"decoder,":[112,145],"where":[113],"encoder":[116],"CLIP":[118,179,186],"is":[119,140,152,229],"distilled":[120],"for":[121,202,209],"frame-wise":[122],"detection,":[125,136],"associator.":[129],"exploit":[131],"context":[133],"during":[134],"query":[139],"embedded":[141],"accordingly,":[147],"auxiliary":[149],"loss":[151],"designed":[153],"enable":[155],"decoder":[157],"perceive":[159],"explicitly.":[164],"Moreover,":[165],"classifier":[171],"that":[172,197],"leverages":[173],"rich":[175],"semantic":[176],"knowledge":[177],"discover":[181],"relationships.":[183],"adapt":[185],"well":[187],"classification,":[190],"design":[192],"multi-modal":[194],"prompting":[195,201,208],"method":[196],"employs":[198],"spatio-temporal":[199],"representation":[204],"vision-guided":[206],"language":[207,210],"input.":[211],"Extensive":[212],"experiments":[213],"two":[215],"public":[216],"datasets,":[217],"VidVRD":[218],"VidOR,":[220],"demonstrate":[221,240],"effectiveness":[223],"our":[225],"Our":[227],"framework":[228],"also":[230],"applied":[231],"more":[234],"difficult":[235],"cross-dataset":[236],"scenario":[237],"further":[239],"its":[241],"generalization":[242],"ability.":[243]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
