{"id":"https://openalex.org/W7138148420","doi":"https://doi.org/10.1609/aaai.v40i8.37551","title":"TubeRMC: Tube-conditioned Reconstruction with Mutual Constraints for Weakly-supervised Spatio-Temporal Video Grounding","display_name":"TubeRMC: Tube-conditioned Reconstruction with Mutual Constraints for Weakly-supervised Spatio-Temporal Video Grounding","publication_year":2026,"publication_date":"2026-03-14","ids":{"openalex":"https://openalex.org/W7138148420","doi":"https://doi.org/10.1609/aaai.v40i8.37551"},"language":null,"primary_location":{"id":"doi:10.1609/aaai.v40i8.37551","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i8.37551","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://doi.org/10.1609/aaai.v40i8.37551","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5120909329","display_name":"Jinxuan Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Jinxuan Li","raw_affiliation_strings":["SUN YAT-SEN UNIVERSITY"],"affiliations":[{"raw_affiliation_string":"SUN YAT-SEN UNIVERSITY","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129751847","display_name":"Yi Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yi Zhang","raw_affiliation_strings":["SUN YAT-SEN UNIVERSITY"],"affiliations":[{"raw_affiliation_string":"SUN YAT-SEN UNIVERSITY","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129728358","display_name":"Jian-Fang Hu","orcid":null},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jian-Fang Hu","raw_affiliation_strings":["SUN YAT-SEN UNIVERSITY\nGuangdong Province Key Laboratory of Information Security Technology, China\nKey Laboratory of Machine Intelligence and Advanced Computing, Ministry of Education, China"],"affiliations":[{"raw_affiliation_string":"SUN YAT-SEN UNIVERSITY\nGuangdong Province Key Laboratory of Information Security Technology, China\nKey Laboratory of Machine Intelligence and Advanced Computing, Ministry of Education, China","institution_ids":["https://openalex.org/I157773358"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5091202600","display_name":"Chaolei Tan","orcid":null},"institutions":[{"id":"https://openalex.org/I200769079","display_name":"Hong Kong University of Science and Technology","ror":"https://ror.org/00q4vv597","country_code":"HK","type":"education","lineage":["https://openalex.org/I200769079"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Chaolei Tan","raw_affiliation_strings":["The Hong Kong University of Science and Technology"],"affiliations":[{"raw_affiliation_string":"The Hong Kong University of Science and Technology","institution_ids":["https://openalex.org/I200769079"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129742852","display_name":"Tianming Liang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tianming Liang","raw_affiliation_strings":["SUN YAT-SEN UNIVERSITY"],"affiliations":[{"raw_affiliation_string":"SUN YAT-SEN UNIVERSITY","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5129695020","display_name":"Beihao Xia","orcid":null},"institutions":[{"id":"https://openalex.org/I47720641","display_name":"Huazhong University of Science and Technology","ror":"https://ror.org/00p991c53","country_code":"CN","type":"education","lineage":["https://openalex.org/I47720641"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Beihao Xia","raw_affiliation_strings":["Huazhong University of Science and Technology"],"affiliations":[{"raw_affiliation_string":"Huazhong University of Science and Technology","institution_ids":["https://openalex.org/I47720641"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5120909329"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.46567164,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"40","issue":"8","first_page":"6253","last_page":"6261"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9882000088691711,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9882000088691711,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.0034000000450760126,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.0010999999940395355,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/identification","display_name":"Identification (biology)","score":0.6266999840736389},{"id":"https://openalex.org/keywords/bounding-overwatch","display_name":"Bounding overwatch","score":0.6193000078201294},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5864999890327454},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.5554999709129333},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.524399995803833},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.4097000062465668},{"id":"https://openalex.org/keywords/ground","display_name":"Ground","score":0.38839998841285706},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.3806000053882599}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7861999869346619},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6301000118255615},{"id":"https://openalex.org/C116834253","wikidata":"https://www.wikidata.org/wiki/Q2039217","display_name":"Identification (biology)","level":2,"score":0.6266999840736389},{"id":"https://openalex.org/C63584917","wikidata":"https://www.wikidata.org/wiki/Q333286","display_name":"Bounding overwatch","level":2,"score":0.6193000078201294},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5864999890327454},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.5554999709129333},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.524399995803833},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.511900007724762},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.4097000062465668},{"id":"https://openalex.org/C168993435","wikidata":"https://www.wikidata.org/wiki/Q6501125","display_name":"Ground","level":2,"score":0.38839998841285706},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.3806000053882599},{"id":"https://openalex.org/C147037132","wikidata":"https://www.wikidata.org/wiki/Q6865426","display_name":"Minimum bounding box","level":3,"score":0.37700000405311584},{"id":"https://openalex.org/C152139883","wikidata":"https://www.wikidata.org/wiki/Q252973","display_name":"Mutual information","level":2,"score":0.361299991607666},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.33340001106262207},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.2948000133037567},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.29179999232292175},{"id":"https://openalex.org/C141379421","wikidata":"https://www.wikidata.org/wiki/Q6094427","display_name":"Iterative reconstruction","level":2,"score":0.29120001196861267},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2833999991416931},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.2736000120639801},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.26759999990463257},{"id":"https://openalex.org/C61797465","wikidata":"https://www.wikidata.org/wiki/Q1188986","display_name":"Term (time)","level":2,"score":0.25839999318122864},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.257099986076355}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1609/aaai.v40i8.37551","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i8.37551","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1609/aaai.v40i8.37551","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i8.37551","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Spatio-Temporal":[0],"Video":[1],"Grounding":[2],"(STVG)":[3],"aims":[4],"to":[5,12,43,127,146,164],"localize":[6],"a":[7,13,23,59,87,138],"spatio-temporal":[8,113,125,142],"tube":[9],"that":[10,95,184],"corresponds":[11],"given":[14],"language":[15],"query":[16],"in":[17,41,73,151],"an":[18],"untrimmed":[19],"video.":[20],"This":[21],"is":[22,135],"challenging":[24],"task":[25],"since":[26],"it":[27],"involves":[28],"complex":[29],"vision-language":[30],"understanding":[31],"and":[32,77,105,124,161,179,192],"spatiotemporal":[33],"reasoning.":[34],"Recent":[35],"works":[36],"have":[37],"explored":[38],"weakly-supervised":[39],"setting":[40],"STVG":[42],"eliminate":[44],"reliance":[45],"on":[46,174],"fine-grained":[47],"annotations":[48],"like":[49],"bounding":[50],"boxes":[51],"or":[52],"temporal":[53,162],"stamps.":[54],"However,":[55],"they":[56],"typically":[57],"follow":[58],"simple":[60],"late-fusion":[61],"manner,":[62],"which":[63],"generates":[64,96],"tubes":[65,99,143],"independent":[66],"of":[67],"the":[68,148,152],"text":[69],"description,":[70],"often":[71],"resulting":[72],"failed":[74],"target":[75,79,189],"identification":[76,190],"inconsistent":[78,193],"tracking.":[80,194],"To":[81],"address":[82],"this":[83],"limitation,":[84],"we":[85,116],"propose":[86],"Tube-conditioned":[88,139],"Reconstruction":[89],"with":[90,100,112,137],"Mutual":[91],"Constraints":[92],"(TubeRMC)":[93],"framework":[94],"text-conditioned":[97],"candidate":[98],"pre-trained":[101],"visual":[102],"grounding":[103],"models":[104],"further":[106,155],"refine":[107],"them":[108],"via":[109],"tube-conditioned":[110],"reconstruction":[111,119],"constraints.":[114],"Specifically,":[115],"design":[117],"three":[118],"strategies":[120],"from":[121],"temporal,":[122],"spatial,":[123],"perspectives":[126],"comprehensively":[128],"capture":[129],"rich":[130],"tube-text":[131],"correspondences.":[132],"Each":[133],"strategy":[134],"equipped":[136],"Reconstructor,":[140],"utilizing":[141],"as":[144],"condition":[145],"reconstruct":[147],"key":[149],"clues":[150],"query.":[153],"We":[154],"introduce":[156],"mutual":[157],"constraints":[158],"between":[159],"spatial":[160],"proposals":[163],"enhance":[165],"their":[166],"quality":[167],"for":[168],"reconstruction.":[169],"TubeRMC":[170,185],"outperforms":[171],"existing":[172],"methods":[173],"two":[175],"public":[176],"benchmarks":[177],"VidSTG":[178],"HCSTVG.":[180],"Further":[181],"visualization":[182],"shows":[183],"effectively":[186],"mitigates":[187],"both":[188],"errors":[191]},"counts_by_year":[],"updated_date":"2026-03-20T20:47:17.329874","created_date":"2026-03-18T00:00:00"}
