{"id":"https://openalex.org/W7137880032","doi":"https://doi.org/10.1609/aaai.v40i12.37956","title":"SpaceVLLM: Endowing Multimodal Large Language Model with Spatio-Temporal Video Grounding Capability","display_name":"SpaceVLLM: Endowing Multimodal Large Language Model with Spatio-Temporal Video Grounding Capability","publication_year":2026,"publication_date":"2026-03-14","ids":{"openalex":"https://openalex.org/W7137880032","doi":"https://doi.org/10.1609/aaai.v40i12.37956"},"language":null,"primary_location":{"id":"doi:10.1609/aaai.v40i12.37956","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i12.37956","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://doi.org/10.1609/aaai.v40i12.37956","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5129677778","display_name":"Jiankang Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Jiankang Wang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129681651","display_name":"Zhihan Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhihan Zhang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129649769","display_name":"Zhihang Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhihang Liu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129748007","display_name":"Yang Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang Li","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5008458929","display_name":"Jiannan Ge","orcid":"https://orcid.org/0000-0002-2580-9055"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiannan Ge","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129640945","display_name":"Hongtao Xie","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hongtao Xie","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5129716223","display_name":"Yongdong Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yongdong Zhang","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5129677778"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.11260254,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"40","issue":"12","first_page":"9912","last_page":"9920"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9835000038146973,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9835000038146973,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12290","display_name":"Human Motion and Animation","score":0.0015999999595806003,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.00139999995008111,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/construct","display_name":"Construct (python library)","score":0.6442000269889832},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.6363999843597412},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.5752000212669373},{"id":"https://openalex.org/keywords/minimum-bounding-box","display_name":"Minimum bounding box","score":0.5217999815940857},{"id":"https://openalex.org/keywords/bounding-overwatch","display_name":"Bounding overwatch","score":0.5174000263214111},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.48010000586509705},{"id":"https://openalex.org/keywords/motion","display_name":"Motion (physics)","score":0.46959999203681946},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.4530999958515167}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8535000085830688},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.6442000269889832},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.6363999843597412},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.5752000212669373},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.553600013256073},{"id":"https://openalex.org/C147037132","wikidata":"https://www.wikidata.org/wiki/Q6865426","display_name":"Minimum bounding box","level":3,"score":0.5217999815940857},{"id":"https://openalex.org/C63584917","wikidata":"https://www.wikidata.org/wiki/Q333286","display_name":"Bounding overwatch","level":2,"score":0.5174000263214111},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.48010000586509705},{"id":"https://openalex.org/C104114177","wikidata":"https://www.wikidata.org/wiki/Q79782","display_name":"Motion (physics)","level":2,"score":0.46959999203681946},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.4625999927520752},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.4530999958515167},{"id":"https://openalex.org/C18555067","wikidata":"https://www.wikidata.org/wiki/Q8375051","display_name":"Joint (building)","level":2,"score":0.4056999981403351},{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.3919000029563904},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3862000107765198},{"id":"https://openalex.org/C202474056","wikidata":"https://www.wikidata.org/wiki/Q1931635","display_name":"Video tracking","level":3,"score":0.3675999939441681},{"id":"https://openalex.org/C2778572836","wikidata":"https://www.wikidata.org/wiki/Q380933","display_name":"Space (punctuation)","level":2,"score":0.3361999988555908},{"id":"https://openalex.org/C48677424","wikidata":"https://www.wikidata.org/wiki/Q6888088","display_name":"Mode (computer interface)","level":2,"score":0.32690000534057617},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.2987000048160553},{"id":"https://openalex.org/C105842133","wikidata":"https://www.wikidata.org/wiki/Q1899679","display_name":"Visual communication","level":2,"score":0.2912999987602234},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.28279998898506165},{"id":"https://openalex.org/C121684516","wikidata":"https://www.wikidata.org/wiki/Q7600677","display_name":"Computer graphics (images)","level":1,"score":0.2824000120162964},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.28200000524520874},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.2563000023365021},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.25459998846054077}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1609/aaai.v40i12.37956","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i12.37956","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1609/aaai.v40i12.37956","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i12.37956","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Multimodal":[0],"Large":[1],"Language":[2],"Models":[3],"(MLLMs)":[4],"have":[5],"shown":[6],"remarkable":[7],"progress":[8],"in":[9,53,124,169],"temporal":[10],"or":[11],"spatial":[12,146],"localization":[13],"tasks,":[14,207],"but":[15],"struggle":[16],"with":[17,63,105,117],"joint":[18],"spatio-temporal":[19,106],"video":[20,66,107,118,163,205],"grounding":[21,108],"(STVG).":[22],"We":[23,133],"identify":[24],"two":[25],"key":[26],"bottlenecks":[27],"hindering":[28],"this":[29,97,170],"capability:":[30],"(1)":[31],"the":[32,87,122,149,158,193],"sheer":[33],"number":[34],"of":[35,50,90],"visual":[36,42],"tokens":[37],"makes":[38,55],"long-range":[39],"and":[40,92,129,156,199],"fine-grained":[41],"modeling":[43],"challenging;":[44],"(2)":[45],"generating":[46],"a":[47,81,102,136],"long":[48],"sequence":[49],"bounding":[51],"boxes":[52],"text":[54],"it":[56],"hard":[57],"to":[58,120,144,160,179],"accurately":[59],"align":[60],"each":[61],"box":[62],"its":[64],"specific":[65],"frame.":[67],"Distinct":[68],"from":[69],"prior":[70],"efforts":[71],"that":[72,85,141,190],"rely":[73],"on":[74,162,196,203],"attaching":[75],"complex":[76],"modules,":[77],"we":[78,99,111,172],"argue":[79],"for":[80,151],"more":[82],"elegant":[83],"paradigm":[84],"unlocks":[86],"inherent":[88],"potential":[89],"MLLMs":[91],"leverages":[93],"their":[94],"strengths.":[95],"To":[96,165],"end,":[98],"propose":[100,112,173],"\\textbf{\\textit{SpaceVLLM}},":[101],"MLLM":[103,123,159],"equipped":[104],"capabilities.":[109],"Specifically,":[110],"Spatio-Temporal":[113],"Aware":[114],"Queries,":[115],"interleaved":[116],"frames,":[119],"guide":[121],"capturing":[125],"both":[126],"static":[127],"appearance":[128],"dynamic":[130],"motion":[131],"features.":[132],"further":[134,166],"present":[135],"lightweight":[137],"Query-Guided":[138],"Space":[139],"Head":[140],"maps":[142],"queries":[143],"precise":[145],"coordinates,":[147],"bypassing":[148],"need":[150],"direct":[152],"textual":[153],"coordinate":[154],"generation":[155],"enabling":[157],"focus":[161],"understanding.":[164],"facilitate":[167],"research":[168],"area,":[171],"an":[174],"automated":[175],"data":[176],"synthesis":[177],"pipeline":[178],"construct":[180],"\\textbf{V-STG}":[181],"dataset,":[182],"comprising":[183],"110K":[184],"STVG":[185,197],"instances.":[186],"Extensive":[187],"experiments":[188],"show":[189],"\\textit{SpaceVLLM}":[191],"achieves":[192],"state-of-the-art":[194],"performance":[195,202],"benchmarks":[198],"maintains":[200],"strong":[201],"various":[204],"understanding":[206],"validating":[208],"our":[209],"approach's":[210],"effectiveness.":[211]},"counts_by_year":[],"updated_date":"2026-03-18T06:31:55.123368","created_date":"2026-03-18T00:00:00"}
