{"id":"https://openalex.org/W7137882842","doi":"https://doi.org/10.1609/aaai.v40i8.37609","title":"Object-Centric Framework for Video Moment Retrieval","display_name":"Object-Centric Framework for Video Moment Retrieval","publication_year":2026,"publication_date":"2026-03-14","ids":{"openalex":"https://openalex.org/W7137882842","doi":"https://doi.org/10.1609/aaai.v40i8.37609"},"language":null,"primary_location":{"id":"doi:10.1609/aaai.v40i8.37609","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i8.37609","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://doi.org/10.1609/aaai.v40i8.37609","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5016271188","display_name":"Zongyao Li","orcid":"https://orcid.org/0000-0002-3300-1806"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zongyao Li","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5020006712","display_name":"Yongkang Wong","orcid":"https://orcid.org/0000-0002-1239-4428"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yongkang Wong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129709066","display_name":"Satoshi Yamazaki","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Satoshi Yamazaki","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129692811","display_name":"Jianquan Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jianquan Liu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5129700264","display_name":"Mohan S. Kankanhalli","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mohan Kankanhalli","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5016271188"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.1037196,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"40","issue":"8","first_page":"6771","last_page":"6779"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8899000287055969,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.8899000287055969,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.033799998462200165,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.019700000062584877,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/encode","display_name":"ENCODE","score":0.7900999784469604},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.565500020980835},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.5544999837875366},{"id":"https://openalex.org/keywords/construct","display_name":"Construct (python library)","score":0.548799991607666},{"id":"https://openalex.org/keywords/parsing","display_name":"Parsing","score":0.5372999906539917},{"id":"https://openalex.org/keywords/moment","display_name":"Moment (physics)","score":0.5012999773025513},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.4375999867916107},{"id":"https://openalex.org/keywords/graph","display_name":"Graph","score":0.3935999870300293}],"concepts":[{"id":"https://openalex.org/C66746571","wikidata":"https://www.wikidata.org/wiki/Q1134833","display_name":"ENCODE","level":3,"score":0.7900999784469604},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7889000177383423},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5957000255584717},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.565500020980835},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.5544999837875366},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.548799991607666},{"id":"https://openalex.org/C186644900","wikidata":"https://www.wikidata.org/wiki/Q194152","display_name":"Parsing","level":2,"score":0.5372999906539917},{"id":"https://openalex.org/C179254644","wikidata":"https://www.wikidata.org/wiki/Q13222844","display_name":"Moment (physics)","level":2,"score":0.5012999773025513},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.4375999867916107},{"id":"https://openalex.org/C132525143","wikidata":"https://www.wikidata.org/wiki/Q141488","display_name":"Graph","level":2,"score":0.3935999870300293},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.38830000162124634},{"id":"https://openalex.org/C188198153","wikidata":"https://www.wikidata.org/wiki/Q1613840","display_name":"Limiting","level":2,"score":0.38670000433921814},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.383899986743927},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3752000033855438},{"id":"https://openalex.org/C202474056","wikidata":"https://www.wikidata.org/wiki/Q1931635","display_name":"Video tracking","level":3,"score":0.37459999322891235},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.3337000012397766},{"id":"https://openalex.org/C64876066","wikidata":"https://www.wikidata.org/wiki/Q5141226","display_name":"Cognitive neuroscience of visual object recognition","level":3,"score":0.31869998574256897},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.31470000743865967},{"id":"https://openalex.org/C179372163","wikidata":"https://www.wikidata.org/wiki/Q1406181","display_name":"Scene graph","level":3,"score":0.302700012922287},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.2870999872684479},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.2815000116825104},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.27630001306533813},{"id":"https://openalex.org/C48103436","wikidata":"https://www.wikidata.org/wiki/Q599031","display_name":"State (computer science)","level":2,"score":0.26930001378059387},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.2662999927997589},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.258899986743927}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1609/aaai.v40i8.37609","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i8.37609","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1609/aaai.v40i8.37609","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i8.37609","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Most":[0],"existing":[1,67,184],"video":[2,105],"moment":[3,86],"retrieval":[4],"methods":[5,186],"rely":[6],"on":[7,115,170],"temporal":[8,53],"sequences":[9,123,132],"of":[10,66,160],"frame-":[11],"or":[12],"clip-level":[13],"features":[14],"that":[15,124,180],"primarily":[16],"encode":[17,125],"global":[18],"visual":[19,127],"and":[20,33,48,99,111,128,175],"semantic":[21,129],"information.":[22,130],"However,":[23],"such":[24],"representations":[25],"often":[26],"fail":[27],"to":[28,107],"capture":[29],"fine-grained":[30],"object":[31,57],"semantics":[32],"appearance,":[34],"which":[35,140],"are":[36,133],"crucial":[37],"for":[38,85],"localizing":[39],"moments":[40,161],"described":[41],"by":[42,135],"object-oriented":[43,164],"queries":[44],"involving":[45],"specific":[46],"entities":[47],"their":[49,112],"interactions.":[50],"In":[51],"particular,":[52],"dynamics":[54],"at":[55],"the":[56,64,116],"level":[58],"have":[59],"been":[60],"largely":[61],"overlooked,":[62],"limiting":[63],"effectiveness":[65],"approaches":[68],"in":[69],"scenarios":[70],"requiring":[71],"detailed":[72],"object-level":[73,121,151],"reasoning.":[74],"To":[75],"address":[76],"this":[77],"limitation,":[78],"we":[79,119],"propose":[80],"a":[81,95,136],"novel":[82],"object-centric":[83],"framework":[84,155],"retrieval.":[87],"Our":[88],"method":[89,169,182],"first":[90],"extracts":[91],"query-relevant":[92],"objects":[93,110,145],"using":[94],"scene":[96,102,117],"graph":[97],"parser":[98],"then":[100],"generates":[101],"graphs":[103],"from":[104],"frames":[106],"represent":[108],"these":[109],"relationships.":[113],"Based":[114],"graphs,":[118],"construct":[120],"feature":[122],"rich":[126],"These":[131],"processed":[134],"relational":[137],"tracklet":[138],"transformer,":[139],"models":[141],"spatio-temporal":[142],"correlations":[143],"among":[144],"over":[146],"time.":[147],"By":[148],"explicitly":[149],"capturing":[150],"state":[152],"changes,":[153],"our":[154,168,181],"enables":[156],"more":[157],"accurate":[158],"localization":[159],"aligned":[162],"with":[163],"queries.":[165],"We":[166],"evaluated":[167],"three":[171],"benchmarks:":[172],"Charades-STA,":[173],"QVHighlights,":[174],"TACoS.":[176],"Experimental":[177],"results":[178],"demonstrate":[179],"outperforms":[183],"state-of-the-art":[185],"across":[187],"all":[188],"benchmarks.":[189]},"counts_by_year":[],"updated_date":"2026-05-21T06:26:12.895304","created_date":"2026-03-18T00:00:00"}
