{"id":"https://openalex.org/W7134807863","doi":"https://doi.org/10.48550/arxiv.2603.06732","title":"HERO: Hierarchical Embedding-Refinement for Open-Vocabulary Temporal Sentence Grounding in Videos","display_name":"HERO: Hierarchical Embedding-Refinement for Open-Vocabulary Temporal Sentence Grounding in Videos","publication_year":2026,"publication_date":"2026-03-06","ids":{"openalex":"https://openalex.org/W7134807863","doi":"https://doi.org/10.48550/arxiv.2603.06732"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2603.06732","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5128669461","display_name":"Tingting Han","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Han, Tingting","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128685791","display_name":"Xinsong Tao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tao, Xinsong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128647511","display_name":"Yufei Yin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yin, Yufei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128654511","display_name":"Min Tan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tan, Min","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128631403","display_name":"Sicheng Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Sicheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5128634488","display_name":"Zhou Yu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu, Zhou","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5128669461"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9804999828338623,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9804999828338623,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.005900000222027302,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.0017999999690800905,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.7267000079154968},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.6007999777793884},{"id":"https://openalex.org/keywords/vocabulary","display_name":"Vocabulary","score":0.5716999769210815},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5353999733924866},{"id":"https://openalex.org/keywords/sentence","display_name":"Sentence","score":0.5324000120162964},{"id":"https://openalex.org/keywords/bridge","display_name":"Bridge (graph theory)","score":0.49549999833106995},{"id":"https://openalex.org/keywords/construct","display_name":"Construct (python library)","score":0.48249998688697815},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.4616999924182892}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7889999747276306},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.7267000079154968},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6189000010490417},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.6007999777793884},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5770000219345093},{"id":"https://openalex.org/C2777601683","wikidata":"https://www.wikidata.org/wiki/Q6499736","display_name":"Vocabulary","level":2,"score":0.5716999769210815},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5353999733924866},{"id":"https://openalex.org/C2777530160","wikidata":"https://www.wikidata.org/wiki/Q41796","display_name":"Sentence","level":2,"score":0.5324000120162964},{"id":"https://openalex.org/C100776233","wikidata":"https://www.wikidata.org/wiki/Q2532492","display_name":"Bridge (graph theory)","level":2,"score":0.49549999833106995},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.48249998688697815},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.4616999924182892},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.40450000762939453},{"id":"https://openalex.org/C124304363","wikidata":"https://www.wikidata.org/wiki/Q673661","display_name":"Abstraction","level":2,"score":0.3846000134944916},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.3774999976158142},{"id":"https://openalex.org/C207390915","wikidata":"https://www.wikidata.org/wiki/Q1230525","display_name":"Divergence (linguistics)","level":2,"score":0.3747999966144562},{"id":"https://openalex.org/C188198153","wikidata":"https://www.wikidata.org/wiki/Q1613840","display_name":"Limiting","level":2,"score":0.3666999936103821},{"id":"https://openalex.org/C2164484","wikidata":"https://www.wikidata.org/wiki/Q5170150","display_name":"Core (optical fiber)","level":2,"score":0.3239000141620636},{"id":"https://openalex.org/C168993435","wikidata":"https://www.wikidata.org/wiki/Q6501125","display_name":"Ground","level":2,"score":0.3066999912261963},{"id":"https://openalex.org/C2779439875","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Natural language understanding","level":3,"score":0.2939999997615814},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.28630000352859497},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.2851000130176544},{"id":"https://openalex.org/C186644900","wikidata":"https://www.wikidata.org/wiki/Q194152","display_name":"Parsing","level":2,"score":0.2824000120162964},{"id":"https://openalex.org/C2776608160","wikidata":"https://www.wikidata.org/wiki/Q4785462","display_name":"Natural (archaeology)","level":2,"score":0.25769999623298645}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2603.06732","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2603.06732","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.06732","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2603.06732","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.7452864646911621}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Temporal":[0],"Sentence":[1],"Grounding":[2],"in":[3],"Videos":[4],"(TSGV)":[5],"aims":[6],"to":[7,16,36,38],"temporally":[8],"localize":[9],"segments":[10],"of":[11,79,156],"a":[12,17,96,159],"video":[13],"that":[14,99,137],"correspond":[15],"given":[18],"natural":[19],"language":[20],"query.":[21],"Despite":[22],"recent":[23],"progress,":[24],"most":[25],"existing":[26],"TSGV":[27,56],"approaches":[28],"operate":[29],"under":[30,144],"closed-vocabulary":[31],"settings,":[32],"limiting":[33],"their":[34],"ability":[35],"generalize":[37],"real-world":[39],"queries":[40],"involving":[41],"novel":[42],"or":[43],"diverse":[44],"linguistic":[45,102],"expressions.":[46],"To":[47,86],"bridge":[48],"this":[49],"critical":[50],"gap,":[51],"we":[52,89],"introduce":[53],"the":[54,61,154],"Open-Vocabulary":[55,94],"(OV-TSGV)":[57],"task":[58],"and":[59,65,71,104,114,122,132,152],"construct":[60],"first":[62],"dedicated":[63],"benchmarks--Charades-OV":[64],"ActivityNet-OV--that":[66],"simulate":[67],"realistic":[68],"vocabulary":[69,134],"shifts":[70],"paraphrastic":[72],"variations.":[73],"These":[74],"benchmarks":[75,135],"facilitate":[76],"systematic":[77],"evaluation":[78],"model":[80],"generalization":[81,150],"beyond":[82],"seen":[83],"training":[84],"concepts.":[85],"tackle":[87],"OV-TSGV,":[88],"propose":[90],"HERO(Hierarchical":[91],"Embedding-Refinement":[92],"for":[93],"grounding),":[95],"unified":[97],"framework":[98],"leverages":[100],"hierarchical":[101],"embeddings":[103],"performs":[105],"parallel":[106],"cross-modal":[107],"refinement.":[108,126],"HERO":[109,138],"jointly":[110],"models":[111],"multi-level":[112],"semantics":[113],"enhances":[115],"video-language":[116],"alignment":[117],"via":[118],"semantic-guided":[119],"visual":[120],"filtering":[121],"contrastive":[123],"masked":[124],"text":[125],"Extensive":[127],"experiments":[128],"on":[129],"both":[130],"standard":[131],"open":[133],"demonstrate":[136],"consistently":[139],"surpasses":[140],"state-of-the-art":[141],"methods,":[142],"particularly":[143],"open-vocabulary":[145],"scenarios,":[146],"validating":[147],"its":[148],"strong":[149],"capability":[151],"underscoring":[153],"significance":[155],"OV-TSGV":[157],"as":[158],"new":[160],"research":[161],"direction.":[162]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-03-11T00:00:00"}
