{"id":"https://openalex.org/W4403899069","doi":"https://doi.org/10.48550/arxiv.2410.03290","title":"Grounded-VideoLLM: Sharpening Fine-grained Temporal Grounding in Video Large Language Models","display_name":"Grounded-VideoLLM: Sharpening Fine-grained Temporal Grounding in Video Large Language Models","publication_year":2024,"publication_date":"2024-10-04","ids":{"openalex":"https://openalex.org/W4403899069","doi":"https://doi.org/10.48550/arxiv.2410.03290"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2410.03290","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2410.03290","pdf_url":"https://arxiv.org/pdf/2410.03290","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2410.03290","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5008475639","display_name":"Haibo Wang","orcid":"https://orcid.org/0000-0002-7514-1719"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Wang, Haibo","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113079567","display_name":"Zhiyang Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Zhiyang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100901892","display_name":"Yu Cheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cheng, Yu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5114462435","display_name":"Shizhe Diao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Diao, Shizhe","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5087777496","display_name":"Yufan Zhou","orcid":"https://orcid.org/0000-0001-7188-3072"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Yufan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101488072","display_name":"Yixin Cao","orcid":"https://orcid.org/0000-0002-1632-7812"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cao, Yixin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5055007304","display_name":"Qifan Wang","orcid":"https://orcid.org/0000-0002-5304-7975"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Qifan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101647216","display_name":"Weifeng Ge","orcid":"https://orcid.org/0009-0000-6627-5101"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ge, Weifeng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5101307306","display_name":"Lifu Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Lifu","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5008475639"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":1,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9912999868392944,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9912999868392944,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9782000184059143,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.965499997138977,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/sharpening","display_name":"Sharpening","score":0.9435888528823853},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5961971282958984},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.28343647718429565}],"concepts":[{"id":"https://openalex.org/C2781137444","wikidata":"https://www.wikidata.org/wiki/Q237105","display_name":"Sharpening","level":2,"score":0.9435888528823853},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5961971282958984},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.28343647718429565}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2410.03290","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2410.03290","pdf_url":"https://arxiv.org/pdf/2410.03290","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2410.03290","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2410.03290","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2410.03290","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2410.03290","pdf_url":"https://arxiv.org/pdf/2410.03290","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":false},"content_urls":{"pdf":"https://content.openalex.org/works/W4403899069.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2329932281","https://openalex.org/W64535957","https://openalex.org/W2043790407","https://openalex.org/W2348902545","https://openalex.org/W2321247741","https://openalex.org/W2136313643","https://openalex.org/W2087564795"],"abstract_inverted_index":{"Video":[0],"Large":[1],"Language":[2],"Models":[3],"(Video-LLMs)":[4],"have":[5,47],"demonstrated":[6],"remarkable":[7],"capabilities":[8],"in":[9,38,150],"coarse-grained":[10],"video":[11,36,51,116,160,173,177],"understanding,":[12],"however,":[13],"they":[14,54],"struggle":[15],"with":[16,89,109],"fine-grained":[17,40,50,151],"temporal":[18,57,75,86,117,127,156],"grounding.":[19],"In":[20,62],"this":[21],"paper,":[22],"we":[23,66,102,130],"introduce":[24],"Grounded-VideoLLM,":[25,101],"a":[26,39,104,133,171],"novel":[27],"Video-LLM":[28],"adept":[29],"at":[30],"perceiving":[31],"and":[32,59,83,113,162],"reasoning":[33,128],"over":[34],"specific":[35,90],"moments":[37],"manner.":[41],"We":[42],"identify":[43],"that":[44,145],"current":[45],"Video-LLMs":[46],"limitations":[48],"for":[49,175],"understanding":[52],"since":[53],"lack":[55],"effective":[56],"modeling":[58],"timestamp":[60],"representation.":[61],"light":[63],"of":[64,100,120],"this,":[65],"sharpen":[67],"our":[68],"model":[69],"by":[70,137],"incorporating":[71],"(1)":[72],"an":[73,138],"additional":[74],"stream":[76],"to":[77,93],"encode":[78],"the":[79,98],"relationships":[80],"between":[81],"frames":[82],"(2)":[84],"discrete":[85],"tokens":[87],"enriched":[88],"time":[91],"knowledge":[92],"represent":[94],"timestamps.":[95],"To":[96,123],"optimize":[97],"training":[99,106],"employ":[103],"multi-stage":[105],"scheme,":[107],"beginning":[108],"simple":[110],"video-captioning":[111],"tasks":[112,119,153],"progressively":[114],"introducing":[115],"grounding":[118,152],"increasing":[121],"complexity.":[122],"further":[124],"enhance":[125],"Grounded-VideoLLM's":[126],"capability,":[129],"also":[131,166],"curate":[132],"grounded":[134,163],"VideoQA":[135],"dataset":[136],"automatic":[139],"annotation":[140],"pipeline.":[141],"Extensive":[142],"experiments":[143],"demonstrate":[144],"Grounded-VideoLLM":[146],"not":[147],"only":[148],"excels":[149],"such":[154],"as":[155,170],"sentence":[157],"grounding,":[158],"dense":[159],"captioning,":[161],"VideoQA,":[164],"but":[165],"shows":[167],"great":[168],"potential":[169],"versatile":[172],"assistant":[174],"general":[176],"understanding.":[178]},"counts_by_year":[{"year":2024,"cited_by_count":1}],"updated_date":"2026-03-11T14:59:36.786465","created_date":"2025-10-10T00:00:00"}
