{"id":"https://openalex.org/W7155484776","doi":"https://doi.org/10.48550/arxiv.2604.21718","title":"Building a Precise Video Language with Human-AI Oversight","display_name":"Building a Precise Video Language with Human-AI Oversight","publication_year":2026,"publication_date":"2026-04-22","ids":{"openalex":"https://openalex.org/W7155484776","doi":"https://doi.org/10.48550/arxiv.2604.21718"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.21718","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.21718","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.21718","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5134500407","display_name":"Zhiqiu Lin","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Lin, Zhiqiu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5089380757","display_name":"Chancharik Mitra","orcid":"https://orcid.org/0009-0008-9826-7534"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mitra, Chancharik","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134518927","display_name":"Siyuan Cen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cen, Siyuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134546886","display_name":"Isaac Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Isaac","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5089274292","display_name":"Yuhan Huang","orcid":"https://orcid.org/0000-0001-9800-8788"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Yuhan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134544336","display_name":"Yu Tong Tiffany Ling","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ling, Yu Tong Tiffany","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134506714","display_name":"Hewei Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Hewei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120779923","display_name":"Irene Pi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pi, Irene","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134499008","display_name":"Shihang Zhu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhu, Shihang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134553030","display_name":"Ryan Rao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rao, Ryan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134548349","display_name":"George Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, George","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134493308","display_name":"Jiaxi Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Jiaxi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134510266","display_name":"Ruojin Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Ruojin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134466335","display_name":"Yili Han","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Han, Yili","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134467907","display_name":"Yilun Du","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Du, Yilun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5134490527","display_name":"Deva Ramanan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ramanan, Deva","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":16,"corresponding_author_ids":["https://openalex.org/A5134500407"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9508000016212463,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9508000016212463,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.02329999953508377,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12290","display_name":"Human Motion and Animation","score":0.005799999926239252,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.5157999992370605},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.4878999888896942},{"id":"https://openalex.org/keywords/suite","display_name":"Suite","score":0.47699999809265137},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.4512999951839447},{"id":"https://openalex.org/keywords/cinematography","display_name":"Cinematography","score":0.4507000148296356},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.43070000410079956},{"id":"https://openalex.org/keywords/point","display_name":"Point (geometry)","score":0.4253999888896942},{"id":"https://openalex.org/keywords/annotation","display_name":"Annotation","score":0.414900004863739},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.37549999356269836}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8064000010490417},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.5157999992370605},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.4878999888896942},{"id":"https://openalex.org/C79581498","wikidata":"https://www.wikidata.org/wiki/Q1367530","display_name":"Suite","level":2,"score":0.47699999809265137},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.4512999951839447},{"id":"https://openalex.org/C100991257","wikidata":"https://www.wikidata.org/wiki/Q590870","display_name":"Cinematography","level":2,"score":0.4507000148296356},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.43070000410079956},{"id":"https://openalex.org/C28719098","wikidata":"https://www.wikidata.org/wiki/Q44946","display_name":"Point (geometry)","level":2,"score":0.4253999888896942},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.41690000891685486},{"id":"https://openalex.org/C2776321320","wikidata":"https://www.wikidata.org/wiki/Q857525","display_name":"Annotation","level":2,"score":0.414900004863739},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4058000147342682},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.37549999356269836},{"id":"https://openalex.org/C2776187449","wikidata":"https://www.wikidata.org/wiki/Q1513879","display_name":"Natural language generation","level":3,"score":0.3260999917984009},{"id":"https://openalex.org/C177212765","wikidata":"https://www.wikidata.org/wiki/Q627335","display_name":"Workflow","level":2,"score":0.3208000063896179},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.3167000114917755},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3158000111579895},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.3124000132083893},{"id":"https://openalex.org/C133462117","wikidata":"https://www.wikidata.org/wiki/Q4929239","display_name":"Data collection","level":2,"score":0.3077000081539154},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.3050000071525574},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.3019999861717224},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.2944999933242798},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.2919999957084656},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.29100000858306885},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.2881999909877777},{"id":"https://openalex.org/C116834253","wikidata":"https://www.wikidata.org/wiki/Q2039217","display_name":"Identification (biology)","level":2,"score":0.28200000524520874},{"id":"https://openalex.org/C202474056","wikidata":"https://www.wikidata.org/wiki/Q1931635","display_name":"Video tracking","level":3,"score":0.2809000015258789},{"id":"https://openalex.org/C105842133","wikidata":"https://www.wikidata.org/wiki/Q1899679","display_name":"Visual communication","level":2,"score":0.2770000100135803},{"id":"https://openalex.org/C56666940","wikidata":"https://www.wikidata.org/wiki/Q788790","display_name":"Documentation","level":2,"score":0.2743000090122223},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.27379998564720154},{"id":"https://openalex.org/C166142869","wikidata":"https://www.wikidata.org/wiki/Q60061622","display_name":"Video production","level":2,"score":0.2732999920845032},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.2720000147819519},{"id":"https://openalex.org/C61423126","wikidata":"https://www.wikidata.org/wiki/Q187432","display_name":"Scripting language","level":2,"score":0.2624000012874603},{"id":"https://openalex.org/C93518851","wikidata":"https://www.wikidata.org/wiki/Q180160","display_name":"Metadata","level":2,"score":0.25429999828338623}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.21718","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.21718","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.21718","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.21718","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.7010759115219116,"display_name":"Decent work and economic growth","id":"https://metadata.un.org/sdg/8"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Video-language":[0],"models":[1,124,169,191],"(VLMs)":[2],"learn":[3],"to":[4,64,100,104,178,195,202,232],"reason":[5],"about":[6],"the":[7,164],"dynamic":[8],"visual":[9,53],"world":[10],"through":[11,134],"natural":[12],"language.":[13],"We":[14],"introduce":[15,69],"a":[16,35,74],"suite":[17],"of":[18,50,89,200,217],"open":[19],"datasets,":[20],"benchmarks,":[21],"and":[22,44,80,94,112,116,131,137,149,187,219,227,236,239],"recipes":[23],"for":[24,38,121],"scalable":[25],"oversight":[26,154,229],"that":[27,143,224],"enable":[28],"precise":[29,225],"video":[30,58,189,234],"captioning.":[31],"First,":[32],"we":[33,68,174],"define":[34],"structured":[36],"specification":[37,226],"describing":[39],"subjects,":[40],"scenes,":[41],"motion,":[42,212],"spatial,":[43],"camera":[45,211],"dynamics,":[46],"grounded":[47],"by":[48,96,152],"hundreds":[49],"carefully":[51],"defined":[52],"primitives":[54],"developed":[55],"with":[56],"professional":[57,181],"creators":[59],"such":[60,170,192],"as":[61,171,193],"filmmakers.":[62],"Next,":[63],"curate":[65],"high-quality":[66],"captions,":[67],"CHAI":[70],"(Critique-based":[71],"Human-AI":[72],"Oversight),":[73],"framework":[75],"where":[76],"trained":[77],"experts":[78],"critique":[79,132,144],"revise":[81],"model-generated":[82],"pre-captions":[83],"into":[84],"improved":[85],"post-captions.":[86],"This":[87],"division":[88],"labor":[90],"improves":[91],"annotation":[92],"accuracy":[93],"efficiency":[95],"offloading":[97],"text":[98],"generation":[99,133,190],"models,":[101],"allowing":[102],"humans":[103],"better":[105,196],"focus":[106],"on":[107,126,243],"verification.":[108],"Additionally,":[109],"these":[110],"critiques":[111],"preferences":[113],"between":[114],"pre-":[115],"post-captions":[117],"provide":[118],"rich":[119],"supervision":[120],"improving":[122],"open-source":[123],"(Qwen3-VL)":[125],"caption":[127],"generation,":[128],"reward":[129],"modeling,":[130],"SFT,":[135],"DPO,":[136],"inference-time":[138],"scaling.":[139],"Our":[140,221],"ablations":[141],"show":[142,223],"quality":[145],"in":[146],"precision,":[147],"recall,":[148],"constructiveness,":[150],"ensured":[151],"our":[153,176,244],"framework,":[155],"directly":[156],"governs":[157],"downstream":[158],"performance.":[159],"With":[160],"modest":[161],"expert":[162],"supervision,":[163],"resulting":[165],"model":[166],"outperforms":[167],"closed-source":[168],"Gemini-3.1-Pro.":[172],"Finally,":[173],"apply":[175],"approach":[177],"re-caption":[179],"large-scale":[180],"videos":[182],"(e.g.,":[183],"films,":[184],"commercials,":[185],"games)":[186],"fine-tune":[188],"Wan":[194],"follow":[197],"detailed":[198],"prompts":[199],"up":[201],"400":[203],"words,":[204],"achieving":[205],"finer":[206],"control":[207],"over":[208],"cinematography":[209],"including":[210],"angle,":[213],"lens,":[214],"focus,":[215],"point":[216],"view,":[218],"framing.":[220],"results":[222],"human-AI":[228],"are":[230,241],"key":[231],"professional-level":[233],"understanding":[235],"generation.":[237],"Data":[238],"code":[240],"available":[242],"project":[245],"page:":[246],"https://linzhiqiu.github.io/papers/chai/":[247]},"counts_by_year":[],"updated_date":"2026-04-25T06:06:54.107920","created_date":"2026-04-25T00:00:00"}
