{"id":"https://openalex.org/W4403885232","doi":"https://doi.org/10.48550/arxiv.2410.02768","title":"Uncertainty-Guided Self-Questioning and Answering for Video-Language Alignment","display_name":"Uncertainty-Guided Self-Questioning and Answering for Video-Language Alignment","publication_year":2024,"publication_date":"2024-09-17","ids":{"openalex":"https://openalex.org/W4403885232","doi":"https://doi.org/10.48550/arxiv.2410.02768"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2410.02768","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2410.02768","pdf_url":"https://arxiv.org/pdf/2410.02768","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2410.02768","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5108988769","display_name":"Jin Chen","orcid":"https://orcid.org/0009-0008-2301-2465"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Jin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5104271836","display_name":"Kaijing Ma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ma, Kaijing","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5067960278","display_name":"Haojian Huang","orcid":"https://orcid.org/0000-0002-0661-712X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Haojian","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101620682","display_name":"Fang Han","orcid":"https://orcid.org/0000-0002-1937-4710"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fang, Han","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5058002649","display_name":"Hao Sun","orcid":"https://orcid.org/0000-0001-8094-1991"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sun, Hao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Hosseinzadeh, Mehdi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hosseinzadeh, Mehdi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Liu, Zhe","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Zhe","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9950000047683716,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9950000047683716,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.983299970626831,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9764000177383423,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/bootstrapping","display_name":"Bootstrapping (finance)","score":0.8745232820510864},{"id":"https://openalex.org/keywords/question-answering","display_name":"Question answering","score":0.6461057066917419},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6159005761146545},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.534024715423584},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.36620742082595825},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.15343564748764038},{"id":"https://openalex.org/keywords/econometrics","display_name":"Econometrics","score":0.10555514693260193}],"concepts":[{"id":"https://openalex.org/C207609745","wikidata":"https://www.wikidata.org/wiki/Q4944086","display_name":"Bootstrapping (finance)","level":2,"score":0.8745232820510864},{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.6461057066917419},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6159005761146545},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.534024715423584},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.36620742082595825},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.15343564748764038},{"id":"https://openalex.org/C149782125","wikidata":"https://www.wikidata.org/wiki/Q160039","display_name":"Econometrics","level":1,"score":0.10555514693260193}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2410.02768","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2410.02768","pdf_url":"https://arxiv.org/pdf/2410.02768","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2410.02768","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2410.02768","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2410.02768","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2410.02768","pdf_url":"https://arxiv.org/pdf/2410.02768","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":false},"content_urls":{"pdf":"https://content.openalex.org/works/W4403885232.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W1534274833","https://openalex.org/W3117246195","https://openalex.org/W156620619","https://openalex.org/W2098233217","https://openalex.org/W2914363205","https://openalex.org/W2616249226","https://openalex.org/W3204019825"],"abstract_inverted_index":{"The":[0,207],"development":[1],"of":[2,40,55,93,115,142,156,197],"multi-modal":[3],"models":[4],"has":[5],"been":[6],"rapidly":[7],"advancing,":[8],"with":[9],"some":[10],"demonstrating":[11],"remarkable":[12],"capabilities.":[13],"However,":[14,101],"annotating":[15],"video-text":[16],"pairs":[17],"remains":[18],"expensive":[19],"and":[20,34,50,81,89,138,186,190,201],"insufficient.":[21],"Take":[22],"video":[23,87],"question":[24,73],"answering":[25],"(VideoQA)":[26],"tasks":[27],"as":[28,117],"an":[29],"example,":[30],"human":[31],"annotated":[32],"questions":[33,104,144],"answers":[35],"often":[36,48],"cover":[37],"only":[38],"part":[39],"the":[41,44,90,108,112,140,147,151,154,162,198,202],"video,":[42],"since":[43],"corresponding":[45],"text":[46],"is":[47,161],"short":[49],"monotonous,":[51],"leading":[52],"to":[53,97,135,164],"underutilization":[54],"video.":[56],"To":[57,124,153],"address":[58],"this,":[59],"we":[60,118,129,193],"propose":[61],"a":[62,68],"Bootstrapping":[63],"Video-Language":[64],"Alignment":[65],"framework":[66,200],"(BoViLA),":[67],"self-training":[69,167,199],"method":[70],"that":[71],"augments":[72],"samples":[74],"during":[75],"training":[76],"process":[77],"through":[78],"LLM-based":[79,166],"self-questioning":[80],"answering,":[82],"which":[83],"help":[84],"model":[85],"exploit":[86],"information":[88],"internal":[91],"knowledge":[92],"LLMs":[94],"more":[95],"thoroughly":[96],"improve":[98],"modality":[99,148,170],"alignment.":[100,171],"low-quality":[102],"self-generated":[103,127,143],"may":[105],"instead":[106],"contaminate":[107],"performance,":[109],"especially":[110],"in":[111,121],"early":[113],"stages":[114],"training,":[116],"have":[119],"observed":[120],"our":[122,157],"experiments.":[123],"filter":[125],"bad":[126],"questions,":[128],"introduce":[130],"Evidential":[131],"Deep":[132],"Learning":[133],"(EDL)":[134],"estimate":[136],"uncertainty":[137,204],"assess":[139],"quality":[141],"by":[145],"evaluating":[146],"alignment":[149],"within":[150],"context.":[152],"best":[155],"knowledge,":[158],"this":[159],"work":[160],"first":[163],"explore":[165],"frameworks":[168],"for":[169],"We":[172],"evaluate":[173],"BoViLA":[174],"on":[175],"five":[176],"strong":[177],"VideoQA":[178],"benchmarks,":[179],"where":[180],"it":[181],"outperforms":[182],"several":[183],"state-of-the-art":[184],"methods":[185],"demonstrate":[187],"its":[188],"effectiveness":[189],"generality.":[191],"Additionally,":[192],"provide":[194],"extensive":[195],"analyses":[196],"EDL-based":[203],"filtering":[205],"mechanism.":[206],"code":[208],"will":[209],"be":[210],"made":[211],"available.":[212]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2024-10-31T00:00:00"}
