{"id":"https://openalex.org/W4304098310","doi":"https://doi.org/10.1145/3503161.3548291","title":"AVQA: A Dataset for Audio-Visual Question Answering on Videos","display_name":"AVQA: A Dataset for Audio-Visual Question Answering on Videos","publication_year":2022,"publication_date":"2022-10-10","ids":{"openalex":"https://openalex.org/W4304098310","doi":"https://doi.org/10.1145/3503161.3548291"},"language":"en","primary_location":{"id":"doi:10.1145/3503161.3548291","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3503161.3548291","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3503161.3548291","source":{"id":"https://openalex.org/S4363608757","display_name":"Proceedings of the 30th ACM International Conference on Multimedia","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 30th ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"bronze","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3503161.3548291","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5071793366","display_name":"Pinci Yang","orcid":null},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Pinci Yang","raw_affiliation_strings":["Tsinghua University, Shenzhen, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tsinghua University, Shenzhen, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5022927606","display_name":"Xin Wang","orcid":"https://orcid.org/0000-0002-0351-2939"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xin Wang","raw_affiliation_strings":["Tsinghua University, Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5028258340","display_name":"Xuguang Duan","orcid":"https://orcid.org/0000-0001-9108-9618"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xuguang Duan","raw_affiliation_strings":["Tsinghua University, Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100420416","display_name":"Hong Chen","orcid":"https://orcid.org/0000-0002-0943-2286"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hong Chen","raw_affiliation_strings":["Tsinghua University, Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5024495682","display_name":"Runze Hou","orcid":"https://orcid.org/0000-0002-9697-7996"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Runze Hou","raw_affiliation_strings":["Tsinghua University, Shenzhen, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tsinghua University, Shenzhen, China","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5068772612","display_name":"Cong Jin","orcid":"https://orcid.org/0000-0003-0464-9862"},"institutions":[{"id":"https://openalex.org/I75689368","display_name":"Communication University of China","ror":"https://ror.org/04facbs33","country_code":"CN","type":"education","lineage":["https://openalex.org/I75689368"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Cong Jin","raw_affiliation_strings":["Communication University of China, Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Communication University of China, Beijing, China","institution_ids":["https://openalex.org/I75689368"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100339293","display_name":"Wenwu Zhu","orcid":"https://orcid.org/0000-0003-2236-9290"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wenwu Zhu","raw_affiliation_strings":["Tsinghua University, Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Tsinghua University, Beijing, China","institution_ids":["https://openalex.org/I99065089"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":2.6541,"has_fulltext":true,"cited_by_count":51,"citation_normalized_percentile":{"value":0.93142614,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":90,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"3480","last_page":"3491"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9937000274658203,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.9909999966621399,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8360185623168945},{"id":"https://openalex.org/keywords/modalities","display_name":"Modalities","score":0.7769243717193604},{"id":"https://openalex.org/keywords/audio-visual","display_name":"Audio visual","score":0.6775338053703308},{"id":"https://openalex.org/keywords/question-answering","display_name":"Question answering","score":0.631335973739624},{"id":"https://openalex.org/keywords/modality","display_name":"Modality (human\u2013computer interaction)","score":0.6040173768997192},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.581453263759613},{"id":"https://openalex.org/keywords/testbed","display_name":"Testbed","score":0.5232585668563843},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.49572694301605225},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4155055582523346},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.33478844165802},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.27087825536727905},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.12221083045005798}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8360185623168945},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.7769243717193604},{"id":"https://openalex.org/C3017588708","wikidata":"https://www.wikidata.org/wiki/Q758901","display_name":"Audio visual","level":2,"score":0.6775338053703308},{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.631335973739624},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.6040173768997192},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.581453263759613},{"id":"https://openalex.org/C31395832","wikidata":"https://www.wikidata.org/wiki/Q1318674","display_name":"Testbed","level":2,"score":0.5232585668563843},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.49572694301605225},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4155055582523346},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.33478844165802},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.27087825536727905},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.12221083045005798},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C144024400","wikidata":"https://www.wikidata.org/wiki/Q21201","display_name":"Sociology","level":0,"score":0.0},{"id":"https://openalex.org/C187736073","wikidata":"https://www.wikidata.org/wiki/Q2920921","display_name":"Management","level":1,"score":0.0},{"id":"https://openalex.org/C36289849","wikidata":"https://www.wikidata.org/wiki/Q34749","display_name":"Social science","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3503161.3548291","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3503161.3548291","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3503161.3548291","source":{"id":"https://openalex.org/S4363608757","display_name":"Proceedings of the 30th ACM International Conference on Multimedia","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 30th ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3503161.3548291","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3503161.3548291","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3503161.3548291","source":{"id":"https://openalex.org/S4363608757","display_name":"Proceedings of the 30th ACM International Conference on Multimedia","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 30th ACM International Conference on Multimedia","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G119957897","display_name":null,"funder_award_id":"62102222","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G1527691513","display_name":null,"funder_award_id":"62250008","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G2994122393","display_name":null,"funder_award_id":"No. 62250008, No. 62102222.","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G7084459132","display_name":null,"funder_award_id":"No. 62250008, No. 62102222","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G7340897647","display_name":null,"funder_award_id":"No. 2018AAA0102001","funder_id":"https://openalex.org/F4320335777","funder_display_name":"National Key Research and Development Program of China"},{"id":"https://openalex.org/G8273014382","display_name":null,"funder_award_id":"2018AAA0102001","funder_id":"https://openalex.org/F4320335777","funder_display_name":"National Key Research and Development Program of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320335777","display_name":"National Key Research and Development Program of China","ror":null}],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4304098310.pdf","grobid_xml":"https://content.openalex.org/works/W4304098310.grobid-xml"},"referenced_works_count":34,"referenced_works":["https://openalex.org/W2194775991","https://openalex.org/W2549139847","https://openalex.org/W2606982687","https://openalex.org/W2735159761","https://openalex.org/W2751525844","https://openalex.org/W2765716052","https://openalex.org/W2904452845","https://openalex.org/W2954199749","https://openalex.org/W2962934715","https://openalex.org/W2962949233","https://openalex.org/W2963890755","https://openalex.org/W2964220823","https://openalex.org/W2964306921","https://openalex.org/W2969127500","https://openalex.org/W2971923036","https://openalex.org/W2981582341","https://openalex.org/W2997344006","https://openalex.org/W2997789966","https://openalex.org/W2998166190","https://openalex.org/W3015371781","https://openalex.org/W3016658915","https://openalex.org/W3034730770","https://openalex.org/W3093274101","https://openalex.org/W3119243803","https://openalex.org/W3122622502","https://openalex.org/W3159630763","https://openalex.org/W3175961224","https://openalex.org/W3187433838","https://openalex.org/W4206227781","https://openalex.org/W4212774754","https://openalex.org/W4214604251","https://openalex.org/W4237040408","https://openalex.org/W4312761939","https://openalex.org/W4312974690"],"related_works":["https://openalex.org/W73545470","https://openalex.org/W4224266612","https://openalex.org/W2383394264","https://openalex.org/W4320153225","https://openalex.org/W4293261942","https://openalex.org/W3125968744","https://openalex.org/W203959209","https://openalex.org/W2110287964","https://openalex.org/W2167701463","https://openalex.org/W4307407935"],"abstract_inverted_index":{"Audio-visual":[0],"question":[1,94,176,181,204],"answering":[2,95,177,205],"aims":[3],"to":[4,66,81,140,155],"answer":[5],"questions":[6,49],"regarding":[7],"both":[8,119],"audio":[9,57,63],"and":[10,17,73,110,148,151],"visual":[11,52],"modalities":[12,150,161],"in":[13,23,39,99,124,206],"a":[14,91,125,135,196],"given":[15],"video,":[16],"has":[18],"drawn":[19],"increasing":[20],"research":[21],"interest":[22],"recent":[24],"years.":[25],"However,":[26],"there":[27],"have":[28],"been":[29],"no":[30],"appropriate":[31],"datasets":[32],"for":[33,190],"this":[34,79],"challenging":[35],"task":[36],"on":[37,97,116,162,202],"videos":[38,72,74,98,105],"real-life":[40,100,207],"scenarios":[41],"so":[42],"far.":[43],"They":[44],"are":[45],"either":[46],"designed":[47],"with":[48,64,195],"containing":[50],"only":[51],"clues":[53,117],"without":[54],"taking":[55],"any":[56],"information":[58,122,201],"into":[59],"account,":[60],"or":[61,130],"considering":[62],"restrictions":[65],"specific":[67],"scenarios,":[68],"such":[69],"as":[70],"panoramic":[71],"about":[75],"music":[76],"performances.":[77],"In":[78],"paper,":[80],"overcome":[82],"the":[83,157,174,191],"limitations":[84],"of":[85,159,193,199],"existing":[86],"datasets,":[87],"we":[88,133],"introduce":[89],"AVQA,":[90],"new":[92],"audio-visual":[93,108,175,203],"dataset":[96,210],"scenarios.":[101,208],"We":[102],"collect":[103],"57,015":[104],"from":[106,118],"daily":[107],"activities":[109],"57,335":[111],"specially-designed":[112],"question-answer":[113],"pairs":[114],"relying":[115],"modalities,":[120],"where":[121],"contained":[123],"single":[126],"modality":[127],"is":[128,211],"insufficient":[129],"ambiguous.":[131],"Furthermore,":[132],"propose":[134],"Hierarchical":[136],"Audio-Visual":[137],"Fusing":[138],"module":[139],"model":[141],"multiple":[142],"semantic":[143],"correlations":[144],"among":[145],"audio,":[146],"visual,":[147],"text":[149],"conduct":[152],"ablation":[153],"studies":[154],"analyze":[156],"role":[158],"different":[160],"our":[163,169],"datasets.":[164],"Experimental":[165],"results":[166],"show":[167],"that":[168],"proposed":[170],"method":[171],"significantly":[172],"improves":[173],"performance":[178],"over":[179],"various":[180],"types.":[182],"Therefore,":[183],"AVQA":[184],"can":[185],"provide":[186],"an":[187],"adequate":[188],"testbed":[189],"generation":[192],"models":[194],"deeper":[197],"understanding":[198],"multimodal":[200],"(The":[209],"available":[212],"at":[213],"https://mn.cs.tsinghua.edu.cn/avqa)":[214]},"counts_by_year":[{"year":2026,"cited_by_count":6},{"year":2025,"cited_by_count":14},{"year":2024,"cited_by_count":21},{"year":2023,"cited_by_count":9},{"year":2022,"cited_by_count":1}],"updated_date":"2026-06-16T09:24:06.705377","created_date":"2025-10-10T00:00:00"}
