{"id":"https://openalex.org/W4403511263","doi":"https://doi.org/10.1109/tpami.2024.3479776","title":"VALOR: Vision-Audio-Language Omni-Perception Pretraining Model and Dataset","display_name":"VALOR: Vision-Audio-Language Omni-Perception Pretraining Model and Dataset","publication_year":2024,"publication_date":"2024-10-17","ids":{"openalex":"https://openalex.org/W4403511263","doi":"https://doi.org/10.1109/tpami.2024.3479776","pmid":"https://pubmed.ncbi.nlm.nih.gov/39418158"},"language":"en","primary_location":{"id":"doi:10.1109/tpami.2024.3479776","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpami.2024.3479776","pdf_url":null,"source":{"id":"https://openalex.org/S199944782","display_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","issn_l":"0162-8828","issn":["0162-8828","1939-3539","2160-9292"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320439","host_organization_name":"IEEE Computer Society","host_organization_lineage":["https://openalex.org/P4310320439","https://openalex.org/P4310319808"],"host_organization_lineage_names":["IEEE Computer Society","Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","pubmed"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5108392430","display_name":"Jing Liu","orcid":"https://orcid.org/0000-0003-0903-9131"},"institutions":[{"id":"https://openalex.org/I4210100255","display_name":"Beijing Academy of Artificial Intelligence","ror":"https://ror.org/016a74861","country_code":"CN","type":"other","lineage":["https://openalex.org/I4210100255"]},{"id":"https://openalex.org/I4210112150","display_name":"Institute of Automation","ror":"https://ror.org/022c3hy66","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210112150"]},{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Jing Liu","raw_affiliation_strings":["School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing, China","School of Artificial Intelligence, University of Chinese Academy of Sciences and National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences, China"],"affiliations":[{"raw_affiliation_string":"School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210100255","https://openalex.org/I4210165038"]},{"raw_affiliation_string":"School of Artificial Intelligence, University of Chinese Academy of Sciences and National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences, China","institution_ids":["https://openalex.org/I4210112150","https://openalex.org/I4210165038"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5067083698","display_name":"Sihan Chen","orcid":"https://orcid.org/0009-0001-3539-8085"},"institutions":[{"id":"https://openalex.org/I4210100255","display_name":"Beijing Academy of Artificial Intelligence","ror":"https://ror.org/016a74861","country_code":"CN","type":"other","lineage":["https://openalex.org/I4210100255"]},{"id":"https://openalex.org/I4210112150","display_name":"Institute of Automation","ror":"https://ror.org/022c3hy66","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210112150"]},{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Sihan Chen","raw_affiliation_strings":["School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing, China","School of Artificial Intelligence, University of Chinese Academy of Sciences and National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences, China"],"affiliations":[{"raw_affiliation_string":"School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210100255","https://openalex.org/I4210165038"]},{"raw_affiliation_string":"School of Artificial Intelligence, University of Chinese Academy of Sciences and National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences, China","institution_ids":["https://openalex.org/I4210112150","https://openalex.org/I4210165038"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101777772","display_name":"Xingjian He","orcid":"https://orcid.org/0000-0001-5396-6253"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"government","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210112150","display_name":"Institute of Automation","ror":"https://ror.org/022c3hy66","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210112150"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xingjian He","raw_affiliation_strings":["National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences, Beijing, China","National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences, China"],"affiliations":[{"raw_affiliation_string":"National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210112150","https://openalex.org/I19820366"]},{"raw_affiliation_string":"National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences, China","institution_ids":["https://openalex.org/I4210112150","https://openalex.org/I19820366"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5040424302","display_name":"Longteng Guo","orcid":"https://orcid.org/0000-0002-4340-4000"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"government","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210112150","display_name":"Institute of Automation","ror":"https://ror.org/022c3hy66","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210112150"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Longteng Guo","raw_affiliation_strings":["National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences, Beijing, China","National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences, China"],"affiliations":[{"raw_affiliation_string":"National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210112150","https://openalex.org/I19820366"]},{"raw_affiliation_string":"National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences, China","institution_ids":["https://openalex.org/I4210112150","https://openalex.org/I19820366"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103047969","display_name":"Xinxin Zhu","orcid":"https://orcid.org/0000-0002-2142-5580"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"government","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210112150","display_name":"Institute of Automation","ror":"https://ror.org/022c3hy66","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210112150"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xinxin Zhu","raw_affiliation_strings":["National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences, Beijing, China","National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences, China"],"affiliations":[{"raw_affiliation_string":"National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210112150","https://openalex.org/I19820366"]},{"raw_affiliation_string":"National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences, China","institution_ids":["https://openalex.org/I4210112150","https://openalex.org/I19820366"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100683352","display_name":"Weining Wang","orcid":"https://orcid.org/0000-0001-7299-6431"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"government","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210112150","display_name":"Institute of Automation","ror":"https://ror.org/022c3hy66","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210112150"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Weining Wang","raw_affiliation_strings":["National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences, Beijing, China","National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences, China"],"affiliations":[{"raw_affiliation_string":"National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210112150","https://openalex.org/I19820366"]},{"raw_affiliation_string":"National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences, China","institution_ids":["https://openalex.org/I4210112150","https://openalex.org/I19820366"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5035112538","display_name":"Jinhui Tang","orcid":"https://orcid.org/0000-0001-9008-222X"},"institutions":[{"id":"https://openalex.org/I36399199","display_name":"Nanjing University of Science and Technology","ror":"https://ror.org/00xp9wg62","country_code":"CN","type":"education","lineage":["https://openalex.org/I36399199"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jinhui Tang","raw_affiliation_strings":["School of Computer Science and Engineering, Nanjing University of Science and Technology, Nanjing, China","Nanjing University of Science and Technology, School of Computer Science and Engineering, China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Nanjing University of Science and Technology, Nanjing, China","institution_ids":["https://openalex.org/I36399199"]},{"raw_affiliation_string":"Nanjing University of Science and Technology, School of Computer Science and Engineering, China","institution_ids":["https://openalex.org/I36399199"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5108392430"],"corresponding_institution_ids":["https://openalex.org/I4210100255","https://openalex.org/I4210112150","https://openalex.org/I4210165038"],"apc_list":null,"apc_paid":null,"fwci":10.181,"has_fulltext":false,"cited_by_count":37,"citation_normalized_percentile":{"value":0.98951049,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":98,"max":100},"biblio":{"volume":"47","issue":"2","first_page":"708","last_page":"724"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9577000141143799,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9577000141143799,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.687637984752655},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.6551164388656616},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.6497082114219666},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5115107297897339},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.5010819435119629},{"id":"https://openalex.org/keywords/audio-visual","display_name":"Audio visual","score":0.44694986939430237},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.3731059432029724},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.32421785593032837},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.16294366121292114},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.15435370802879333}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.687637984752655},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.6551164388656616},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6497082114219666},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5115107297897339},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.5010819435119629},{"id":"https://openalex.org/C3017588708","wikidata":"https://www.wikidata.org/wiki/Q758901","display_name":"Audio visual","level":2,"score":0.44694986939430237},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3731059432029724},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.32421785593032837},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.16294366121292114},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.15435370802879333},{"id":"https://openalex.org/C169760540","wikidata":"https://www.wikidata.org/wiki/Q207011","display_name":"Neuroscience","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/tpami.2024.3479776","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpami.2024.3479776","pdf_url":null,"source":{"id":"https://openalex.org/S199944782","display_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","issn_l":"0162-8828","issn":["0162-8828","1939-3539","2160-9292"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320439","host_organization_name":"IEEE Computer Society","host_organization_lineage":["https://openalex.org/P4310320439","https://openalex.org/P4310319808"],"host_organization_lineage_names":["IEEE Computer Society","Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","raw_type":"journal-article"},{"id":"pmid:39418158","is_oa":false,"landing_page_url":"https://pubmed.ncbi.nlm.nih.gov/39418158","pdf_url":null,"source":{"id":"https://openalex.org/S4306525036","display_name":"PubMed","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE transactions on pattern analysis and machine intelligence","raw_type":null}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1283858192","display_name":null,"funder_award_id":"U21B2043","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G1508456709","display_name":null,"funder_award_id":"62102416","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":118,"referenced_works":["https://openalex.org/W1861492603","https://openalex.org/W1956340063","https://openalex.org/W2277195237","https://openalex.org/W2425121537","https://openalex.org/W2506483933","https://openalex.org/W2560730294","https://openalex.org/W2593116425","https://openalex.org/W2606982687","https://openalex.org/W2745461083","https://openalex.org/W2765716052","https://openalex.org/W2886641317","https://openalex.org/W2896457183","https://openalex.org/W2914699769","https://openalex.org/W2952132648","https://openalex.org/W2963017553","https://openalex.org/W2963084599","https://openalex.org/W2963916161","https://openalex.org/W2964220823","https://openalex.org/W2979936813","https://openalex.org/W2984008963","https://openalex.org/W2984862483","https://openalex.org/W2989322838","https://openalex.org/W2997805943","https://openalex.org/W3015591594","https://openalex.org/W3035365026","https://openalex.org/W3035635319","https://openalex.org/W3037773948","https://openalex.org/W3043840704","https://openalex.org/W3090449556","https://openalex.org/W3091588028","https://openalex.org/W3105232955","https://openalex.org/W3130796238","https://openalex.org/W3138516171","https://openalex.org/W3158986867","https://openalex.org/W3160577380","https://openalex.org/W3168640669","https://openalex.org/W3173220247","https://openalex.org/W3176481196","https://openalex.org/W3176641147","https://openalex.org/W3176689360","https://openalex.org/W3187963534","https://openalex.org/W3196974791","https://openalex.org/W3197457832","https://openalex.org/W3197828817","https://openalex.org/W3198452188","https://openalex.org/W3204588463","https://openalex.org/W3204670646","https://openalex.org/W3206634578","https://openalex.org/W3206996142","https://openalex.org/W3217102353","https://openalex.org/W3217340782","https://openalex.org/W3217578129","https://openalex.org/W4212841753","https://openalex.org/W4221142658","https://openalex.org/W4226289673","https://openalex.org/W4285606530","https://openalex.org/W4304014690","https://openalex.org/W4304098310","https://openalex.org/W4312299780","https://openalex.org/W4312380001","https://openalex.org/W4312384316","https://openalex.org/W4312463400","https://openalex.org/W4312538879","https://openalex.org/W4312560592","https://openalex.org/W4312661097","https://openalex.org/W4312746376","https://openalex.org/W4312784228","https://openalex.org/W4312864639","https://openalex.org/W4312881242","https://openalex.org/W4312898271","https://openalex.org/W4312922092","https://openalex.org/W4313190371","https://openalex.org/W4382202943","https://openalex.org/W4385445765","https://openalex.org/W4385572645","https://openalex.org/W4386076176","https://openalex.org/W4386076615","https://openalex.org/W4386076661","https://openalex.org/W4386083024","https://openalex.org/W6620707391","https://openalex.org/W6676497082","https://openalex.org/W6678262379","https://openalex.org/W6684090549","https://openalex.org/W6752083267","https://openalex.org/W6766904570","https://openalex.org/W6778883912","https://openalex.org/W6779068807","https://openalex.org/W6784184991","https://openalex.org/W6784333009","https://openalex.org/W6789909235","https://openalex.org/W6790019176","https://openalex.org/W6791353385","https://openalex.org/W6793736971","https://openalex.org/W6795711426","https://openalex.org/W6797109355","https://openalex.org/W6798805250","https://openalex.org/W6798955355","https://openalex.org/W6800139874","https://openalex.org/W6801013943","https://openalex.org/W6802305790","https://openalex.org/W6803872405","https://openalex.org/W6803953248","https://openalex.org/W6805349323","https://openalex.org/W6810042059","https://openalex.org/W6810090734","https://openalex.org/W6810334672","https://openalex.org/W6811013733","https://openalex.org/W6811072154","https://openalex.org/W6838434436","https://openalex.org/W6842585177","https://openalex.org/W6845577938","https://openalex.org/W6846087911","https://openalex.org/W6846305988","https://openalex.org/W6846867676","https://openalex.org/W6849762702","https://openalex.org/W6850204008","https://openalex.org/W6864544085","https://openalex.org/W6898505805"],"related_works":["https://openalex.org/W2271369634","https://openalex.org/W3147472394","https://openalex.org/W2047100085","https://openalex.org/W2350550760","https://openalex.org/W578794879","https://openalex.org/W2625296515","https://openalex.org/W3137890128","https://openalex.org/W1984634519","https://openalex.org/W4245955731","https://openalex.org/W2393726419"],"abstract_inverted_index":{"In":[0],"this":[1],"paper,":[2],"we":[3,108],"propose":[4],"the":[5,24,60,79],"Vision-Audio-Language":[6],"Omni-peRception":[7],"pretraining":[8,19,106],"model":[9],"(VALOR)":[10],"for":[11,41,48],"multimodal":[12,49,134],"understanding":[13],"and":[14,29,45,67,76,87,136,145,155],"generation.":[15,52],"Unlike":[16],"widely-studied":[17],"vision-language":[18],"models,":[20],"VALOR":[21,61,130,157],"jointly":[22],"models":[23],"relationships":[25],"among":[26],"vision,":[27,74,99],"audio,":[28,100],"language":[30],"in":[31],"an":[32],"end-to-end":[33],"manner.":[34],"It":[35],"consists":[36],"of":[37,98,165],"three":[38],"separate":[39],"encoders":[40],"single":[42],"modality":[43],"representations":[44],"a":[46,110,163],"decoder":[47],"conditional":[50],"text":[51,94],"We":[53],"design":[54],"two":[55],"pretext":[56],"tasks":[57,141],"to":[58,92,138],"pretrain":[59],"model:":[62],"Multimodal":[63,68],"Grouping":[64,69],"Alignment":[65],"(MGA)":[66],"Captioning":[70],"(MGC).":[71],"MGA":[72],"projects":[73],"language,":[75],"audio":[77],"into":[78],"same":[80],"common":[81],"space,":[82],"simultaneously":[83],"building":[84],"vision-language,":[85,153],"audio-language,":[86,154],"audiovisual-language":[88],"alignment.":[89],"MGC":[90],"learns":[91],"generate":[93],"tokens":[95],"under":[96],"conditions":[97],"or":[101],"both.":[102],"To":[103],"promote":[104],"vision-audio-language":[105],"research,":[107],"construct":[109],"large-scale,":[111],"high-quality":[112],"tri-modality":[113],"dataset":[114],"named":[115],"VALOR-1M,":[116],"containing":[117],"1":[118],"million":[119],"audible":[120],"videos":[121],"with":[122,148],"human-annotated":[123],"audiovisual":[124],"captions.":[125],"Extensive":[126],"experiments":[127],"show":[128],"that":[129],"can":[131],"learn":[132],"strong":[133],"correlations":[135],"generalize":[137],"various":[139],"downstream":[140],"(e.g.,":[142,152],"retrieval,":[143],"captioning,":[144],"question":[146],"answering)":[147],"different":[149],"input":[150],"modalities":[151],"audiovisual-language).":[156],"achieves":[158],"new":[159],"state-of-the-art":[160],"performance":[161],"on":[162],"series":[164],"public":[166],"cross-modality":[167],"benchmarks.":[168]},"counts_by_year":[{"year":2026,"cited_by_count":4},{"year":2025,"cited_by_count":21},{"year":2024,"cited_by_count":5},{"year":2023,"cited_by_count":7}],"updated_date":"2026-04-14T08:04:32.555800","created_date":"2025-10-10T00:00:00"}
