{"id":"https://openalex.org/W7154207876","doi":"https://doi.org/10.48550/arxiv.2604.10905","title":"Audio Flamingo Next: Next-Generation Open Audio-Language Models for Speech, Sound, and Music","display_name":"Audio Flamingo Next: Next-Generation Open Audio-Language Models for Speech, Sound, and Music","publication_year":2026,"publication_date":"2026-04-13","ids":{"openalex":"https://openalex.org/W7154207876","doi":"https://doi.org/10.48550/arxiv.2604.10905"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.10905","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.10905","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.10905","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5133564754","display_name":"Sreyan Ghosh","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ghosh, Sreyan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5031240854","display_name":"Arushi Goel","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Goel, Arushi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5056483046","display_name":"K. S. Jayakumar","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jayakumar, Kaousheik","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133572864","display_name":"Lasha Koroshinadze","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Koroshinadze, Lasha","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5076391965","display_name":"Nishit Anand","orcid":"https://orcid.org/0000-0001-9391-1871"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Anand, Nishit","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5042062740","display_name":"Zhifeng Kong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kong, Zhifeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5006331697","display_name":"Siddharth Gururani","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gururani, Siddharth","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133595642","display_name":"Sang-gil Lee","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lee, Sang-gil","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5058721011","display_name":"Jaehyeon Kim","orcid":"https://orcid.org/0000-0001-9347-3680"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kim, Jaehyeon","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5099082856","display_name":"Aya Aljafari","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Aljafari, Aya","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133595244","display_name":"Chao-Han Huck Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Chao-Han Huck","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133556749","display_name":"Sungwon Kim","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kim, Sungwon","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5013222310","display_name":"Ramani Duraiswami","orcid":"https://orcid.org/0000-0002-5596-8460"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Duraiswami, Ramani","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5004194238","display_name":"Dinesh Manocha","orcid":"https://orcid.org/0000-0001-7047-9801"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Manocha, Dinesh","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133604446","display_name":"Mohammad Shoeybi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shoeybi, Mohammad","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133599911","display_name":"Bryan Catanzaro","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Catanzaro, Bryan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133617814","display_name":"Ming-Yu Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Ming-Yu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133577213","display_name":"Wei Ping","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ping, Wei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":18,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.6773999929428101,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.6773999929428101,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.07429999858140945,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.025599999353289604,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.5608999729156494},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.44690001010894775},{"id":"https://openalex.org/keywords/timestamp","display_name":"Timestamp","score":0.414000004529953},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.41290000081062317},{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.396699994802475},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.35679998993873596}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7307000160217285},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.5608999729156494},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.44690001010894775},{"id":"https://openalex.org/C113954288","wikidata":"https://www.wikidata.org/wiki/Q186885","display_name":"Timestamp","level":2,"score":0.414000004529953},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.41290000081062317},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4043000042438507},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.396699994802475},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.35679998993873596},{"id":"https://openalex.org/C127220857","wikidata":"https://www.wikidata.org/wiki/Q2719318","display_name":"Audio signal processing","level":4,"score":0.3515999913215637},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.35019999742507935},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.33239999413490295},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.3018999993801117},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2946999967098236},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.29190000891685486},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.2888999879360199},{"id":"https://openalex.org/C3017588708","wikidata":"https://www.wikidata.org/wiki/Q758901","display_name":"Audio visual","level":2,"score":0.2797999978065491},{"id":"https://openalex.org/C87687168","wikidata":"https://www.wikidata.org/wiki/Q173114","display_name":"Digital audio","level":4,"score":0.26010000705718994},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.2526000142097473}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.10905","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.10905","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.10905","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.10905","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Quality Education","score":0.5401307940483093,"id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"We":[0,130],"present":[1],"Audio":[2,16,33,83,118],"Flamingo":[3,17,34,119],"Next":[4],"(AF-Next),":[5],"the":[6,15,149],"next-generation":[7],"and":[8,23,29,61,72,80,104,128,133,147,154,167,176,194,199,205,216,225,234,245],"most":[9],"capable":[10],"large":[11,192],"audio-language":[12,42],"model":[13,43],"in":[14,97,125],"series,":[18],"designed":[19],"to":[20,32,77,95,121,143,219,230],"advance":[21],"understanding":[22,51,60,127,175],"reasoning":[24,62,87,93,177],"over":[25,139],"speech,":[26],"environmental":[27],"sounds":[28],"music.":[30],"Compared":[31],"3,":[35],"AF-Next":[36,157,185,211],"introduces:":[37],"(i)":[38],"a":[39,85,114,161],"stronger":[40],"foundational":[41],"that":[44,89,184],"significantly":[45],"improves":[46],"accuracy":[47],"across":[48,172],"diverse":[49],"audio":[50,59,74,126,174],"tasks;":[52],"(ii)":[53],"scalable":[54],"strategies":[55],"for":[56,70],"constructing":[57],"large-scale":[58,136],"data":[63],"beyond":[64],"existing":[65,150],"academic":[66],"benchmarks;":[67],"(iii)":[68],"support":[69],"long":[71,98],"complex":[73],"inputs":[75],"up":[76],"30":[78],"minutes;":[79],"(iv)":[81],"Temporal":[82],"Chain-of-Thought,":[84],"new":[86,135],"paradigm":[88],"explicitly":[90],"grounds":[91],"intermediate":[92],"steps":[94],"timestamps":[96],"audio,":[99],"enabling":[100],"fine-grained":[101],"temporal":[102],"alignment":[103],"improved":[105],"interpretability.":[106],"To":[107],"enable":[108],"these":[109,145],"capabilities,":[110],"we":[111,236],"first":[112],"conduct":[113],"systematic":[115],"analysis":[116],"of":[117,240],"3":[120,238],"identify":[122],"key":[123],"gaps":[124],"reasoning.":[129],"then":[131],"curate":[132],"scale":[134],"datasets":[137],"totaling":[138],"1":[140],"million":[141],"hours":[142],"address":[144],"limitations":[146],"expand":[148],"AudioSkills-XL,":[151],"LongAudio-XL,":[152],"AF-Think":[153],"AF-Chat":[155],"datasets.":[156],"is":[158],"trained":[159],"using":[160],"curriculum-based":[162],"strategy":[163],"spanning":[164],"pre-training,":[165],"mid-training":[166],"post-training":[168],"stages.":[169],"Extensive":[170],"experiments":[171],"20":[173],"benchmarks,":[178],"including":[179,242],"challenging":[180],"long-audio":[181],"tasks,":[182,221],"show":[183],"outperforms":[186],"similarly":[187],"sized":[188],"open":[189],"models":[190],"by":[191],"margins":[193],"remains":[195],"highly":[196],"competitive":[197],"with":[198],"sometimes":[200],"surpasses,":[201],"much":[202],"larger":[203],"open-weight":[204],"closed":[206],"models.":[207],"Beyond":[208],"benchmark":[209],"performance,":[210],"exhibits":[212],"strong":[213],"real-world":[214],"utility":[215],"transfers":[217],"well":[218],"unseen":[220],"highlighting":[222],"its":[223],"robustness":[224],"generalization":[226],"ability.":[227],"In":[228],"addition":[229],"all":[231],"data,":[232],"code":[233],"methods,":[235],"open-source":[237],"variants":[239],"AF-Next,":[241],"AF-Next-Instruct,":[243],"AF-Next-Think":[244],"AF-Next-Captioner.":[246]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-15T00:00:00"}
