{"id":"https://openalex.org/W7154421540","doi":"https://doi.org/10.48550/arxiv.2604.10708","title":"Audio-Omni: Extending Multi-modal Understanding to Versatile Audio Generation and Editing","display_name":"Audio-Omni: Extending Multi-modal Understanding to Versatile Audio Generation and Editing","publication_year":2026,"publication_date":"2026-04-12","ids":{"openalex":"https://openalex.org/W7154421540","doi":"https://doi.org/10.48550/arxiv.2604.10708"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.10708","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.10708","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.10708","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5133570849","display_name":"Zeyue Tian","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tian, Zeyue","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133578998","display_name":"Binxin Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Binxin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133586747","display_name":"Zhaoyang Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Zhaoyang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5051262853","display_name":"J. Zhang","orcid":"https://orcid.org/0009-0008-0999-1046"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Jiexuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133587376","display_name":"Ruibin Yuan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuan, Ruibin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133617216","display_name":"Hubery Yin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yin, Hubery","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133615940","display_name":"Qifeng Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Qifeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133564315","display_name":"Chen Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Chen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5071577123","display_name":"Jing Lv","orcid":"https://orcid.org/0009-0009-7962-8741"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lyu, Jing","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133620963","display_name":"Wei Xue","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xue, Wei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133572583","display_name":"Yike Guo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guo, Yike","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":11,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.243599995970726,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.243599995970726,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.18569999933242798,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.10530000180006027,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/suite","display_name":"Suite","score":0.6712999939918518},{"id":"https://openalex.org/keywords/construct","display_name":"Construct (python library)","score":0.4814999997615814},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.4471000134944916},{"id":"https://openalex.org/keywords/image-editing","display_name":"Image editing","score":0.39469999074935913},{"id":"https://openalex.org/keywords/architecture","display_name":"Architecture","score":0.36660000681877136},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.3564000129699707}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8422999978065491},{"id":"https://openalex.org/C79581498","wikidata":"https://www.wikidata.org/wiki/Q1367530","display_name":"Suite","level":2,"score":0.6712999939918518},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.4814999997615814},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.4471000134944916},{"id":"https://openalex.org/C2776674983","wikidata":"https://www.wikidata.org/wiki/Q545981","display_name":"Image editing","level":3,"score":0.39469999074935913},{"id":"https://openalex.org/C123657996","wikidata":"https://www.wikidata.org/wiki/Q12271","display_name":"Architecture","level":2,"score":0.36660000681877136},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.3564000129699707},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3495999872684479},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.31949999928474426},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3070000112056732},{"id":"https://openalex.org/C2777655017","wikidata":"https://www.wikidata.org/wiki/Q1501161","display_name":"Toolbox","level":2,"score":0.2734000086784363},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.2524000108242035}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.10708","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.10708","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.10708","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.10708","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.587171196937561}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Recent":[0],"progress":[1],"in":[2,9,111],"multimodal":[3],"models":[4],"has":[5],"spurred":[6],"rapid":[7],"advances":[8],"audio":[10,47,112,178,187],"understanding,":[11],"generation,":[12,50,170,172,179],"and":[13,49,71,77,173,192],"editing.":[14],"However,":[15],"these":[16],"capabilities":[17],"are":[18],"typically":[19],"addressed":[20],"by":[21],"specialized":[22,155],"models,":[23],"leaving":[24],"the":[25,64,107],"development":[26],"of":[27,140],"a":[28,88,98,117,138,181],"truly":[29],"unified":[30,144],"framework":[31,67],"that":[32,132],"can":[33],"seamlessly":[34],"integrate":[35],"all":[36],"three":[37],"tasks":[38],"underexplored.":[39],"While":[40],"some":[41],"pioneering":[42],"works":[43],"have":[44],"explored":[45],"unifying":[46],"understanding":[48,83],"they":[51],"often":[52],"remain":[53],"confined":[54],"to":[55,68,154],"specific":[56],"domains.":[57],"To":[58,105],"address":[59],"this,":[60],"we":[61,114],"introduce":[62],"Audio-Omni,":[63],"first":[65],"end-to-end":[66],"unify":[69],"generation":[70],"editing":[72,127],"across":[73,137],"general":[74],"sound,":[75],"music,":[76],"speech":[78],"domains,":[79],"with":[80,97,151],"integrated":[81],"multi-modal":[82],"capabilities.":[84],"Our":[85],"architecture":[86],"synergizes":[87],"frozen":[89],"Multimodal":[90],"Large":[91],"Language":[92],"Model":[93],"for":[94,102,177],"high-level":[95],"reasoning":[96,169],"trainable":[99],"Diffusion":[100],"Transformer":[101],"high-fidelity":[103],"synthesis.":[104],"overcome":[106],"critical":[108],"data":[109],"scarcity":[110],"editing,":[113],"construct":[115],"AudioEdit,":[116],"new":[118],"large-scale":[119],"dataset":[120,193],"comprising":[121],"over":[122],"one":[123],"million":[124],"meticulously":[125],"curated":[126],"pairs.":[128],"Extensive":[129],"experiments":[130],"demonstrate":[131],"Audio-Omni":[133,162],"achieves":[134],"state-of-the-art":[135],"performance":[136,148],"suite":[139],"benchmarks,":[141],"outperforming":[142],"prior":[143],"approaches":[145],"while":[146],"achieving":[147],"on":[149,198],"par":[150],"or":[152],"superior":[153],"expert":[156],"models.":[157],"Beyond":[158],"its":[159],"core":[160],"capabilities,":[161,166],"exhibits":[163],"remarkable":[164],"inherited":[165],"including":[167],"knowledge-augmented":[168],"in-context":[171],"zero-shot":[174],"cross-lingual":[175],"control":[176],"highlighting":[180],"promising":[182],"direction":[183],"toward":[184],"universal":[185],"generative":[186],"intelligence.":[188],"The":[189],"code,":[190],"model,":[191],"will":[194],"be":[195],"publicly":[196],"released":[197],"https://zeyuet.github.io/Audio-Omni.":[199]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-15T00:00:00"}
