{"id":"https://openalex.org/W7160522012","doi":"https://doi.org/10.48550/arxiv.2605.04547","title":"Stage-adaptive audio diffusion modeling","display_name":"Stage-adaptive audio diffusion modeling","publication_year":2026,"publication_date":"2026-05-06","ids":{"openalex":"https://openalex.org/W7160522012","doi":"https://doi.org/10.48550/arxiv.2605.04547"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.04547","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.04547","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.04547","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5135565143","display_name":"Xuanhao Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zhang, Xuanhao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5135553271","display_name":"Chang Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Chang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5135565143"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10283","display_name":"Hearing Loss and Rehabilitation","score":0.3700000047683716,"subfield":{"id":"https://openalex.org/subfields/2805","display_name":"Cognitive Neuroscience"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},"topics":[{"id":"https://openalex.org/T10283","display_name":"Hearing Loss and Rehabilitation","score":0.3700000047683716,"subfield":{"id":"https://openalex.org/subfields/2805","display_name":"Cognitive Neuroscience"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.1606999933719635,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.1273999959230423,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/inefficiency","display_name":"Inefficiency","score":0.6147000193595886},{"id":"https://openalex.org/keywords/convergence","display_name":"Convergence (economics)","score":0.4343000054359436},{"id":"https://openalex.org/keywords/regularization","display_name":"Regularization (linguistics)","score":0.3962000012397766},{"id":"https://openalex.org/keywords/audio-signal-processing","display_name":"Audio signal processing","score":0.3873000144958496},{"id":"https://openalex.org/keywords/emphasis","display_name":"Emphasis (telecommunications)","score":0.3684999942779541},{"id":"https://openalex.org/keywords/sampling","display_name":"Sampling (signal processing)","score":0.36399999260902405},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.3456999957561493}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.72079998254776},{"id":"https://openalex.org/C2778869765","wikidata":"https://www.wikidata.org/wiki/Q6028363","display_name":"Inefficiency","level":2,"score":0.6147000193595886},{"id":"https://openalex.org/C2777303404","wikidata":"https://www.wikidata.org/wiki/Q759757","display_name":"Convergence (economics)","level":2,"score":0.4343000054359436},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.42750000953674316},{"id":"https://openalex.org/C2776135515","wikidata":"https://www.wikidata.org/wiki/Q17143721","display_name":"Regularization (linguistics)","level":2,"score":0.3962000012397766},{"id":"https://openalex.org/C127220857","wikidata":"https://www.wikidata.org/wiki/Q2719318","display_name":"Audio signal processing","level":4,"score":0.3873000144958496},{"id":"https://openalex.org/C177454536","wikidata":"https://www.wikidata.org/wiki/Q578290","display_name":"Emphasis (telecommunications)","level":2,"score":0.3684999942779541},{"id":"https://openalex.org/C140779682","wikidata":"https://www.wikidata.org/wiki/Q210868","display_name":"Sampling (signal processing)","level":3,"score":0.36399999260902405},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.3456999957561493},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.32600000500679016},{"id":"https://openalex.org/C64922751","wikidata":"https://www.wikidata.org/wiki/Q4650799","display_name":"Audio signal","level":3,"score":0.30979999899864197},{"id":"https://openalex.org/C2780598303","wikidata":"https://www.wikidata.org/wiki/Q65921492","display_name":"Flexibility (engineering)","level":2,"score":0.29339998960494995},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.28519999980926514},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.2750999927520752},{"id":"https://openalex.org/C75165309","wikidata":"https://www.wikidata.org/wiki/Q2258979","display_name":"Search engine indexing","level":2,"score":0.2671000063419342},{"id":"https://openalex.org/C4069607","wikidata":"https://www.wikidata.org/wiki/Q868732","display_name":"Aliasing","level":3,"score":0.2612000107765198},{"id":"https://openalex.org/C137836250","wikidata":"https://www.wikidata.org/wiki/Q984063","display_name":"Optimization problem","level":2,"score":0.259799987077713},{"id":"https://openalex.org/C2776459999","wikidata":"https://www.wikidata.org/wiki/Q2119376","display_name":"Fidelity","level":2,"score":0.2547000050544739}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.04547","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.04547","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.04547","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.04547","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Recent":[0],"progress":[1,124],"in":[2,65,160],"diffusion-based":[3],"audio":[4,18,25,169,206],"generation":[5,19,170,190],"and":[6,20,31,72,85,98,152,171,184,191,217],"restoration":[7],"has":[8],"substantially":[9],"improved":[10],"performance":[11],"across":[12],"heterogeneous":[13],"conditioning":[14],"regimes,":[15],"including":[16],"text-conditioned":[17,168],"audio-conditioned":[21,172],"super-resolution.":[22,173],"However,":[23],"training":[24,47,76,91,208],"diffusion":[26,207],"models":[27],"remains":[28],"computationally":[29],"expensive,":[30],"most":[32],"existing":[33],"pipelines":[34],"still":[35],"rely":[36],"on":[37,80,128,167,187],"static":[38,197],"optimization":[39,218],"recipes":[40],"that":[41,58,204],"treat":[42],"the":[43,66,114,149,177,188,202],"relative":[44],"importance":[45],"of":[46,62,117],"signals":[48],"as":[49,220],"fixed":[50,225],"throughout":[51],"learning.":[52],"In":[53],"this":[54,103,129],"work,":[55],"we":[56,106,131],"argue":[57],"a":[59,108],"major":[60],"source":[61],"inefficiency":[63],"lies":[64],"evolving":[67,104],"balance":[68],"between":[69],"semantic":[70,83,123,142],"acquisition":[71],"generation-oriented":[73],"refinement.":[74,100],"Early":[75],"places":[77],"stronger":[78],"emphasis":[79,219],"acquiring":[81],"condition-aligned":[82],"structure":[84],"coarse":[86],"global":[87],"organization,":[88,216],"whereas":[89],"later":[90],"increasingly":[92],"emphasizes":[93],"temporal":[94],"consistency,":[95],"perceptual":[96],"fidelity,":[97],"fine-detail":[99],"To":[101],"characterize":[102],"balance,":[105],"introduce":[107],"progress-based":[109],"regime":[110,150],"variable":[111],"derived":[112],"from":[113,156,211],"training-time":[115],"slope":[116],"an":[118],"SSL-space":[119],"discrepancy,":[120],"which":[121],"measures":[122],"during":[125],"training.":[126],"Based":[127],"signal,":[130],"develop":[132],"three":[133],"complementary":[134],"stage-aware":[135,179],"mechanisms:":[136],"decayed":[137],"SSL":[138],"guidance":[139],"for":[140],"early":[141],"bootstrapping,":[143],"self-adaptive":[144],"timestep":[145],"sampling":[146],"driven":[147],"by":[148],"variable,":[151],"structure-aware":[153],"regularization":[154],"activated":[155],"convergent":[157],"grouped":[158],"organization":[159],"parameter":[161],"space.":[162],"We":[163],"evaluate":[164],"these":[165],"mechanisms":[166],"Across":[174],"both":[175],"settings,":[176],"proposed":[178],"strategies":[180],"improve":[181],"convergence":[182],"behavior":[183],"yield":[185],"gains":[186],"primary":[189],"spectral":[192],"reconstruction":[193],"metrics":[194],"over":[195],"standard":[196],"baselines.":[198],"These":[199],"results":[200],"support":[201],"view":[203],"efficient":[205],"can":[209],"benefit":[210],"treating":[212],"external":[213],"guidance,":[214],"internal":[215],"stage-dependent":[221],"components":[222],"rather":[223],"than":[224],"ingredients.":[226]},"counts_by_year":[],"updated_date":"2026-05-08T13:18:25.657630","created_date":"2026-05-08T00:00:00"}
