{"id":"https://openalex.org/W6911884374","doi":"https://doi.org/10.5281/zenodo.14877363","title":"SpecMaskGIT: Masked Generative Modeling of Audio Spectrogram for Efficient Audio Synthesis and Beyond","display_name":"SpecMaskGIT: Masked Generative Modeling of Audio Spectrogram for Efficient Audio Synthesis and Beyond","publication_year":2024,"publication_date":"2024-11-10","ids":{"openalex":"https://openalex.org/W6911884374","doi":"https://doi.org/10.5281/zenodo.14877363"},"language":"en","primary_location":{"id":"doi:10.5281/zenodo.14877363","is_oa":true,"landing_page_url":"https://doi.org/10.5281/zenodo.14877363","pdf_url":null,"source":{"id":"https://openalex.org/S4306400562","display_name":"Zenodo (CERN European Organization for Nuclear Research)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I67311998","host_organization_name":"European Organization for Nuclear Research","host_organization_lineage":["https://openalex.org/I67311998"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":""},"type":"article","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.5281/zenodo.14877363","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Marco Comunit\u00e0","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Marco Comunit\u00e0","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Zhi Zhong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhi Zhong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Akira Takahashi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Akira Takahashi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Shiqi Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shiqi Yang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Mengjie Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mengjie Zhao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Koichi Saito","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Koichi Saito","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Yukara Ikemiya","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yukara Ikemiya","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Takashi Shibuya","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Takashi Shibuya","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Shusuke Takahashi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shusuke Takahashi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Yuki Mitsufuji","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuki Mitsufuji","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":10,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.35753967,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":true,"primary_topic":{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.2159000039100647,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.2159000039100647,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.20919999480247498,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.07249999791383743,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/spectrogram","display_name":"Spectrogram","score":0.6671000123023987},{"id":"https://openalex.org/keywords/discriminative-model","display_name":"Discriminative model","score":0.6552000045776367},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.6176000237464905},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.5442000031471252},{"id":"https://openalex.org/keywords/generative-model","display_name":"Generative model","score":0.5189999938011169},{"id":"https://openalex.org/keywords/audio-signal","display_name":"Audio signal","score":0.4846999943256378},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.43160000443458557},{"id":"https://openalex.org/keywords/digital-audio","display_name":"Digital audio","score":0.4189999997615814}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7960000038146973},{"id":"https://openalex.org/C45273575","wikidata":"https://www.wikidata.org/wiki/Q578970","display_name":"Spectrogram","level":2,"score":0.6671000123023987},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.6552000045776367},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.6176000237464905},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.5442000031471252},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.5189999938011169},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4970000088214874},{"id":"https://openalex.org/C64922751","wikidata":"https://www.wikidata.org/wiki/Q4650799","display_name":"Audio signal","level":3,"score":0.4846999943256378},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.43160000443458557},{"id":"https://openalex.org/C87687168","wikidata":"https://www.wikidata.org/wiki/Q173114","display_name":"Digital audio","level":4,"score":0.4189999997615814},{"id":"https://openalex.org/C127220857","wikidata":"https://www.wikidata.org/wiki/Q2719318","display_name":"Audio signal processing","level":4,"score":0.4099999964237213},{"id":"https://openalex.org/C160372630","wikidata":"https://www.wikidata.org/wiki/Q4819855","display_name":"Audio analyzer","level":5,"score":0.4092000126838684},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3804999887943268},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.36910000443458557},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.3273000121116638},{"id":"https://openalex.org/C204323151","wikidata":"https://www.wikidata.org/wiki/Q905424","display_name":"Range (aeronautics)","level":2,"score":0.3125999867916107},{"id":"https://openalex.org/C13895895","wikidata":"https://www.wikidata.org/wiki/Q3270773","display_name":"Speech coding","level":2,"score":0.304500013589859},{"id":"https://openalex.org/C2776257435","wikidata":"https://www.wikidata.org/wiki/Q1576430","display_name":"Bandwidth (computing)","level":2,"score":0.30219998955726624},{"id":"https://openalex.org/C150178126","wikidata":"https://www.wikidata.org/wiki/Q18433212","display_name":"Dynamic range compression","level":2,"score":0.2888000011444092},{"id":"https://openalex.org/C117619785","wikidata":"https://www.wikidata.org/wiki/Q6094414","display_name":"Iterative learning control","level":3,"score":0.27549999952316284},{"id":"https://openalex.org/C2778572836","wikidata":"https://www.wikidata.org/wiki/Q380933","display_name":"Space (punctuation)","level":2,"score":0.2563000023365021}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.5281/zenodo.14877363","is_oa":true,"landing_page_url":"https://doi.org/10.5281/zenodo.14877363","pdf_url":null,"source":{"id":"https://openalex.org/S4306400562","display_name":"Zenodo (CERN European Organization for Nuclear Research)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I67311998","host_organization_name":"European Organization for Nuclear Research","host_organization_lineage":["https://openalex.org/I67311998"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":""}],"best_oa_location":{"id":"doi:10.5281/zenodo.14877363","is_oa":true,"landing_page_url":"https://doi.org/10.5281/zenodo.14877363","pdf_url":null,"source":{"id":"https://openalex.org/S4306400562","display_name":"Zenodo (CERN European Organization for Nuclear Research)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I67311998","host_organization_name":"European Organization for Nuclear Research","host_organization_lineage":["https://openalex.org/I67311998"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":""},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/10","display_name":"Reduced inequalities","score":0.7083306908607483}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Recent":[0],"advances":[1],"in":[2,13,50,91,117],"generative":[3,78,169],"models":[4,116],"that":[5,188],"iteratively":[6],"synthesize":[7],"audio":[8,89,174,182,197],"clips":[9],"sparked":[10],"great":[11],"success":[12],"text-to-audio":[14],"synthesis":[15,23],"(TTA),":[16],"but":[17],"at":[18],"the":[19,35,45,51,76,193],"cost":[20],"of":[21,47,57,80,98,142,149,195],"slow":[22],"speed":[24],"and":[25,54,114,177],"heavy":[26],"computation.":[27],"Although":[28],"there":[29],"have":[30],"been":[31],"attempts":[32],"to":[33,44,171],"accelerate":[34],"iterative":[36,103],"procedure,":[37],"high-quality":[38],"TTA":[39,72,104,119],"systems":[40],"remain":[41],"inefficient":[42],"due":[43],"hundreds":[46],"iterations":[48],"required":[49],"inference":[52],"phase":[53],"large":[55],"amount":[56],"model":[58,73],"parameters.":[59],"To":[60],"address":[61],"these":[62],"challenges,":[63],"we":[64,164],"propose":[65],"SpecMaskGIT,":[66],"a":[67,85,107,118,134,139,146,168],"light-weight,":[68],"efficient":[69],"yet":[70],"effective":[71],"based":[74],"on":[75,159,180],"masked":[77,175,196],"modeling":[79,198],"spectrograms.":[81],"First,":[82],"SpecMaskGIT":[83,110,144,166],"synthesizes":[84],"realistic":[86],"10":[87],"s":[88],"clip":[90],"less":[92,100],"than":[93,101,155],"16":[94],"iterations,":[95],"an":[96],"order":[97],"magnitude":[99],"previous":[102,172],"methods.":[105],"As":[106],"discrete":[108],"model,":[109],"outperforms":[111],"larger":[112],"VQ-Diffusion":[113],"auto-regressive":[115],"benchmark,":[120],"while":[121],"being":[122],"real-time":[123],"with":[124,133],"only":[125],"4":[126],"CPU":[127],"cores":[128],"or":[129],"even":[130],"30\u00d7":[131],"faster":[132],"GPU.":[135],"Next,":[136],"built":[137,158],"upon":[138],"latent":[140,160],"space":[141],"Mel-spectrograms,":[143],"has":[145],"wider":[147],"range":[148],"applications":[150],"(e.g.,":[151],"zero-shot":[152],"bandwidth":[153],"extension)":[154],"similar":[156],"methods":[157],"wave":[161],"domains.":[162],"Moreover,":[163],"interpret":[165],"as":[167],"extension":[170],"discriminative":[173],"Transformers,":[176],"shed":[178],"light":[179],"its":[181],"representation":[183],"learning":[184],"potential.":[185],"We":[186],"hope":[187],"our":[189],"work":[190],"will":[191],"inspire":[192],"exploration":[194],"toward":[199],"further":[200],"diverse":[201],"scenarios.":[202]},"counts_by_year":[],"updated_date":"2025-11-06T06:51:31.235846","created_date":"2025-10-10T00:00:00"}
