{"id":"https://openalex.org/W4394862844","doi":"https://doi.org/10.1109/taslp.2024.3389636","title":"Masked Modeling Duo: Towards a Universal Audio Pre-Training Framework","display_name":"Masked Modeling Duo: Towards a Universal Audio Pre-Training Framework","publication_year":2024,"publication_date":"2024-01-01","ids":{"openalex":"https://openalex.org/W4394862844","doi":"https://doi.org/10.1109/taslp.2024.3389636"},"language":"en","primary_location":{"id":"doi:10.1109/taslp.2024.3389636","is_oa":true,"landing_page_url":"https://doi.org/10.1109/taslp.2024.3389636","pdf_url":"https://ieeexplore.ieee.org/ielx7/6570655/6633080/10502167.pdf","source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":"public-domain","license_id":"https://openalex.org/licenses/public-domain","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://ieeexplore.ieee.org/ielx7/6570655/6633080/10502167.pdf","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5091219538","display_name":"Daisuke Niizumi","orcid":"https://orcid.org/0000-0002-5063-0508"},"institutions":[{"id":"https://openalex.org/I2251713219","display_name":"NTT (Japan)","ror":"https://ror.org/00berct97","country_code":"JP","type":"company","lineage":["https://openalex.org/I2251713219"]}],"countries":["JP"],"is_corresponding":true,"raw_author_name":"Daisuke Niizumi","raw_affiliation_strings":["Communication Science Laboratories, Nippon Telegraph and Telephone Corporation, Atsugi, Japan"],"affiliations":[{"raw_affiliation_string":"Communication Science Laboratories, Nippon Telegraph and Telephone Corporation, Atsugi, Japan","institution_ids":["https://openalex.org/I2251713219"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103978078","display_name":"Daiki Takeuchi","orcid":null},"institutions":[{"id":"https://openalex.org/I2251713219","display_name":"NTT (Japan)","ror":"https://ror.org/00berct97","country_code":"JP","type":"company","lineage":["https://openalex.org/I2251713219"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Daiki Takeuchi","raw_affiliation_strings":["Communication Science Laboratories, Nippon Telegraph and Telephone Corporation, Atsugi, Japan"],"affiliations":[{"raw_affiliation_string":"Communication Science Laboratories, Nippon Telegraph and Telephone Corporation, Atsugi, Japan","institution_ids":["https://openalex.org/I2251713219"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5062509967","display_name":"Yasunori Ohishi","orcid":"https://orcid.org/0000-0002-7856-248X"},"institutions":[{"id":"https://openalex.org/I2251713219","display_name":"NTT (Japan)","ror":"https://ror.org/00berct97","country_code":"JP","type":"company","lineage":["https://openalex.org/I2251713219"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Yasunori Ohishi","raw_affiliation_strings":["Communication Science Laboratories, Nippon Telegraph and Telephone Corporation, Atsugi, Japan"],"affiliations":[{"raw_affiliation_string":"Communication Science Laboratories, Nippon Telegraph and Telephone Corporation, Atsugi, Japan","institution_ids":["https://openalex.org/I2251713219"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5054467679","display_name":"Noboru Harada","orcid":"https://orcid.org/0000-0002-1759-4533"},"institutions":[{"id":"https://openalex.org/I2251713219","display_name":"NTT (Japan)","ror":"https://ror.org/00berct97","country_code":"JP","type":"company","lineage":["https://openalex.org/I2251713219"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Noboru Harada","raw_affiliation_strings":["Communication Science Laboratories, Nippon Telegraph and Telephone Corporation, Atsugi, Japan"],"affiliations":[{"raw_affiliation_string":"Communication Science Laboratories, Nippon Telegraph and Telephone Corporation, Atsugi, Japan","institution_ids":["https://openalex.org/I2251713219"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5061465935","display_name":"Kunio Kashino","orcid":null},"institutions":[{"id":"https://openalex.org/I2251713219","display_name":"NTT (Japan)","ror":"https://ror.org/00berct97","country_code":"JP","type":"company","lineage":["https://openalex.org/I2251713219"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Kunio Kashino","raw_affiliation_strings":["Communication Science Laboratories, Nippon Telegraph and Telephone Corporation, Atsugi, Japan"],"affiliations":[{"raw_affiliation_string":"Communication Science Laboratories, Nippon Telegraph and Telephone Corporation, Atsugi, Japan","institution_ids":["https://openalex.org/I2251713219"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5091219538"],"corresponding_institution_ids":["https://openalex.org/I2251713219"],"apc_list":null,"apc_paid":null,"fwci":5.9299,"has_fulltext":true,"cited_by_count":24,"citation_normalized_percentile":{"value":0.97184509,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":97,"max":100},"biblio":{"volume":"32","issue":null,"first_page":"2391","last_page":"2406"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.7099999785423279,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.7099999785423279,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.561976969242096},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.5371443033218384},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.4410054683685303},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.36728158593177795},{"id":"https://openalex.org/keywords/physics","display_name":"Physics","score":0.0558130145072937}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.561976969242096},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.5371443033218384},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4410054683685303},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.36728158593177795},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0558130145072937},{"id":"https://openalex.org/C153294291","wikidata":"https://www.wikidata.org/wiki/Q25261","display_name":"Meteorology","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/taslp.2024.3389636","is_oa":true,"landing_page_url":"https://doi.org/10.1109/taslp.2024.3389636","pdf_url":"https://ieeexplore.ieee.org/ielx7/6570655/6633080/10502167.pdf","source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":"public-domain","license_id":"https://openalex.org/licenses/public-domain","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1109/taslp.2024.3389636","is_oa":true,"landing_page_url":"https://doi.org/10.1109/taslp.2024.3389636","pdf_url":"https://ieeexplore.ieee.org/ielx7/6570655/6633080/10502167.pdf","source":{"id":"https://openalex.org/S4210169297","display_name":"IEEE/ACM Transactions on Audio Speech and Language Processing","issn_l":"2329-9290","issn":["2329-9290","2329-9304"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":"public-domain","license_id":"https://openalex.org/licenses/public-domain","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE/ACM Transactions on Audio, Speech, and Language Processing","raw_type":"journal-article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/5","display_name":"Gender equality","score":0.5199999809265137}],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4394862844.pdf","grobid_xml":"https://content.openalex.org/works/W4394862844.grobid-xml"},"referenced_works_count":85,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W2030931454","https://openalex.org/W2038484192","https://openalex.org/W2052666245","https://openalex.org/W2108598243","https://openalex.org/W2133824856","https://openalex.org/W2593116425","https://openalex.org/W2726515241","https://openalex.org/W2797583228","https://openalex.org/W2896457183","https://openalex.org/W2912223386","https://openalex.org/W2936774411","https://openalex.org/W2962904371","https://openalex.org/W2965373594","https://openalex.org/W2980708516","https://openalex.org/W2982223350","https://openalex.org/W3041561163","https://openalex.org/W3094550259","https://openalex.org/W3115729981","https://openalex.org/W3159239022","https://openalex.org/W3171007011","https://openalex.org/W3196974791","https://openalex.org/W3197580070","https://openalex.org/W3198429080","https://openalex.org/W3201143670","https://openalex.org/W3202667537","https://openalex.org/W3203140070","https://openalex.org/W3204696009","https://openalex.org/W3205475937","https://openalex.org/W3206996142","https://openalex.org/W3209059054","https://openalex.org/W3209376089","https://openalex.org/W3209984917","https://openalex.org/W3214556263","https://openalex.org/W4205689591","https://openalex.org/W4224924125","https://openalex.org/W4225713393","https://openalex.org/W4226442948","https://openalex.org/W4281393357","https://openalex.org/W4293370787","https://openalex.org/W4294891479","https://openalex.org/W4297841518","https://openalex.org/W4297841853","https://openalex.org/W4300957348","https://openalex.org/W4309398205","https://openalex.org/W4310873011","https://openalex.org/W4312891522","https://openalex.org/W4312939067","https://openalex.org/W4313156423","https://openalex.org/W4313162486","https://openalex.org/W4317438080","https://openalex.org/W4319862404","https://openalex.org/W4372260141","https://openalex.org/W4372266902","https://openalex.org/W4372341324","https://openalex.org/W4372346788","https://openalex.org/W4375869301","https://openalex.org/W4375869340","https://openalex.org/W4385208579","https://openalex.org/W4385764089","https://openalex.org/W4385822870","https://openalex.org/W4385823002","https://openalex.org/W4385823092","https://openalex.org/W4386072059","https://openalex.org/W4386076385","https://openalex.org/W4386221015","https://openalex.org/W6726497184","https://openalex.org/W6736723571","https://openalex.org/W6745057489","https://openalex.org/W6745136726","https://openalex.org/W6750665317","https://openalex.org/W6766673545","https://openalex.org/W6776700526","https://openalex.org/W6780218876","https://openalex.org/W6784333009","https://openalex.org/W6785932716","https://openalex.org/W6810007534","https://openalex.org/W6810405975","https://openalex.org/W6810510949","https://openalex.org/W6810784583","https://openalex.org/W6840200333","https://openalex.org/W6845692261","https://openalex.org/W6846877914","https://openalex.org/W6848208918","https://openalex.org/W6849283408"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2748952813","https://openalex.org/W230091440","https://openalex.org/W2390279801","https://openalex.org/W2233261550","https://openalex.org/W2358668433","https://openalex.org/W2810751659","https://openalex.org/W258997015","https://openalex.org/W2376932109","https://openalex.org/W2001405890"],"abstract_inverted_index":{"Self-supervised":[0],"learning":[1],"(SSL)":[2],"using":[3,215],"masked":[4,23,32,52],"prediction":[5,24],"has":[6,100],"made":[7],"great":[8],"strides":[9],"in":[10,58,80,91,97,106],"general-purpose":[11,67,192],"audio":[12,68,221],"representation.":[13],"This":[14],"study":[15],"proposes":[16],"Masked":[17],"Modeling":[18],"Duo":[19],"(M2D),":[20],"an":[21,127,135],"improved":[22],"SSL,":[25],"which":[26,116],"learns":[27,131],"by":[28,48],"predicting":[29],"representations":[30,125,190],"of":[31,123,214],"input":[33],"signals":[34],"that":[35,105,166,188],"serve":[36,149,181],"as":[37,79,218],"training":[38,46],"signals.":[39],"Unlike":[40],"conventional":[41],"methods,":[42],"M2D":[43,59,65,112,118,133],"obtains":[44],"a":[45,70,101,163,177,204,219],"signal":[47],"encoding":[49],"only":[50],"the":[51,55,62,121,144,153,189,196,212],"part,":[53],"encouraging":[54],"two":[56],"networks":[57],"to":[60,119,148,180],"model":[61],"input.":[63],"While":[64],"improves":[66],"representations,":[69],"specialized":[71,124,179,194],"representation":[72,168,178],"is":[73,94,226],"essential":[74],"for":[75,113,126,191,195,229],"real-world":[76],"applications,":[77,151],"such":[78,92],"industrial":[81],"and":[82,88,99,134,138,161,200,203],"medical":[83,206],"domains.":[84],"The":[85],"often":[86],"confidential":[87],"proprietary":[89],"data":[90,160],"domains":[93],"typically":[95],"limited":[96],"size":[98],"different":[102],"distribution":[103],"from":[104,132],"pre-training":[107,122,222],"datasets.":[108],"Therefore,":[109],"we":[110],"propose":[111],"X":[114],"(M2D-X),":[115],"extends":[117],"enable":[120],"application":[128,183],"X.":[129],"M2D-X":[130,174],"additional":[136,145],"task":[137,146,165,207],"inputs":[139],"background":[140,154],"noise.":[141],"We":[142],"make":[143],"configurable":[147],"diverse":[150],"while":[152],"noise":[155],"helps":[156],"learn":[157,176],"on":[158],"small":[159],"forms":[162],"denoising":[164],"makes":[167],"robust.":[169],"With":[170],"these":[171],"design":[172],"choices,":[173],"should":[175],"various":[182],"needs.":[184],"Our":[185,224],"experiments":[186],"confirmed":[187],"audio,":[193],"highly":[197],"competitive":[198],"AudioSet":[199],"speech":[201],"domain,":[202],"small-data":[205],"achieve":[208],"top-level":[209],"performance,":[210],"demonstrating":[211],"potential":[213],"our":[216],"models":[217],"universal":[220],"framework.":[223],"code":[225],"available":[227],"online":[228],"future":[230],"studies.":[231]},"counts_by_year":[{"year":2025,"cited_by_count":20},{"year":2024,"cited_by_count":4}],"updated_date":"2026-04-03T22:45:19.894376","created_date":"2025-10-10T00:00:00"}
