{"id":"https://openalex.org/W3205550549","doi":"https://doi.org/10.1109/icassp43922.2022.9746490","title":"Conformer-Based Self-Supervised Learning For Non-Speech Audio Tasks","display_name":"Conformer-Based Self-Supervised Learning For Non-Speech Audio Tasks","publication_year":2022,"publication_date":"2022-04-27","ids":{"openalex":"https://openalex.org/W3205550549","doi":"https://doi.org/10.1109/icassp43922.2022.9746490","mag":"3205550549"},"language":"en","primary_location":{"id":"doi:10.1109/icassp43922.2022.9746490","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp43922.2022.9746490","pdf_url":null,"source":{"id":"https://openalex.org/S4363607702","display_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5083384925","display_name":"Sangeeta Srivastava","orcid":"https://orcid.org/0000-0001-7614-8886"},"institutions":[{"id":"https://openalex.org/I52357470","display_name":"The Ohio State University","ror":"https://ror.org/00rs6vg23","country_code":"US","type":"education","lineage":["https://openalex.org/I52357470"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Sangeeta Srivastava","raw_affiliation_strings":["The Ohio State University,USA","The Ohio State University, USA"],"affiliations":[{"raw_affiliation_string":"The Ohio State University,USA","institution_ids":["https://openalex.org/I52357470"]},{"raw_affiliation_string":"The Ohio State University, USA","institution_ids":["https://openalex.org/I52357470"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100377647","display_name":"Yun Wang","orcid":"https://orcid.org/0000-0002-9732-9245"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yun Wang","raw_affiliation_strings":["Meta AI,USA","Meta AI, USA"],"affiliations":[{"raw_affiliation_string":"Meta AI,USA","institution_ids":[]},{"raw_affiliation_string":"Meta AI, USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5038296765","display_name":"Andros Tjandra","orcid":"https://orcid.org/0000-0003-1246-5908"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Andros Tjandra","raw_affiliation_strings":["Meta AI,USA","Meta AI, USA"],"affiliations":[{"raw_affiliation_string":"Meta AI,USA","institution_ids":[]},{"raw_affiliation_string":"Meta AI, USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5080751032","display_name":"Anurag Kumar","orcid":"https://orcid.org/0000-0002-1164-144X"},"institutions":[{"id":"https://openalex.org/I4210140943","display_name":"REALITY Publishing (United States)","ror":"https://ror.org/04523n645","country_code":"US","type":"company","lineage":["https://openalex.org/I4210140943"]},{"id":"https://openalex.org/I4210128585","display_name":"META Health","ror":"https://ror.org/035h67p10","country_code":"US","type":"other","lineage":["https://openalex.org/I4210128585"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Anurag Kumar","raw_affiliation_strings":["Reality Labs Research,USA","Reality Labs Research, USA"],"affiliations":[{"raw_affiliation_string":"Reality Labs Research,USA","institution_ids":["https://openalex.org/I4210128585","https://openalex.org/I4210140943"]},{"raw_affiliation_string":"Reality Labs Research, USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5069296252","display_name":"Chunxi Liu","orcid":"https://orcid.org/0000-0001-5441-9374"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chunxi Liu","raw_affiliation_strings":["Meta AI,USA","Meta AI, USA"],"affiliations":[{"raw_affiliation_string":"Meta AI,USA","institution_ids":[]},{"raw_affiliation_string":"Meta AI, USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5091502081","display_name":"Kritika Singh","orcid":"https://orcid.org/0000-0002-6637-1571"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kritika Singh","raw_affiliation_strings":["Meta AI,USA","Meta AI, USA"],"affiliations":[{"raw_affiliation_string":"Meta AI,USA","institution_ids":[]},{"raw_affiliation_string":"Meta AI, USA","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5051165898","display_name":"Yatharth Saraf","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yatharth Saraf","raw_affiliation_strings":["Meta AI,USA","Meta AI, USA"],"affiliations":[{"raw_affiliation_string":"Meta AI,USA","institution_ids":[]},{"raw_affiliation_string":"Meta AI, USA","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5083384925"],"corresponding_institution_ids":["https://openalex.org/I52357470"],"apc_list":null,"apc_paid":null,"fwci":1.2261,"has_fulltext":false,"cited_by_count":11,"citation_normalized_percentile":{"value":0.80793451,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":94,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"8862","last_page":"8866"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9988999962806702,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.796068549156189},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.7187864184379578},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.60239577293396},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.5532745122909546},{"id":"https://openalex.org/keywords/supervised-learning","display_name":"Supervised learning","score":0.5433775186538696},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.5185323357582092},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.48064103722572327},{"id":"https://openalex.org/keywords/semi-supervised-learning","display_name":"Semi-supervised learning","score":0.4591467082500458},{"id":"https://openalex.org/keywords/variety","display_name":"Variety (cybernetics)","score":0.4368097186088562},{"id":"https://openalex.org/keywords/feature-learning","display_name":"Feature learning","score":0.42537981271743774},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.3380184471607208},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.26846843957901}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.796068549156189},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.7187864184379578},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.60239577293396},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.5532745122909546},{"id":"https://openalex.org/C136389625","wikidata":"https://www.wikidata.org/wiki/Q334384","display_name":"Supervised learning","level":3,"score":0.5433775186538696},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.5185323357582092},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.48064103722572327},{"id":"https://openalex.org/C58973888","wikidata":"https://www.wikidata.org/wiki/Q1041418","display_name":"Semi-supervised learning","level":2,"score":0.4591467082500458},{"id":"https://openalex.org/C136197465","wikidata":"https://www.wikidata.org/wiki/Q1729295","display_name":"Variety (cybernetics)","level":2,"score":0.4368097186088562},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.42537981271743774},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3380184471607208},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.26846843957901},{"id":"https://openalex.org/C94625758","wikidata":"https://www.wikidata.org/wiki/Q7163","display_name":"Politics","level":2,"score":0.0},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.0},{"id":"https://openalex.org/C13280743","wikidata":"https://www.wikidata.org/wiki/Q131089","display_name":"Geodesy","level":1,"score":0.0},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp43922.2022.9746490","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp43922.2022.9746490","pdf_url":null,"source":{"id":"https://openalex.org/S4363607702","display_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.6899999976158142,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":70,"referenced_works":["https://openalex.org/W1522301498","https://openalex.org/W2052666245","https://openalex.org/W2127870748","https://openalex.org/W2593116425","https://openalex.org/W2619947201","https://openalex.org/W2765407302","https://openalex.org/W2767754137","https://openalex.org/W2883935097","https://openalex.org/W2896457183","https://openalex.org/W2936774411","https://openalex.org/W2948982921","https://openalex.org/W2963341956","https://openalex.org/W2963610932","https://openalex.org/W2964121744","https://openalex.org/W2982343573","https://openalex.org/W3005680577","https://openalex.org/W3015817524","https://openalex.org/W3025165719","https://openalex.org/W3034886385","https://openalex.org/W3034978746","https://openalex.org/W3036601975","https://openalex.org/W3037309139","https://openalex.org/W3040498734","https://openalex.org/W3093579165","https://openalex.org/W3094550259","https://openalex.org/W3095727342","https://openalex.org/W3097777922","https://openalex.org/W3099782249","https://openalex.org/W3100177202","https://openalex.org/W3112616666","https://openalex.org/W3126565544","https://openalex.org/W3134486096","https://openalex.org/W3139211892","https://openalex.org/W3146639881","https://openalex.org/W3154596443","https://openalex.org/W3157916917","https://openalex.org/W3158504903","https://openalex.org/W3160766462","https://openalex.org/W3164279099","https://openalex.org/W3166396011","https://openalex.org/W3170837227","https://openalex.org/W3180180466","https://openalex.org/W3186781156","https://openalex.org/W3196974791","https://openalex.org/W3198275944","https://openalex.org/W3198882010","https://openalex.org/W4286582832","https://openalex.org/W4295723153","https://openalex.org/W6631190155","https://openalex.org/W6678969435","https://openalex.org/W6734260513","https://openalex.org/W6745136726","https://openalex.org/W6752516136","https://openalex.org/W6755207826","https://openalex.org/W6774314701","https://openalex.org/W6779503413","https://openalex.org/W6780218876","https://openalex.org/W6780294235","https://openalex.org/W6784614252","https://openalex.org/W6787335539","https://openalex.org/W6790117948","https://openalex.org/W6791353385","https://openalex.org/W6791429434","https://openalex.org/W6791537541","https://openalex.org/W6793728465","https://openalex.org/W6793736971","https://openalex.org/W6798422254","https://openalex.org/W6799303324","https://openalex.org/W6840046036","https://openalex.org/W6955071965"],"related_works":["https://openalex.org/W122912556","https://openalex.org/W1586607209","https://openalex.org/W4312414840","https://openalex.org/W2621411691","https://openalex.org/W2271357838","https://openalex.org/W2556866732","https://openalex.org/W2348322200","https://openalex.org/W2981952041","https://openalex.org/W2328989934","https://openalex.org/W2515319207"],"abstract_inverted_index":{"Representation":[0],"learning":[1,18,35,49,75],"from":[2],"unlabeled":[3],"data":[4,92],"has":[5,19,70],"been":[6,20],"of":[7,57,107,130],"major":[8],"interest":[9],"in":[10,22,73,134],"artificial":[11],"intelligence":[12],"research.":[13],"While":[14],"self-supervised":[15,46,74,84,119],"speech":[16,24,77],"representation":[17,34,48],"popular":[21],"the":[23,64,88,96,128,145],"research":[25],"community,":[26],"very":[27],"few":[28],"works":[29],"have":[30],"comprehensively":[31],"analyzed":[32],"audio":[33,38,47,60],"for":[36,76,90,149],"non-speech":[37,59],"tasks.":[39,61,141],"In":[40],"this":[41,115],"paper,":[42],"we":[43,99],"propose":[44],"a":[45,55,101,111,135],"method":[50],"and":[51,152],"apply":[52],"it":[53],"to":[54],"variety":[56],"downstream":[58,140],"We":[62,142],"combine":[63],"well-known":[65],"wav2vec":[66],"2.0":[67],"framework,":[68],"which":[69,109],"shown":[71],"success":[72],"tasks,":[78],"with":[79],"parameter-efficient":[80],"conformer":[81],"architectures.":[82],"Our":[83,121],"pre-training":[85,151],"can":[86],"reduce":[87],"need":[89],"labeled":[91],"by":[93],"two-thirds.":[94],"On":[95],"AudioSet":[97],"benchmark,":[98],"achieve":[100],"mean":[102],"average":[103],"precision":[104],"(mAP)":[105],"score":[106],"0.415,":[108],"is":[110],"new":[112],"state-of-the-art":[113],"on":[114,138],"dataset":[116],"through":[117],"audio-only":[118],"learning.":[120],"fine-tuned":[122],"conformers":[123],"also":[124],"surpass":[125],"or":[126],"match":[127],"performance":[129],"previous":[131],"systems":[132],"pre-trained":[133],"supervised":[136],"way":[137],"several":[139],"further":[143],"discuss":[144],"important":[146],"design":[147],"considerations":[148],"both":[150],"fine-tuning.":[153]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":5},{"year":2024,"cited_by_count":2},{"year":2022,"cited_by_count":3}],"updated_date":"2026-03-06T13:50:29.536080","created_date":"2025-10-10T00:00:00"}
