{"id":"https://openalex.org/W4290712827","doi":"https://doi.org/10.1109/jstsp.2022.3197315","title":"Non-Contrastive Self-Supervised Learning for Utterance-Level Information Extraction From Speech","display_name":"Non-Contrastive Self-Supervised Learning for Utterance-Level Information Extraction From Speech","publication_year":2022,"publication_date":"2022-08-08","ids":{"openalex":"https://openalex.org/W4290712827","doi":"https://doi.org/10.1109/jstsp.2022.3197315"},"language":"en","primary_location":{"id":"doi:10.1109/jstsp.2022.3197315","is_oa":false,"landing_page_url":"https://doi.org/10.1109/jstsp.2022.3197315","pdf_url":null,"source":{"id":"https://openalex.org/S42167783","display_name":"IEEE Journal of Selected Topics in Signal Processing","issn_l":"1932-4553","issn":["1932-4553","1941-0484"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Journal of Selected Topics in Signal Processing","raw_type":"journal-article"},"type":"article","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2208.05445","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5005145356","display_name":"Jaejin Cho","orcid":"https://orcid.org/0000-0002-0527-5391"},"institutions":[{"id":"https://openalex.org/I145311948","display_name":"Johns Hopkins University","ror":"https://ror.org/00za53h95","country_code":"US","type":"education","lineage":["https://openalex.org/I145311948"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Jaejin Cho","raw_affiliation_strings":["Department of Electrical and Computer Engineering, and the Center for Language and Speech Processing (CLSP), Johns Hopkins University, Baltimore, MD, USA"],"raw_orcid":"https://orcid.org/0000-0002-0527-5391","affiliations":[{"raw_affiliation_string":"Department of Electrical and Computer Engineering, and the Center for Language and Speech Processing (CLSP), Johns Hopkins University, Baltimore, MD, USA","institution_ids":["https://openalex.org/I145311948"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5057010207","display_name":"Jes\u00fas Villalba","orcid":"https://orcid.org/0000-0001-9459-8426"},"institutions":[{"id":"https://openalex.org/I145311948","display_name":"Johns Hopkins University","ror":"https://ror.org/00za53h95","country_code":"US","type":"education","lineage":["https://openalex.org/I145311948"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jesus Villalba","raw_affiliation_strings":["Department of Electrical and Computer Engineering, and the Center for Language and Speech Processing (CLSP), Johns Hopkins University, Baltimore, MD, USA","Human Language Technology Center of Excellence, Johns Hopkins University, Baltimore, MD, USA"],"raw_orcid":"https://orcid.org/0000-0001-9459-8426","affiliations":[{"raw_affiliation_string":"Department of Electrical and Computer Engineering, and the Center for Language and Speech Processing (CLSP), Johns Hopkins University, Baltimore, MD, USA","institution_ids":["https://openalex.org/I145311948"]},{"raw_affiliation_string":"Human Language Technology Center of Excellence, Johns Hopkins University, Baltimore, MD, USA","institution_ids":["https://openalex.org/I145311948"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5069488212","display_name":"Laureano Moro-Vel\u00e1zquez","orcid":"https://orcid.org/0000-0002-3033-7005"},"institutions":[{"id":"https://openalex.org/I145311948","display_name":"Johns Hopkins University","ror":"https://ror.org/00za53h95","country_code":"US","type":"education","lineage":["https://openalex.org/I145311948"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Laureano Moro-Velazquez","raw_affiliation_strings":["Department of Electrical and Computer Engineering, and the Center for Language and Speech Processing (CLSP), Johns Hopkins University, Baltimore, MD, USA"],"raw_orcid":"https://orcid.org/0000-0002-3033-7005","affiliations":[{"raw_affiliation_string":"Department of Electrical and Computer Engineering, and the Center for Language and Speech Processing (CLSP), Johns Hopkins University, Baltimore, MD, USA","institution_ids":["https://openalex.org/I145311948"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5050632169","display_name":"Najim Dehak","orcid":"https://orcid.org/0000-0002-4489-5753"},"institutions":[{"id":"https://openalex.org/I145311948","display_name":"Johns Hopkins University","ror":"https://ror.org/00za53h95","country_code":"US","type":"education","lineage":["https://openalex.org/I145311948"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Najim Dehak","raw_affiliation_strings":["Department of Electrical and Computer Engineering, and the Center for Language and Speech Processing (CLSP), Johns Hopkins University, Baltimore, MD, USA","Human Language Technology Center of Excellence, Johns Hopkins University, Baltimore, MD, USA"],"raw_orcid":"https://orcid.org/0000-0002-4489-5753","affiliations":[{"raw_affiliation_string":"Department of Electrical and Computer Engineering, and the Center for Language and Speech Processing (CLSP), Johns Hopkins University, Baltimore, MD, USA","institution_ids":["https://openalex.org/I145311948"]},{"raw_affiliation_string":"Human Language Technology Center of Excellence, Johns Hopkins University, Baltimore, MD, USA","institution_ids":["https://openalex.org/I145311948"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5005145356"],"corresponding_institution_ids":["https://openalex.org/I145311948"],"apc_list":null,"apc_paid":null,"fwci":2.4971,"has_fulltext":false,"cited_by_count":19,"citation_normalized_percentile":{"value":0.90675532,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":94,"max":99},"biblio":{"volume":"16","issue":"6","first_page":"1284","last_page":"1295"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9970999956130981,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9969000220298767,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/utterance","display_name":"Utterance","score":0.8163965344429016},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.792411208152771},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.6528617739677429},{"id":"https://openalex.org/keywords/discriminative-model","display_name":"Discriminative model","score":0.6102643013000488},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.609798789024353},{"id":"https://openalex.org/keywords/pooling","display_name":"Pooling","score":0.6091059446334839},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.4942772388458252},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.4757501780986786},{"id":"https://openalex.org/keywords/transfer-of-learning","display_name":"Transfer of learning","score":0.4469437599182129},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.43264034390449524},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.4171673655509949},{"id":"https://openalex.org/keywords/frame","display_name":"Frame (networking)","score":0.4101177453994751}],"concepts":[{"id":"https://openalex.org/C2775852435","wikidata":"https://www.wikidata.org/wiki/Q258403","display_name":"Utterance","level":2,"score":0.8163965344429016},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.792411208152771},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6528617739677429},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.6102643013000488},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.609798789024353},{"id":"https://openalex.org/C70437156","wikidata":"https://www.wikidata.org/wiki/Q7228652","display_name":"Pooling","level":2,"score":0.6091059446334839},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.4942772388458252},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4757501780986786},{"id":"https://openalex.org/C150899416","wikidata":"https://www.wikidata.org/wiki/Q1820378","display_name":"Transfer of learning","level":2,"score":0.4469437599182129},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.43264034390449524},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.4171673655509949},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.4101177453994751},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.0},{"id":"https://openalex.org/C94625758","wikidata":"https://www.wikidata.org/wiki/Q7163","display_name":"Politics","level":2,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/jstsp.2022.3197315","is_oa":false,"landing_page_url":"https://doi.org/10.1109/jstsp.2022.3197315","pdf_url":null,"source":{"id":"https://openalex.org/S42167783","display_name":"IEEE Journal of Selected Topics in Signal Processing","issn_l":"1932-4553","issn":["1932-4553","1941-0484"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Journal of Selected Topics in Signal Processing","raw_type":"journal-article"},{"id":"pmh:oai:arXiv.org:2208.05445","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2208.05445","pdf_url":"https://arxiv.org/pdf/2208.05445","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2208.05445","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2208.05445","pdf_url":"https://arxiv.org/pdf/2208.05445","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/10","score":0.6499999761581421,"display_name":"Reduced inequalities"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":53,"referenced_works":["https://openalex.org/W1522301498","https://openalex.org/W1589137271","https://openalex.org/W2086161653","https://openalex.org/W2146334809","https://openalex.org/W2194775991","https://openalex.org/W2696967604","https://openalex.org/W2726515241","https://openalex.org/W2748488820","https://openalex.org/W2758785877","https://openalex.org/W2808631503","https://openalex.org/W2890964092","https://openalex.org/W2896457183","https://openalex.org/W2911489562","https://openalex.org/W2928165649","https://openalex.org/W2953070460","https://openalex.org/W2963182768","https://openalex.org/W2969985801","https://openalex.org/W2972705840","https://openalex.org/W2972943112","https://openalex.org/W2979593053","https://openalex.org/W2982223350","https://openalex.org/W3002741552","https://openalex.org/W3015213852","https://openalex.org/W3015707499","https://openalex.org/W3016175755","https://openalex.org/W3036601975","https://openalex.org/W3041561163","https://openalex.org/W3044308976","https://openalex.org/W3097288651","https://openalex.org/W3097448764","https://openalex.org/W3100859887","https://openalex.org/W3154143698","https://openalex.org/W3159481202","https://openalex.org/W3160397447","https://openalex.org/W3160799772","https://openalex.org/W3161606033","https://openalex.org/W3162890625","https://openalex.org/W3197368257","https://openalex.org/W3197580070","https://openalex.org/W3198858531","https://openalex.org/W3207346153","https://openalex.org/W4205234379","https://openalex.org/W4286981691","https://openalex.org/W4385245566","https://openalex.org/W6631190155","https://openalex.org/W6733814495","https://openalex.org/W6739901393","https://openalex.org/W6745117592","https://openalex.org/W6779326418","https://openalex.org/W6779977557","https://openalex.org/W6780218876","https://openalex.org/W6781368565","https://openalex.org/W6784400926"],"related_works":["https://openalex.org/W2953234277","https://openalex.org/W2626256601","https://openalex.org/W147410782","https://openalex.org/W2900413183","https://openalex.org/W2529301793","https://openalex.org/W4390975304","https://openalex.org/W2384121599","https://openalex.org/W2038083449","https://openalex.org/W3022252430","https://openalex.org/W3148366653"],"abstract_inverted_index":{"In":[0,15],"recent":[1],"studies,":[2],"self-supervised":[3,17,46,68,123],"pre-trained":[4,10],"models":[5,11,61],"tend":[6],"to":[7,71,91,109,125,139,153,162],"outperform":[8],"supervised":[9,158],"in":[12,26,156,248],"transfer":[13,184],"learning.":[14],"particular,":[16],"learning":[18,69,185],"of":[19,33,76,180],"utterance-level":[20,55,73,127],"speech":[21,27,47,165,230],"representation":[22,32,56],"can":[23,51],"be":[24,52],"used":[25,53],"applications":[28],"that":[29,103],"require":[30,147],"discriminative":[31],"consistent":[34],"attributes":[35],"within":[36],"an":[37],"utterance:":[38],"speaker,":[39],"language,":[40],"emotion,":[41],"and":[42,169,204],"age.":[43],"Existing":[44],"frame-level":[45],"representation,":[48],"e.g.,":[49],"wav2vec,":[50],"as":[54,187],"with":[57,93,132,234],"pooling,":[58],"but":[59],"the":[60,77,94,105,113,178,189,199,206],"are":[62,66,238],"usually":[63],"large.":[64],"There":[65],"also":[67],"techniques":[70],"learn":[72,126],"representation.":[74],"One":[75],"most":[78],"successful":[79],"is":[80],"a":[81,121,157,235],"contrastive":[82,142],"method,":[83],"which":[84],"requires":[85],"negative":[86,106,148],"sampling:":[87],"selecting":[88],"alternative":[89],"samples":[90,107],"contrast":[92],"current":[95],"sample":[96],"(anchor).":[97],"However,":[98],"this":[99],"does":[100,145],"not":[101,146,225],"ensure":[102],"all":[104,211],"belong":[108],"classes":[110],"different":[111],"from":[112,136],"anchor":[114],"class":[115],"without":[116],"labels.":[117],"This":[118],"paper":[119],"applies":[120],"non-contrastive":[122],"method":[124],"embeddings.":[128],"We":[129,150,176],"adapted":[130],"DIstillation":[131],"NO":[133],"labels":[134],"(DINO)":[135],"computer":[137],"vision":[138],"speech.":[140],"Unlike":[141],"methods,":[143],"DINO":[144,152,173],"sampling.":[149],"compared":[151],"x-vector":[154],"trained":[155],"manner.":[159],"When":[160],"transferred":[161],"speaker":[163],"verification,":[164],"emotion":[166],"recognition":[167],"(SER),":[168],"Alzheimer's":[170],"disease":[171],"detection,":[172],"outperformed":[174],"x-vector.":[175],"studied":[177],"influence":[179],"several":[181],"aspects":[182],"during":[183],"such":[186],"dividing":[188],"fine-tuning":[190,210],"process":[191],"into":[192],"steps,":[193],"chunk":[194,216],"lengths,":[195,217],"or":[196],"augmentation.":[197],"Fine-tuning":[198],"last":[200],"affine":[201],"layers":[202],"first":[203],"then":[205],"whole":[207],"network":[208],"surpassed":[209],"at":[212,232],"once.":[213],"Using":[214],"shorter":[215],"although":[218],"they":[219],"generate":[220],"more":[221],"diverse":[222],"inputs,":[223],"did":[224],"necessarily":[226],"improve":[227],"performance,":[228],"implying":[229],"segments":[231],"least":[233],"specific":[236],"length":[237],"required":[239],"for":[240],"better":[241],"performance":[242],"per":[243],"application.":[244],"Augmentation":[245],"was":[246],"helpful":[247],"SER.":[249]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":6},{"year":2024,"cited_by_count":5},{"year":2023,"cited_by_count":5},{"year":2022,"cited_by_count":2}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2022-08-09T00:00:00"}
