{"id":"https://openalex.org/W3007552021","doi":"https://doi.org/10.1109/icassp40776.2020.9054057","title":"Disentangled Speech Embeddings Using Cross-Modal Self-Supervision","display_name":"Disentangled Speech Embeddings Using Cross-Modal Self-Supervision","publication_year":2020,"publication_date":"2020-04-09","ids":{"openalex":"https://openalex.org/W3007552021","doi":"https://doi.org/10.1109/icassp40776.2020.9054057","mag":"3007552021"},"language":"en","primary_location":{"id":"doi:10.1109/icassp40776.2020.9054057","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp40776.2020.9054057","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["arxiv","crossref","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2002.08742","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5036002448","display_name":"Arsha Nagrani","orcid":"https://orcid.org/0000-0003-2190-9013"},"institutions":[{"id":"https://openalex.org/I40120149","display_name":"University of Oxford","ror":"https://ror.org/052gg0110","country_code":"GB","type":"education","lineage":["https://openalex.org/I40120149"]}],"countries":["GB"],"is_corresponding":true,"raw_author_name":"Arsha Nagrani","raw_affiliation_strings":["Visual Geometry Group, University of Oxford","University of Oxford,Visual Geometry Group"],"affiliations":[{"raw_affiliation_string":"Visual Geometry Group, University of Oxford","institution_ids":["https://openalex.org/I40120149"]},{"raw_affiliation_string":"University of Oxford,Visual Geometry Group","institution_ids":["https://openalex.org/I40120149"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5038723822","display_name":"Joon Son Chung","orcid":"https://orcid.org/0000-0001-7741-7275"},"institutions":[{"id":"https://openalex.org/I40120149","display_name":"University of Oxford","ror":"https://ror.org/052gg0110","country_code":"GB","type":"education","lineage":["https://openalex.org/I40120149"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Joon Son Chung","raw_affiliation_strings":["Visual Geometry Group, University of Oxford","University of Oxford,Visual Geometry Group"],"affiliations":[{"raw_affiliation_string":"Visual Geometry Group, University of Oxford","institution_ids":["https://openalex.org/I40120149"]},{"raw_affiliation_string":"University of Oxford,Visual Geometry Group","institution_ids":["https://openalex.org/I40120149"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102018867","display_name":"Samuel Albanie","orcid":"https://orcid.org/0000-0003-1732-9198"},"institutions":[{"id":"https://openalex.org/I40120149","display_name":"University of Oxford","ror":"https://ror.org/052gg0110","country_code":"GB","type":"education","lineage":["https://openalex.org/I40120149"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Samuel Albanie","raw_affiliation_strings":["Visual Geometry Group, University of Oxford","University of Oxford,Visual Geometry Group"],"affiliations":[{"raw_affiliation_string":"Visual Geometry Group, University of Oxford","institution_ids":["https://openalex.org/I40120149"]},{"raw_affiliation_string":"University of Oxford,Visual Geometry Group","institution_ids":["https://openalex.org/I40120149"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5057678172","display_name":"Andrew Zisserman","orcid":"https://orcid.org/0000-0002-8945-8573"},"institutions":[{"id":"https://openalex.org/I40120149","display_name":"University of Oxford","ror":"https://ror.org/052gg0110","country_code":"GB","type":"education","lineage":["https://openalex.org/I40120149"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Andrew Zisserman","raw_affiliation_strings":["Visual Geometry Group, University of Oxford","University of Oxford,Visual Geometry Group"],"affiliations":[{"raw_affiliation_string":"Visual Geometry Group, University of Oxford","institution_ids":["https://openalex.org/I40120149"]},{"raw_affiliation_string":"University of Oxford,Visual Geometry Group","institution_ids":["https://openalex.org/I40120149"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5036002448"],"corresponding_institution_ids":["https://openalex.org/I40120149"],"apc_list":null,"apc_paid":null,"fwci":0.6087,"has_fulltext":true,"cited_by_count":5,"citation_normalized_percentile":{"value":0.65122793,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":96},"biblio":{"volume":null,"issue":null,"first_page":"6829","last_page":"6833"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.760063111782074},{"id":"https://openalex.org/keywords/identity","display_name":"Identity (music)","score":0.7374520301818848},{"id":"https://openalex.org/keywords/construct","display_name":"Construct (python library)","score":0.7341434955596924},{"id":"https://openalex.org/keywords/modal","display_name":"Modal","score":0.670914888381958},{"id":"https://openalex.org/keywords/annotation","display_name":"Annotation","score":0.641573429107666},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5648344159126282},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.521040141582489},{"id":"https://openalex.org/keywords/speaker-recognition","display_name":"Speaker recognition","score":0.5066319108009338},{"id":"https://openalex.org/keywords/natural","display_name":"Natural (archaeology)","score":0.5011999607086182},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.4694383144378662},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.45813703536987305},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.455910325050354},{"id":"https://openalex.org/keywords/architecture","display_name":"Architecture","score":0.4134637415409088},{"id":"https://openalex.org/keywords/speaker-diarisation","display_name":"Speaker diarisation","score":0.41243448853492737}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.760063111782074},{"id":"https://openalex.org/C2778355321","wikidata":"https://www.wikidata.org/wiki/Q17079427","display_name":"Identity (music)","level":2,"score":0.7374520301818848},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.7341434955596924},{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.670914888381958},{"id":"https://openalex.org/C2776321320","wikidata":"https://www.wikidata.org/wiki/Q857525","display_name":"Annotation","level":2,"score":0.641573429107666},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5648344159126282},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.521040141582489},{"id":"https://openalex.org/C133892786","wikidata":"https://www.wikidata.org/wiki/Q1145189","display_name":"Speaker recognition","level":2,"score":0.5066319108009338},{"id":"https://openalex.org/C2776608160","wikidata":"https://www.wikidata.org/wiki/Q4785462","display_name":"Natural (archaeology)","level":2,"score":0.5011999607086182},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.4694383144378662},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.45813703536987305},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.455910325050354},{"id":"https://openalex.org/C123657996","wikidata":"https://www.wikidata.org/wiki/Q12271","display_name":"Architecture","level":2,"score":0.4134637415409088},{"id":"https://openalex.org/C149838564","wikidata":"https://www.wikidata.org/wiki/Q7574248","display_name":"Speaker diarisation","level":3,"score":0.41243448853492737},{"id":"https://openalex.org/C95457728","wikidata":"https://www.wikidata.org/wiki/Q309","display_name":"History","level":0,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.0},{"id":"https://openalex.org/C188027245","wikidata":"https://www.wikidata.org/wiki/Q750446","display_name":"Polymer chemistry","level":1,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C24890656","wikidata":"https://www.wikidata.org/wiki/Q82811","display_name":"Acoustics","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C166957645","wikidata":"https://www.wikidata.org/wiki/Q23498","display_name":"Archaeology","level":1,"score":0.0},{"id":"https://openalex.org/C153349607","wikidata":"https://www.wikidata.org/wiki/Q36649","display_name":"Visual arts","level":1,"score":0.0},{"id":"https://openalex.org/C142362112","wikidata":"https://www.wikidata.org/wiki/Q735","display_name":"Art","level":0,"score":0.0}],"mesh":[],"locations_count":5,"locations":[{"id":"doi:10.1109/icassp40776.2020.9054057","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp40776.2020.9054057","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2002.08742","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2002.08742","pdf_url":"https://arxiv.org/pdf/2002.08742","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"mag:3007552021","is_oa":true,"landing_page_url":"https://arxiv.org/pdf/2002.08742.pdf","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"arXiv (Cornell University)","raw_type":null},{"id":"doi:10.48550/arxiv.2002.08742","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2002.08742","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"},{"id":"doi:10.17023/m0dw-0x80","is_oa":true,"landing_page_url":"https://doi.org/10.17023/m0dw-0x80","pdf_url":null,"source":{"id":"https://openalex.org/S7407051697","display_name":"IEEE RESOURCE CENTERS","issn_l":null,"issn":[],"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2002.08742","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2002.08742","pdf_url":"https://arxiv.org/pdf/2002.08742","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.5}],"awards":[{"id":"https://openalex.org/G1277543710","display_name":null,"funder_award_id":"EP/M013774/1","funder_id":"https://openalex.org/F4320334627","funder_display_name":"Engineering and Physical Sciences Research Council"},{"id":"https://openalex.org/G2018133609","display_name":"Seebibyte: Visual Search for the Era of Big Data","funder_award_id":"EP/M013774/1","funder_id":"https://openalex.org/F4320334627","funder_display_name":"Engineering and Physical Sciences Research Council"},{"id":"https://openalex.org/G4732936841","display_name":"ExTOL: End to End Translation of British Sign Language","funder_award_id":"EP/R03298X/1","funder_id":"https://openalex.org/F4320334627","funder_display_name":"Engineering and Physical Sciences Research Council"}],"funders":[{"id":"https://openalex.org/F4320334627","display_name":"Engineering and Physical Sciences Research Council","ror":"https://ror.org/0439y7842"}],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W3007552021.pdf","grobid_xml":"https://content.openalex.org/works/W3007552021.grobid-xml"},"referenced_works_count":47,"referenced_works":["https://openalex.org/W114193738","https://openalex.org/W592244745","https://openalex.org/W1635512741","https://openalex.org/W1691728462","https://openalex.org/W1933340426","https://openalex.org/W2214409633","https://openalex.org/W2510842514","https://openalex.org/W2510867321","https://openalex.org/W2511428026","https://openalex.org/W2604379605","https://openalex.org/W2619697695","https://openalex.org/W2623327532","https://openalex.org/W2726515241","https://openalex.org/W2752796333","https://openalex.org/W2803113447","https://openalex.org/W2808631503","https://openalex.org/W2886300652","https://openalex.org/W2890680318","https://openalex.org/W2916104401","https://openalex.org/W2950864153","https://openalex.org/W2962756039","https://openalex.org/W2962824709","https://openalex.org/W2963173190","https://openalex.org/W2963226019","https://openalex.org/W2963279312","https://openalex.org/W2963571336","https://openalex.org/W2963799213","https://openalex.org/W2963801643","https://openalex.org/W2963887950","https://openalex.org/W2964048159","https://openalex.org/W2964307722","https://openalex.org/W6604666349","https://openalex.org/W6617744952","https://openalex.org/W6637412569","https://openalex.org/W6640284076","https://openalex.org/W6648737282","https://openalex.org/W6712395597","https://openalex.org/W6718140377","https://openalex.org/W6725939724","https://openalex.org/W6729831399","https://openalex.org/W6735927292","https://openalex.org/W6738607494","https://openalex.org/W6738806211","https://openalex.org/W6746892368","https://openalex.org/W6751966083","https://openalex.org/W6754048563","https://openalex.org/W6754610156"],"related_works":["https://openalex.org/W3015734344","https://openalex.org/W3024962219","https://openalex.org/W3163320179","https://openalex.org/W3209871323","https://openalex.org/W3159257553","https://openalex.org/W3202022704","https://openalex.org/W3156977108","https://openalex.org/W3165911853","https://openalex.org/W3093494400","https://openalex.org/W3139240716","https://openalex.org/W3123226376","https://openalex.org/W2809767522","https://openalex.org/W3159929162","https://openalex.org/W3207446623","https://openalex.org/W3032892481","https://openalex.org/W2964131701","https://openalex.org/W3205316331","https://openalex.org/W3197287583","https://openalex.org/W2991487804","https://openalex.org/W2914008206"],"abstract_inverted_index":{"The":[0,39],"objective":[1,26],"of":[2,9,51,91,114],"this":[3],"paper":[4],"is":[5,45],"to":[6,14,46,68,88],"learn":[7],"representations":[8,50,100,129],"speaker":[10,55,98,128,132],"identity":[11,94,99],"without":[12],"access":[13],"manually":[15],"annotated":[16],"data.":[17],"To":[18],"do":[19],"so,":[20],"we":[21],"develop":[22],"a":[23,59,74,110],"self-supervised":[24],"learning":[25],"that":[27,101],"exploits":[28],"the":[29,83,118,126],"natural":[30,75],"cross-modal":[31],"synchrony":[32],"between":[33],"faces":[34],"and":[35,54,71,93,95,120],"audio":[36],"in":[37],"video.":[38],"key":[40],"idea":[41],"behind":[42],"our":[43,107],"approach":[44],"tease":[47],"apart-without":[48],"annotation-the":[49],"linguistic":[52],"content":[53,92],"identity.":[56],"We":[57,105],"construct":[58],"two-stream":[60],"architecture":[61],"which:":[62],"(1)":[63],"shares":[64],"low-level":[65],"features":[66],"common":[67],"both":[69],"representations;":[70],"(2)":[72],"provides":[73],"mechanism":[76],"for":[77,85,130],"explicitly":[78],"disentangling":[79],"these":[80],"factors,":[81],"offering":[82],"potential":[84],"greater":[86],"generalisation":[87],"novel":[89],"combinations":[90],"ultimately":[96],"producing":[97],"are":[102],"more":[103],"robust.":[104],"train":[106],"method":[108],"on":[109],"large-scale":[111],"audio-visual":[112],"dataset":[113],"talking":[115],"heads":[116],"`in":[117],"wild',":[119],"demonstrate":[121],"its":[122],"efficacy":[123],"by":[124],"evaluating":[125],"learned":[127],"standard":[131],"recognition":[133],"performance.":[134]},"counts_by_year":[{"year":2024,"cited_by_count":1},{"year":2022,"cited_by_count":2},{"year":2021,"cited_by_count":1},{"year":2020,"cited_by_count":1}],"updated_date":"2026-04-21T08:09:41.155169","created_date":"2025-10-10T00:00:00"}
