{"id":"https://openalex.org/W4225310486","doi":"https://doi.org/10.1109/icassp43922.2022.9747262","title":"Multi-Speaker Pitch Tracking via Embodied Self-Supervised Learning","display_name":"Multi-Speaker Pitch Tracking via Embodied Self-Supervised Learning","publication_year":2022,"publication_date":"2022-04-27","ids":{"openalex":"https://openalex.org/W4225310486","doi":"https://doi.org/10.1109/icassp43922.2022.9747262"},"language":"en","primary_location":{"id":"doi:10.1109/icassp43922.2022.9747262","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp43922.2022.9747262","pdf_url":null,"source":{"id":"https://openalex.org/S4363607702","display_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100706634","display_name":"Xiang Li","orcid":"https://orcid.org/0000-0003-2620-9788"},"institutions":[{"id":"https://openalex.org/I4210147617","display_name":"Ministry of Education","ror":"https://ror.org/0429te836","country_code":"ET","type":"government","lineage":["https://openalex.org/I4210147617"]},{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN","ET"],"is_corresponding":true,"raw_author_name":"Xiang Li","raw_affiliation_strings":["Peking University,Key Laboratory of Machine Perception (Ministry of Education),Department of Machine Intelligence, Speech and Hearing Research Center,Beijing,China","Department of Machine Intelligence, Speech and Hearing Research Center, Key Laboratory of Machine Perception (Ministry of Education), Peking University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Peking University,Key Laboratory of Machine Perception (Ministry of Education),Department of Machine Intelligence, Speech and Hearing Research Center,Beijing,China","institution_ids":["https://openalex.org/I4210147617"]},{"raw_affiliation_string":"Department of Machine Intelligence, Speech and Hearing Research Center, Key Laboratory of Machine Perception (Ministry of Education), Peking University, Beijing, China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5056740310","display_name":"Yifan Sun","orcid":"https://orcid.org/0000-0002-0801-7697"},"institutions":[{"id":"https://openalex.org/I4210147617","display_name":"Ministry of Education","ror":"https://ror.org/0429te836","country_code":"ET","type":"government","lineage":["https://openalex.org/I4210147617"]},{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN","ET"],"is_corresponding":false,"raw_author_name":"Yifan Sun","raw_affiliation_strings":["Peking University,Key Laboratory of Machine Perception (Ministry of Education),Department of Machine Intelligence, Speech and Hearing Research Center,Beijing,China","Department of Machine Intelligence, Speech and Hearing Research Center, Key Laboratory of Machine Perception (Ministry of Education), Peking University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Peking University,Key Laboratory of Machine Perception (Ministry of Education),Department of Machine Intelligence, Speech and Hearing Research Center,Beijing,China","institution_ids":["https://openalex.org/I4210147617"]},{"raw_affiliation_string":"Department of Machine Intelligence, Speech and Hearing Research Center, Key Laboratory of Machine Perception (Ministry of Education), Peking University, Beijing, China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5084685506","display_name":"Xihong Wu","orcid":"https://orcid.org/0009-0004-5236-7469"},"institutions":[{"id":"https://openalex.org/I4210147617","display_name":"Ministry of Education","ror":"https://ror.org/0429te836","country_code":"ET","type":"government","lineage":["https://openalex.org/I4210147617"]},{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN","ET"],"is_corresponding":false,"raw_author_name":"Xihong Wu","raw_affiliation_strings":["Peking University,Key Laboratory of Machine Perception (Ministry of Education),Department of Machine Intelligence, Speech and Hearing Research Center,Beijing,China","Department of Machine Intelligence, Speech and Hearing Research Center, Key Laboratory of Machine Perception (Ministry of Education), Peking University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Peking University,Key Laboratory of Machine Perception (Ministry of Education),Department of Machine Intelligence, Speech and Hearing Research Center,Beijing,China","institution_ids":["https://openalex.org/I4210147617"]},{"raw_affiliation_string":"Department of Machine Intelligence, Speech and Hearing Research Center, Key Laboratory of Machine Perception (Ministry of Education), Peking University, Beijing, China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100394911","display_name":"Jing Chen","orcid":"https://orcid.org/0000-0002-6640-7140"},"institutions":[{"id":"https://openalex.org/I4210147617","display_name":"Ministry of Education","ror":"https://ror.org/0429te836","country_code":"ET","type":"government","lineage":["https://openalex.org/I4210147617"]},{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN","ET"],"is_corresponding":false,"raw_author_name":"Jing Chen","raw_affiliation_strings":["Peking University,Key Laboratory of Machine Perception (Ministry of Education),Department of Machine Intelligence, Speech and Hearing Research Center,Beijing,China","Department of Machine Intelligence, Speech and Hearing Research Center, Key Laboratory of Machine Perception (Ministry of Education), Peking University, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Peking University,Key Laboratory of Machine Perception (Ministry of Education),Department of Machine Intelligence, Speech and Hearing Research Center,Beijing,China","institution_ids":["https://openalex.org/I4210147617"]},{"raw_affiliation_string":"Department of Machine Intelligence, Speech and Hearing Research Center, Key Laboratory of Machine Perception (Ministry of Education), Peking University, Beijing, China","institution_ids":["https://openalex.org/I20231570"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5100706634"],"corresponding_institution_ids":["https://openalex.org/I20231570","https://openalex.org/I4210147617"],"apc_list":null,"apc_paid":null,"fwci":0.2458,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.35007899,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":94},"biblio":{"volume":"5","issue":null,"first_page":"8257","last_page":"8261"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7414228916168213},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.689460813999176},{"id":"https://openalex.org/keywords/gesture","display_name":"Gesture","score":0.555491030216217},{"id":"https://openalex.org/keywords/vocal-tract","display_name":"Vocal tract","score":0.5469167828559875},{"id":"https://openalex.org/keywords/coarticulation","display_name":"Coarticulation","score":0.5376101732254028},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.45650655031204224},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.430942565202713},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.41497594118118286},{"id":"https://openalex.org/keywords/vowel","display_name":"Vowel","score":0.13586780428886414}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7414228916168213},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.689460813999176},{"id":"https://openalex.org/C207347870","wikidata":"https://www.wikidata.org/wiki/Q371174","display_name":"Gesture","level":2,"score":0.555491030216217},{"id":"https://openalex.org/C47401133","wikidata":"https://www.wikidata.org/wiki/Q748953","display_name":"Vocal tract","level":2,"score":0.5469167828559875},{"id":"https://openalex.org/C130727458","wikidata":"https://www.wikidata.org/wiki/Q1639109","display_name":"Coarticulation","level":3,"score":0.5376101732254028},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.45650655031204224},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.430942565202713},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.41497594118118286},{"id":"https://openalex.org/C2779581591","wikidata":"https://www.wikidata.org/wiki/Q36244","display_name":"Vowel","level":2,"score":0.13586780428886414},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C169760540","wikidata":"https://www.wikidata.org/wiki/Q207011","display_name":"Neuroscience","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp43922.2022.9747262","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp43922.2022.9747262","pdf_url":null,"source":{"id":"https://openalex.org/S4363607702","display_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.5699999928474426}],"awards":[],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320335777","display_name":"National Key Research and Development Program of China","ror":null}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":19,"referenced_works":["https://openalex.org/W1875231349","https://openalex.org/W2015143272","https://openalex.org/W2091425152","https://openalex.org/W2115129089","https://openalex.org/W2119599673","https://openalex.org/W2143341575","https://openalex.org/W2150423550","https://openalex.org/W2400225965","https://openalex.org/W2602871151","https://openalex.org/W2667576870","https://openalex.org/W2728212668","https://openalex.org/W2734774145","https://openalex.org/W2889992917","https://openalex.org/W2952218014","https://openalex.org/W3037959216","https://openalex.org/W3099330747","https://openalex.org/W4299729612","https://openalex.org/W6639350448","https://openalex.org/W6779985876"],"related_works":["https://openalex.org/W2328114820","https://openalex.org/W1968175653","https://openalex.org/W2462948091","https://openalex.org/W2034120491","https://openalex.org/W275815731","https://openalex.org/W2022163073","https://openalex.org/W159384616","https://openalex.org/W2138807150","https://openalex.org/W2001141117","https://openalex.org/W1503295210"],"abstract_inverted_index":{"Pitch":[0,104],"is":[1,47,63,66,85,106,123],"a":[2,24,41,126,170],"critical":[3],"cue":[4],"in":[5,15,19,49,125],"human":[6,74],"speech":[7,17,39,65,83],"perception.":[8],"Although":[9],"the":[10,35,78,87,93,98,109,114],"technology":[11],"of":[12,38,97,108,117],"tracking":[13,45],"pitch":[14,29,44,155],"single-talker":[16],"succeeds":[18],"many":[20],"applications,":[21],"it\u2019s":[22],"still":[23],"challenging":[25],"problem":[26],"to":[27,113,129,180],"extract":[28],"information":[30,156],"from":[31,100,142],"mixtures.":[32],"Inspired":[33],"by":[34,134],"motor":[36],"theory":[37],"perception,":[40],"novel":[42],"multi-speaker":[43],"approach":[46],"proposed":[48],"this":[50,143],"work,":[51],"based":[52],"on":[53,162],"an":[54,69,131],"embodied":[55],"self-supervised":[56,127],"learning":[57],"method":[58],"(EMSSL-Pitch).":[59],"The":[60,120,139],"conceptual":[61],"idea":[62],"that":[64,166],"produced":[67],"through":[68],"underlying":[70],"physical":[71,149],"process":[72],"(i.e.,":[73],"vocal":[75,118],"tract)":[76],"given":[77],"articulatory":[79,95,110,152],"parameters":[80,153],"(articulatory-to-acoustic),":[81],"while":[82],"perception":[84],"like":[86],"inverse":[88],"process,":[89],"aiming":[90],"at":[91],"perceiving":[92],"intended":[94],"gestures":[96],"speaker":[99],"acoustic":[101],"signals":[102],"(acoustic-to-articulatory).":[103],"value":[105],"part":[107],"parameters,":[111],"corresponding":[112],"vibration":[115],"frequency":[116],"folders.":[119],"acoustic-to-articulatory":[121],"inversion":[122],"modeled":[124],"manner":[128],"learn":[130],"inference":[132,144],"network":[133,145],"iteratively":[135],"sampling":[136],"and":[137,177],"training.":[138],"learned":[140],"representations":[141],"can":[146,157,168],"have":[147],"explicit":[148],"meanings,":[150],"i.e.,":[151],"where":[154],"be":[158,178],"further":[159],"extracted.":[160],"Experiments":[161],"GRID":[163],"database":[164],"show":[165],"EMSSL-Pitch":[167],"achieve":[169],"reachable":[171],"performance":[172],"compared":[173],"with":[174],"supervised":[175],"baselines":[176],"generalized":[179],"unseen":[181],"speakers.":[182]},"counts_by_year":[{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
