{"id":"https://openalex.org/W3096214032","doi":"https://doi.org/10.21437/interspeech.2020-1697","title":"Multimodal Target Speech Separation with Voice and Face References","display_name":"Multimodal Target Speech Separation with Voice and Face References","publication_year":2020,"publication_date":"2020-10-25","ids":{"openalex":"https://openalex.org/W3096214032","doi":"https://doi.org/10.21437/interspeech.2020-1697","mag":"3096214032"},"language":"en","primary_location":{"id":"doi:10.21437/interspeech.2020-1697","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2020-1697","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2020","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5023714181","display_name":"Leyuan Qu","orcid":"https://orcid.org/0000-0001-6694-5355"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Leyuan Qu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102025003","display_name":"Cornelius Weber","orcid":"https://orcid.org/0000-0001-5163-938X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cornelius Weber","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5033486668","display_name":"Stefan Wermter","orcid":"https://orcid.org/0000-0003-1343-4775"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Stefan Wermter","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5023714181"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":2.4245,"has_fulltext":false,"cited_by_count":23,"citation_normalized_percentile":{"value":0.89769585,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":98},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.9925000071525574,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10326","display_name":"Indoor and Outdoor Localization Technologies","score":0.934499979019165,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6529101729393005},{"id":"https://openalex.org/keywords/face","display_name":"Face (sociological concept)","score":0.6379656791687012},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6345242857933044},{"id":"https://openalex.org/keywords/separation","display_name":"Separation (statistics)","score":0.5874982476234436},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.32628485560417175},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.11482062935829163},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.0683789849281311}],"concepts":[{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6529101729393005},{"id":"https://openalex.org/C2779304628","wikidata":"https://www.wikidata.org/wiki/Q3503480","display_name":"Face (sociological concept)","level":2,"score":0.6379656791687012},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6345242857933044},{"id":"https://openalex.org/C2776061190","wikidata":"https://www.wikidata.org/wiki/Q7451805","display_name":"Separation (statistics)","level":2,"score":0.5874982476234436},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.32628485560417175},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.11482062935829163},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.0683789849281311},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.21437/interspeech.2020-1697","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2020-1697","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2020","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions","score":0.699999988079071}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2368779261","https://openalex.org/W2778699561","https://openalex.org/W2103413230","https://openalex.org/W2048360808","https://openalex.org/W2355490025","https://openalex.org/W2057064510","https://openalex.org/W2155074382","https://openalex.org/W2908959303","https://openalex.org/W2410151940","https://openalex.org/W4316658612"],"abstract_inverted_index":{"Target":[0],"speech":[1,7,80,128,159],"separation":[2],"refers":[3],"to":[4,60,83,94,105,107,155,166],"isolating":[5],"target":[6,20,58,137],"from":[8,23,119],"a":[9,52,73,85,89,120,136,149],"multi-speaker":[10],"mixture":[11],"signal":[12],"by":[13,96],"conditioning":[14],"on":[15,99],"auxiliary":[16],"information":[17,76,163],"about":[18,77],"the":[19,24,38,48,57,67,70,78,103,127,132,141],"speaker.":[21],"Different":[22],"mainstream":[25],"audio-visual":[26],"approaches":[27],"which":[28,101,130],"usually":[29],"require":[30],"simultaneous":[31,86],"visual":[32,87],"streams":[33],"as":[34],"additional":[35],"input,":[36],"e.g.":[37],"corresponding":[39],"lip":[40],"movement":[41],"sequences,":[42],"in":[43,134,140],"our":[44],"approach":[45],"we":[46,114,170],"propose":[47],"novel":[49],"use":[50],"of":[51,56,72],"single":[53],"face":[54,74,90,116,124,151,162,181],"profile":[55],"speaker":[59,138],"separate":[61],"expected":[62,158],"clean":[63],"speech.":[64],"We":[65],"exploit":[66],"fact":[68],"that":[69,148,172],"image":[71,91,152],"contains":[75],"person's":[79],"sound.":[81],"Compared":[82],"using":[84],"sequence,":[88],"is":[92,153,164],"easier":[93],"obtain":[95],"pre-enrollment":[97],"or":[98],"websites,":[100],"enables":[102],"system":[104,133],"generalize":[106],"devices":[108],"without":[109],"cameras.":[110],"To":[111],"this":[112],"end,":[113],"incorporate":[115],"embeddings":[117],"extracted":[118],"pretrained":[121],"model":[122],"for":[123],"recognition":[125],"into":[126],"separation,":[129],"guide":[131],"predicting":[135],"mask":[139],"time-frequency":[142],"domain.":[143],"The":[144],"experimental":[145],"results":[146],"show":[147,171],"pre-enrolled":[150],"able":[154],"benefit":[156],"separating":[157],"signals.":[160],"Additionally,":[161],"complementary":[165],"voice":[167,183],"reference":[168],"and":[169,182],"further":[173],"improvement":[174],"can":[175],"be":[176],"achieved":[177],"when":[178],"combing":[179],"both":[180],"embeddings.":[184]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":6},{"year":2023,"cited_by_count":3},{"year":2022,"cited_by_count":6},{"year":2021,"cited_by_count":7}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
