{"id":"https://openalex.org/W4372265901","doi":"https://doi.org/10.1109/icassp49357.2023.10095883","title":"Cross-Modal Audio-Visual Co-Learning for Text-Independent Speaker Verification","display_name":"Cross-Modal Audio-Visual Co-Learning for Text-Independent Speaker Verification","publication_year":2023,"publication_date":"2023-05-05","ids":{"openalex":"https://openalex.org/W4372265901","doi":"https://doi.org/10.1109/icassp49357.2023.10095883"},"language":"en","primary_location":{"id":"doi:10.1109/icassp49357.2023.10095883","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49357.2023.10095883","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100457438","display_name":"Meng Liu","orcid":"https://orcid.org/0000-0002-1582-5764"},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Meng Liu","raw_affiliation_strings":["Tianjin University,College of Intelligence and Computing,Tianjin Key Laboratory of Cognitive Computing and Application,Tianjin,China","Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing, Tianjin University, Tianjin, China"],"affiliations":[{"raw_affiliation_string":"Tianjin University,College of Intelligence and Computing,Tianjin Key Laboratory of Cognitive Computing and Application,Tianjin,China","institution_ids":["https://openalex.org/I162868743"]},{"raw_affiliation_string":"Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing, Tianjin University, Tianjin, China","institution_ids":["https://openalex.org/I162868743"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5004287909","display_name":"Kong Aik Lee","orcid":"https://orcid.org/0000-0001-9133-3000"},"institutions":[{"id":"https://openalex.org/I3005327000","display_name":"Institute for Infocomm Research","ror":"https://ror.org/053rfa017","country_code":"SG","type":"facility","lineage":["https://openalex.org/I115228651","https://openalex.org/I3005327000","https://openalex.org/I91275662"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Kong Aik Lee","raw_affiliation_strings":["A&#x22C6;STAR,Institute for Infocomm Research,Singapore"],"affiliations":[{"raw_affiliation_string":"A&#x22C6;STAR,Institute for Infocomm Research,Singapore","institution_ids":["https://openalex.org/I3005327000"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101745213","display_name":"Longbiao Wang","orcid":"https://orcid.org/0000-0002-8094-6861"},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Longbiao Wang","raw_affiliation_strings":["Tianjin University,College of Intelligence and Computing,Tianjin Key Laboratory of Cognitive Computing and Application,Tianjin,China","Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing, Tianjin University, Tianjin, China"],"affiliations":[{"raw_affiliation_string":"Tianjin University,College of Intelligence and Computing,Tianjin Key Laboratory of Cognitive Computing and Application,Tianjin,China","institution_ids":["https://openalex.org/I162868743"]},{"raw_affiliation_string":"Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing, Tianjin University, Tianjin, China","institution_ids":["https://openalex.org/I162868743"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5041985945","display_name":"Hanyi Zhang","orcid":"https://orcid.org/0000-0003-0109-6481"},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hanyi Zhang","raw_affiliation_strings":["Tianjin University,College of Intelligence and Computing,Tianjin Key Laboratory of Cognitive Computing and Application,Tianjin,China","Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing, Tianjin University, Tianjin, China"],"affiliations":[{"raw_affiliation_string":"Tianjin University,College of Intelligence and Computing,Tianjin Key Laboratory of Cognitive Computing and Application,Tianjin,China","institution_ids":["https://openalex.org/I162868743"]},{"raw_affiliation_string":"Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing, Tianjin University, Tianjin, China","institution_ids":["https://openalex.org/I162868743"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100654128","display_name":"Chang Zeng","orcid":"https://orcid.org/0000-0002-4882-1823"},"institutions":[{"id":"https://openalex.org/I184597095","display_name":"National Institute of Informatics","ror":"https://ror.org/04ksd4g47","country_code":"JP","type":"facility","lineage":["https://openalex.org/I1319490839","https://openalex.org/I184597095","https://openalex.org/I4210158934"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Chang Zeng","raw_affiliation_strings":["National Institute of Informatics,Tokyo,Japan","National Institute of Informatics, Tokyo, Japan"],"affiliations":[{"raw_affiliation_string":"National Institute of Informatics,Tokyo,Japan","institution_ids":["https://openalex.org/I184597095"]},{"raw_affiliation_string":"National Institute of Informatics, Tokyo, Japan","institution_ids":["https://openalex.org/I184597095"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5017251198","display_name":"Jianwu Dang","orcid":"https://orcid.org/0000-0002-9237-4821"},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jianwu Dang","raw_affiliation_strings":["Tianjin University,College of Intelligence and Computing,Tianjin Key Laboratory of Cognitive Computing and Application,Tianjin,China","Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing, Tianjin University, Tianjin, China"],"affiliations":[{"raw_affiliation_string":"Tianjin University,College of Intelligence and Computing,Tianjin Key Laboratory of Cognitive Computing and Application,Tianjin,China","institution_ids":["https://openalex.org/I162868743"]},{"raw_affiliation_string":"Tianjin Key Laboratory of Cognitive Computing and Application, College of Intelligence and Computing, Tianjin University, Tianjin, China","institution_ids":["https://openalex.org/I162868743"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5100457438"],"corresponding_institution_ids":["https://openalex.org/I162868743"],"apc_list":null,"apc_paid":null,"fwci":3.168,"has_fulltext":false,"cited_by_count":16,"citation_normalized_percentile":{"value":0.92683833,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":94,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9987999796867371,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9951000213623047,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7951377630233765},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6658375859260559},{"id":"https://openalex.org/keywords/modal","display_name":"Modal","score":0.6587245464324951},{"id":"https://openalex.org/keywords/modality","display_name":"Modality (human\u2013computer interaction)","score":0.6497105956077576},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4846780598163605},{"id":"https://openalex.org/keywords/synchronization","display_name":"Synchronization (alternating current)","score":0.4681693911552429},{"id":"https://openalex.org/keywords/correlation","display_name":"Correlation","score":0.45002466440200806},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.44363996386528015},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.42368364334106445},{"id":"https://openalex.org/keywords/audio-visual","display_name":"Audio visual","score":0.4174948036670685},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3335384130477905},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.09453877806663513},{"id":"https://openalex.org/keywords/voltage","display_name":"Voltage","score":0.07669097185134888}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7951377630233765},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6658375859260559},{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.6587245464324951},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.6497105956077576},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4846780598163605},{"id":"https://openalex.org/C2778562939","wikidata":"https://www.wikidata.org/wiki/Q1298791","display_name":"Synchronization (alternating current)","level":3,"score":0.4681693911552429},{"id":"https://openalex.org/C117220453","wikidata":"https://www.wikidata.org/wiki/Q5172842","display_name":"Correlation","level":2,"score":0.45002466440200806},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.44363996386528015},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.42368364334106445},{"id":"https://openalex.org/C3017588708","wikidata":"https://www.wikidata.org/wiki/Q758901","display_name":"Audio visual","level":2,"score":0.4174948036670685},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3335384130477905},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.09453877806663513},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.07669097185134888},{"id":"https://openalex.org/C2524010","wikidata":"https://www.wikidata.org/wiki/Q8087","display_name":"Geometry","level":1,"score":0.0},{"id":"https://openalex.org/C188027245","wikidata":"https://www.wikidata.org/wiki/Q750446","display_name":"Polymer chemistry","level":1,"score":0.0},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.0},{"id":"https://openalex.org/C127162648","wikidata":"https://www.wikidata.org/wiki/Q16858953","display_name":"Channel (broadcasting)","level":2,"score":0.0},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.0},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp49357.2023.10095883","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49357.2023.10095883","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.5699999928474426}],"awards":[],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":37,"referenced_works":["https://openalex.org/W1664547674","https://openalex.org/W1974783905","https://openalex.org/W2015143272","https://openalex.org/W2053365209","https://openalex.org/W2143231843","https://openalex.org/W2219249508","https://openalex.org/W2696967604","https://openalex.org/W2726515241","https://openalex.org/W2766216301","https://openalex.org/W2794506738","https://openalex.org/W2808631503","https://openalex.org/W2810311710","https://openalex.org/W2890964092","https://openalex.org/W2891205112","https://openalex.org/W2936774411","https://openalex.org/W2963066927","https://openalex.org/W2963658982","https://openalex.org/W2969985801","https://openalex.org/W2985076077","https://openalex.org/W3024869864","https://openalex.org/W3048939150","https://openalex.org/W3097741049","https://openalex.org/W3099638501","https://openalex.org/W3113824005","https://openalex.org/W3126757411","https://openalex.org/W3132336199","https://openalex.org/W3162707322","https://openalex.org/W3163527109","https://openalex.org/W3184679245","https://openalex.org/W4210588066","https://openalex.org/W4221154745","https://openalex.org/W4297841864","https://openalex.org/W6637357238","https://openalex.org/W6688816777","https://openalex.org/W6740167877","https://openalex.org/W6754420807","https://openalex.org/W6777437564"],"related_works":["https://openalex.org/W2271369634","https://openalex.org/W2350550760","https://openalex.org/W578794879","https://openalex.org/W2625296515","https://openalex.org/W2385859805","https://openalex.org/W3137890128","https://openalex.org/W4245955731","https://openalex.org/W2393726419","https://openalex.org/W2380912101","https://openalex.org/W627697492"],"abstract_inverted_index":{"Visual":[0],"speech":[1,10,18,29],"(i.e.,":[2],"lip":[3],"motion)":[4],"is":[5,40,76,87],"highly":[6],"related":[7],"to":[8,12,63],"auditory":[9],"due":[11],"the":[13,65,99],"co-occurrence":[14],"and":[15,25,81,92,110,119],"synchronization":[16],"in":[17],"production.":[19],"This":[20],"paper":[21],"investigates":[22],"this":[23],"correlation":[24],"proposes":[26],"a":[27,71],"cross-modal":[28,37,53],"co-learning":[30,38],"paradigm.":[31],"The":[32,85],"primary":[33],"motivation":[34],"of":[35],"our":[36,104],"method":[39,106],"modeling":[41],"one":[42],"modality":[43,79],"aided":[44],"by":[45],"exploiting":[46],"knowledge":[47],"from":[48,90],"another":[49],"modality.":[50],"Specifically,":[51],"two":[52],"boosters":[54],"are":[55],"introduced":[56],"based":[57],"on":[58,98],"an":[59],"audio-visual":[60],"pseudo-siamese":[61],"structure":[62],"learn":[64],"modality-transformed":[66],"correlation.":[67],"Inside":[68],"each":[69],"booster,":[70],"max-feature-map":[72],"embedded":[73],"Transformer":[74],"variant":[75],"proposed":[77,105],"for":[78],"alignment":[80],"enhanced":[82],"feature":[83],"generation.":[84],"network":[86],"co-learned":[88],"both":[89],"scratch":[91],"with":[93],"pretrained":[94],"models.":[95],"Experimental":[96],"results":[97],"test":[100],"scenarios":[101],"demonstrate":[102],"that":[103],"achieves":[107],"around":[108],"60%":[109],"20%":[111],"average":[112],"relative":[113],"performance":[114],"improvement":[115],"over":[116],"baseline":[117],"unimodal":[118],"fusion":[120],"systems,":[121],"respectively.":[122]},"counts_by_year":[{"year":2026,"cited_by_count":3},{"year":2025,"cited_by_count":5},{"year":2024,"cited_by_count":6},{"year":2023,"cited_by_count":2}],"updated_date":"2026-04-01T17:29:45.350535","created_date":"2025-10-10T00:00:00"}
