{"id":"https://openalex.org/W4391307167","doi":"https://doi.org/10.1109/icspcc59353.2023.10400358","title":"Cycle-Consistent Generative Adversarial Network Architectures for Audio Visual Speech Recognition","display_name":"Cycle-Consistent Generative Adversarial Network Architectures for Audio Visual Speech Recognition","publication_year":2023,"publication_date":"2023-11-14","ids":{"openalex":"https://openalex.org/W4391307167","doi":"https://doi.org/10.1109/icspcc59353.2023.10400358"},"language":"en","primary_location":{"id":"doi:10.1109/icspcc59353.2023.10400358","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/icspcc59353.2023.10400358","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2023 IEEE International Conference on Signal Processing, Communications and Computing (ICSPCC)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101865953","display_name":"Yibo He","orcid":"https://orcid.org/0000-0002-6306-0647"},"institutions":[{"id":"https://openalex.org/I69356397","display_name":"Xi\u2019an Jiaotong-Liverpool University","ror":"https://ror.org/03zmrmn05","country_code":"CN","type":"education","lineage":["https://openalex.org/I69356397"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Yibo He","raw_affiliation_strings":["School of AI and Advanced Computing, Xi&#x0027;an Jiaotong Liverpool University (XJTLU),Suzhou,China"],"affiliations":[{"raw_affiliation_string":"School of AI and Advanced Computing, Xi&#x0027;an Jiaotong Liverpool University (XJTLU),Suzhou,China","institution_ids":["https://openalex.org/I69356397"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111525101","display_name":"Kah Phooi Seng","orcid":null},"institutions":[{"id":"https://openalex.org/I69356397","display_name":"Xi\u2019an Jiaotong-Liverpool University","ror":"https://ror.org/03zmrmn05","country_code":"CN","type":"education","lineage":["https://openalex.org/I69356397"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Kah Phooi Seng","raw_affiliation_strings":["School of AI and Advanced Computing, Xi&#x0027;an Jiaotong Liverpool University (XJTLU),Suzhou,China"],"affiliations":[{"raw_affiliation_string":"School of AI and Advanced Computing, Xi&#x0027;an Jiaotong Liverpool University (XJTLU),Suzhou,China","institution_ids":["https://openalex.org/I69356397"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5073635669","display_name":"Li-Minn Ang","orcid":"https://orcid.org/0000-0002-2402-7529"},"institutions":[{"id":"https://openalex.org/I174025329","display_name":"University of the Sunshine Coast","ror":"https://ror.org/016gb9e15","country_code":"AU","type":"education","lineage":["https://openalex.org/I174025329"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Li-minn Ang","raw_affiliation_strings":["School of Science, Technology and Engineering, University of the Sunshine Coast,Queensland,Australia","School of Science, Technology and Engineering, University of the Sunshine Coast, Queensland, Australia"],"affiliations":[{"raw_affiliation_string":"School of Science, Technology and Engineering, University of the Sunshine Coast,Queensland,Australia","institution_ids":["https://openalex.org/I174025329"]},{"raw_affiliation_string":"School of Science, Technology and Engineering, University of the Sunshine Coast, Queensland, Australia","institution_ids":["https://openalex.org/I174025329"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100635681","display_name":"Xingyu Zhao","orcid":"https://orcid.org/0000-0002-3474-349X"},"institutions":[{"id":"https://openalex.org/I146655781","display_name":"University of Liverpool","ror":"https://ror.org/04xs57h96","country_code":"GB","type":"education","lineage":["https://openalex.org/I146655781"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Xingyu Zhao","raw_affiliation_strings":["University of Liverpool,Dept of Computer Science,Liverpool,UK","Dept of Computer Science, University of Liverpool, Liverpool, UK"],"affiliations":[{"raw_affiliation_string":"University of Liverpool,Dept of Computer Science,Liverpool,UK","institution_ids":["https://openalex.org/I146655781"]},{"raw_affiliation_string":"Dept of Computer Science, University of Liverpool, Liverpool, UK","institution_ids":["https://openalex.org/I146655781"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5101865953"],"corresponding_institution_ids":["https://openalex.org/I69356397"],"apc_list":null,"apc_paid":null,"fwci":0.2033,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.49890468,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"6"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9988999962806702,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9988999962806702,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10688","display_name":"Image and Signal Denoising Methods","score":0.9772999882698059,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9704999923706055,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.75754714012146},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6814454793930054},{"id":"https://openalex.org/keywords/adversarial-system","display_name":"Adversarial system","score":0.6311628222465515},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.6004496812820435},{"id":"https://openalex.org/keywords/audio-visual","display_name":"Audio visual","score":0.5819895267486572},{"id":"https://openalex.org/keywords/generative-adversarial-network","display_name":"Generative adversarial network","score":0.48889079689979553},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3621874451637268},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.20556941628456116},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.19137045741081238}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.75754714012146},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6814454793930054},{"id":"https://openalex.org/C37736160","wikidata":"https://www.wikidata.org/wiki/Q1801315","display_name":"Adversarial system","level":2,"score":0.6311628222465515},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.6004496812820435},{"id":"https://openalex.org/C3017588708","wikidata":"https://www.wikidata.org/wiki/Q758901","display_name":"Audio visual","level":2,"score":0.5819895267486572},{"id":"https://openalex.org/C2988773926","wikidata":"https://www.wikidata.org/wiki/Q25104379","display_name":"Generative adversarial network","level":3,"score":0.48889079689979553},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3621874451637268},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.20556941628456116},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.19137045741081238}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icspcc59353.2023.10400358","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/icspcc59353.2023.10400358","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2023 IEEE International Conference on Signal Processing, Communications and Computing (ICSPCC)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":19,"referenced_works":["https://openalex.org/W2024868105","https://openalex.org/W2121486117","https://openalex.org/W2171490696","https://openalex.org/W2622064152","https://openalex.org/W2890952074","https://openalex.org/W2891158090","https://openalex.org/W2936774411","https://openalex.org/W2962770929","https://openalex.org/W2962793481","https://openalex.org/W2962974533","https://openalex.org/W2963373786","https://openalex.org/W2963528589","https://openalex.org/W2963785710","https://openalex.org/W3015356123","https://openalex.org/W3043547428","https://openalex.org/W3101639073","https://openalex.org/W3163527109","https://openalex.org/W4298112588","https://openalex.org/W6718379498"],"related_works":["https://openalex.org/W2888032422","https://openalex.org/W2996316059","https://openalex.org/W3178813832","https://openalex.org/W4385421777","https://openalex.org/W4377980832","https://openalex.org/W2897769091","https://openalex.org/W2971552217","https://openalex.org/W3005996785","https://openalex.org/W4297411772","https://openalex.org/W4226298148"],"abstract_inverted_index":{"Generative":[0],"Adversarial":[1,95],"Networks":[2,96],"(GANs)":[3],"have":[4],"found":[5],"extensive":[6],"applications":[7],"in":[8,36,66,72,140,164,185],"image":[9,12],"classification":[10],"and":[11,20,47,76,110,119,127,183,194],"generation":[13],"domains.":[14],"Nevertheless,":[15],"their":[16],"utilisation":[17],"for":[18],"recognising":[19],"detecting":[21],"multimodal":[22,37],"images":[23,150],"presents":[24],"considerable":[25],"difficulties.":[26],"Audio":[27],"Visual":[28],"Speech":[29],"Recognition":[30],"(AVSR)":[31],"is":[32,59],"a":[33,86,91],"classic":[34],"task":[35],"audio-visual":[38],"sensing,":[39],"which":[40],"leverages":[41],"audio":[42],"inputs":[43,50],"from":[44,51,176],"human":[45],"speech":[46],"aligned":[48],"visual":[49,101],"lip":[52],"movements.":[53],"However,":[54],"the":[55,62,100,117,125,131,135,148,154,161,165,170,186,192,197],"performance":[56],"of":[57,121,130,196],"AVSR":[58,87,136],"impacted":[60],"by":[61],"inherent":[63,162],"discrepancies":[64],"present":[65],"real-world":[67],"environments,":[68],"such":[69,107],"as":[70,108],"variations":[71],"lighting":[73],"intensity,":[74],"noise,":[75],"sampling":[77],"devices.":[78],"To":[79,168],"mitigate":[80],"these":[81],"challenges,":[82],"this":[83],"paper":[84],"proposed":[85],"architecture":[88],"based":[89],"on":[90,99],"specially":[92,155],"constructed":[93,156],"Cycle-Consistent":[94],"(CycleGAN).":[97],"First,":[98],"side,":[102],"we":[103,146,172],"used":[104,173],"data-augmentation":[105],"methods":[106],"flipping":[109],"rotating":[111],"to":[112,159],"process":[113],"video":[114],"data,":[115],"increasing":[116],"number":[118],"variety":[120],"samples.":[122],"This":[123],"increases":[124],"robustness":[126],"generalisation":[128],"capabilities":[129],"model.":[132],"Then,":[133],"since":[134],"dataset":[137],"was":[138],"collected":[139],"different":[141,144,166],"environments":[142],"with":[143],"styles,":[145],"transformed":[147],"original":[149],"multiple":[151],"times":[152],"through":[153],"CycleGAN":[157],"module":[158],"address":[160],"differences":[163],"environments.":[167],"validate":[169,191],"approaches,":[171],"augmented":[174],"data":[175],"well-known":[177],"datasets":[178],"(LRS2-Lip":[179],"Reading":[180],"Sentences":[181],"2":[182],"LRS3)":[184],"training":[187],"process.":[188],"Experimental":[189],"results":[190],"correctness":[193],"effectiveness":[195],"approach.":[198]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2025-12-23T23:11:35.936235","created_date":"2025-10-10T00:00:00"}
