{"id":"https://openalex.org/W12573367","doi":"https://doi.org/10.21437/interspeech.2004-418","title":"Modeling audio-visual speech perception: back on fusion architectures and fusion control","display_name":"Modeling audio-visual speech perception: back on fusion architectures and fusion control","publication_year":2004,"publication_date":"2004-10-04","ids":{"openalex":"https://openalex.org/W12573367","doi":"https://doi.org/10.21437/interspeech.2004-418","mag":"12573367"},"language":"en","primary_location":{"id":"doi:10.21437/interspeech.2004-418","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2004-418","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2004","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5001142101","display_name":"Jean\u2010Luc Schwartz","orcid":"https://orcid.org/0000-0001-8969-9185"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jean-Luc Schwartz","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5089907923","display_name":"Marie Cathiard","orcid":null},"institutions":[{"id":"https://openalex.org/I106785703","display_name":"Institut polytechnique de Grenoble","ror":"https://ror.org/05sbt2524","country_code":"FR","type":"education","lineage":["https://openalex.org/I106785703","https://openalex.org/I899635006"]}],"countries":["FR"],"is_corresponding":false,"raw_author_name":"Marie Cathiard","raw_affiliation_strings":["[Grenoble Institute of Technology]"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"[Grenoble Institute of Technology]","institution_ids":["https://openalex.org/I106785703"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.5735,"has_fulltext":false,"cited_by_count":6,"citation_normalized_percentile":{"value":0.64093714,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":94},"biblio":{"volume":null,"issue":null,"first_page":"2017","last_page":"2020"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9983999729156494,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9983999729156494,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12032","display_name":"Multisensory perception and integration","score":0.9965000152587891,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10403","display_name":"Phonetics and Phonology Research","score":0.9947999715805054,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7210863828659058},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.655182421207428},{"id":"https://openalex.org/keywords/architecture","display_name":"Architecture","score":0.5826308727264404},{"id":"https://openalex.org/keywords/audio-visual","display_name":"Audio visual","score":0.5756257176399231},{"id":"https://openalex.org/keywords/sensor-fusion","display_name":"Sensor fusion","score":0.503020703792572},{"id":"https://openalex.org/keywords/visual-perception","display_name":"Visual perception","score":0.4657994210720062},{"id":"https://openalex.org/keywords/control","display_name":"Control (management)","score":0.4419782757759094},{"id":"https://openalex.org/keywords/presentation","display_name":"Presentation (obstetrics)","score":0.4393921196460724},{"id":"https://openalex.org/keywords/cognition","display_name":"Cognition","score":0.43902498483657837},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.41103285551071167},{"id":"https://openalex.org/keywords/cognitive-science","display_name":"Cognitive science","score":0.4029935896396637},{"id":"https://openalex.org/keywords/human\u2013computer-interaction","display_name":"Human\u2013computer interaction","score":0.3813525438308716},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3599494695663452},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.3564280569553375},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.17786595225334167},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.16001757979393005}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7210863828659058},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.655182421207428},{"id":"https://openalex.org/C123657996","wikidata":"https://www.wikidata.org/wiki/Q12271","display_name":"Architecture","level":2,"score":0.5826308727264404},{"id":"https://openalex.org/C3017588708","wikidata":"https://www.wikidata.org/wiki/Q758901","display_name":"Audio visual","level":2,"score":0.5756257176399231},{"id":"https://openalex.org/C33954974","wikidata":"https://www.wikidata.org/wiki/Q486494","display_name":"Sensor fusion","level":2,"score":0.503020703792572},{"id":"https://openalex.org/C178253425","wikidata":"https://www.wikidata.org/wiki/Q162668","display_name":"Visual perception","level":3,"score":0.4657994210720062},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.4419782757759094},{"id":"https://openalex.org/C2777601897","wikidata":"https://www.wikidata.org/wiki/Q3409113","display_name":"Presentation (obstetrics)","level":2,"score":0.4393921196460724},{"id":"https://openalex.org/C169900460","wikidata":"https://www.wikidata.org/wiki/Q2200417","display_name":"Cognition","level":2,"score":0.43902498483657837},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.41103285551071167},{"id":"https://openalex.org/C188147891","wikidata":"https://www.wikidata.org/wiki/Q147638","display_name":"Cognitive science","level":1,"score":0.4029935896396637},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3813525438308716},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3599494695663452},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3564280569553375},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.17786595225334167},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.16001757979393005},{"id":"https://openalex.org/C71924100","wikidata":"https://www.wikidata.org/wiki/Q11190","display_name":"Medicine","level":0,"score":0.0},{"id":"https://openalex.org/C169760540","wikidata":"https://www.wikidata.org/wiki/Q207011","display_name":"Neuroscience","level":1,"score":0.0},{"id":"https://openalex.org/C153349607","wikidata":"https://www.wikidata.org/wiki/Q36649","display_name":"Visual arts","level":1,"score":0.0},{"id":"https://openalex.org/C126838900","wikidata":"https://www.wikidata.org/wiki/Q77604","display_name":"Radiology","level":1,"score":0.0},{"id":"https://openalex.org/C142362112","wikidata":"https://www.wikidata.org/wiki/Q735","display_name":"Art","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.21437/interspeech.2004-418","is_oa":false,"landing_page_url":"https://doi.org/10.21437/interspeech.2004-418","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Interspeech 2004","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.6600000262260437}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":1,"referenced_works":["https://openalex.org/W1975270274"],"related_works":["https://openalex.org/W4387426029","https://openalex.org/W2271369634","https://openalex.org/W3147472394","https://openalex.org/W2047100085","https://openalex.org/W2350550760","https://openalex.org/W4254162896","https://openalex.org/W4388792380","https://openalex.org/W578794879","https://openalex.org/W1477999932","https://openalex.org/W4386731653"],"abstract_inverted_index":{"In":[0,100,269,295,317,347,373,392,568],"a":[1,18,182,215,333,365,472,499,526,564,569,653,688,700,723,735,771,788,807],"review":[2,78],"paper":[3,75],"about":[4,47,542],"audio-visual":[5,98,105,118],"(AV)":[6],"fusion":[7,99,147],"models":[8,21,106,485],"in":[9,68,141,148,186,210,214,221,232,242,255,262,330,337,443,479,517,536,559,566,600,627,646,673,699,741,758,798,813],"speech":[10,108,535,716,757,816],"perception,":[11,109],"we":[12,37,446,576,777],"(Schwartz":[13,160,572],"et":[14,161,208,441,467,487,573,826,830],"al.,":[15,162,209,442,468,488,574,827,831],"1998)":[16],"proposed":[17,471],"taxonomy":[19],"of":[20,66,73,104,107,115,124,130,181,192,217,291,313,339,354,367,389,502,504,528,534,603,623,655,695,703,710,774,809,815],"around":[22],"two":[23,172,218],"basic":[24,158],"questions:":[25],"architecture":[26],"and":[27,52,58,81,127,144,251,277,326,370,420,428,470,523,547,593,706,739,760,820],"control.":[28],"Six":[29],"years":[30],"after,":[31],"it":[32,421,686],"appears":[33],"that":[34,154,168,238,282,414,426,451,583,765,776,784,791],"the":[35,86,113,125,189,199,224,227,229,249,263,270,275,286,288,296,301,318,324,340,348,376,382,390,394,410,453,505,532,585,617,633,648,683,692,708,714,727,744,747,769,799],"proposals":[36],"made":[38],"still":[39],"seem":[40,61],"rather":[41],"convenient":[42],"for":[43,97,117,396,463],"discussing":[44],"major":[45],"questions":[46,230],"AV":[48],"fusion.":[49,524],"Moreover":[50],"\u2013":[51,55],"more":[53],"importantly":[54],"recent":[56,482,529,570],"experimental":[57,643],"theoretical":[59],"progress":[60],"to":[62,77,82,175,226,343,406,554,563,579,590,608,615,640,681,755,781,787,805],"provide":[63,806],"some":[64,187,312],"elements":[65],"answer":[67],"both":[69,323],"aspects.":[70],"The":[71,94,135,179,491,495,597,642,675],"aim":[72],"this":[74,233,415,543,557,601],"is":[76,167,283,304,387,417,803],"these":[79,355,624,696],"elements,":[80],"incorporate":[83],"them":[84],"into":[85,306,311],"general":[87,136],"architecture-and-control":[88],"framework.":[89],"1.":[90],"FUSION":[91],"ARCHITECTURES":[92],"1.1.":[93],"four":[95,157,496],"architectures":[96,159,497],"his":[101],"well-known":[102],"presentation":[103],"Summerfield":[110],"(1987)":[111],"introduced":[112],"concept":[114],"\u201cmetrics":[116],"integration\u201d,":[119],"focusing":[120],"on":[121,138,145,364,531,652,734],"\u201cthe":[122],"representations":[123,279],"auditory":[126],"visual":[128,302,676,715],"streams":[129],"information":[131,149,511],"at":[132],"their":[133,256],"conflux\u201d.":[134],"literature":[137],"sensory":[139,257],"interactions":[140,751,797],"cognitive":[142],"psychology,":[143],"sensor":[146],"processing,":[150],"lead":[151,405,607],"us":[152],"conclude":[153],"there":[155,766],"are":[156,240,248,280,329],"1998).":[163],"Their":[164],"common":[165,177,500],"point":[166,790],"they":[169,239,260,613,785],"should":[170,403,792],"connect":[171],"separate":[173,219],"inputs":[174,253,328],"one":[176,408,725],"output.":[178],"conception":[180,216],"single":[183],"output":[184],"\u201cloosing\u201d":[185],"sense":[188],"monosensorial":[190,507],"nature":[191],"each":[193,518],"input":[194,225,303],"may":[195],"be":[196,458,513,793],"discussed":[197],"(see":[198,434],"\u201cconvergence":[200],"vs.":[201,378,384],"association":[202],"\u201d":[203],"debate":[204],"raised":[205,539],"by":[206,285,722],"Bernstein":[207],"press).":[211,444],"However,":[212,445,525,621],"even":[213],"routes":[220],"interaction":[222,522],"from":[223,474,476,795],"output,":[228],"addressed":[231],"section":[234],"remain":[235],"valid,":[236],"provided":[237,687],"rephrased":[241],"terms":[243,338,814],"of:":[244],"under":[245],"what":[246],"format":[247],"A":[250,276,325,369],"V":[252,278,327,371],"represented":[254],"pathway":[258],"when":[259],"interact":[261],"route":[264,494],"towards":[265],"phonology":[266],"or":[267,310,398,409],"lexicon?":[268],"Separate":[271],"Identification":[272,350],"(SI)":[273],"model,":[274,300,322,352],"phonetic,":[281],"mediated":[284],"knowledge":[287],"subject":[289],"has":[290,422],"his/her":[292],"own":[293,551],"language.":[294],"Dominant":[297],"Recoding":[298,320],"(DR)":[299],"recoded":[305],"an":[307,580],"equivalent":[308],"sound,":[309],"its":[314],"spectro-temporal":[315],"characteristics.":[316],"Motor":[319],"(MR)":[321],"contact":[331,789],"with":[332,481,726,743],"system":[334],"analysing":[335],"percepts":[336],"action":[341],"able":[342],"have":[344,447,538],"produced":[345],"them.":[346],"Direct":[349],"(DI)":[351],"none":[353],"process":[356],"occur":[357,753],"before":[358,521],"phonetic":[359],"identification":[360,694,709],"which":[361],"operates":[362],"directly":[363],"set":[366,602,773],"joined":[368],"parameters.":[372],"our":[374],"view,":[375],"static":[377,397],"dynamic":[379,399,464],"issue":[380],"(or":[381],"shape":[383,456],"movement":[385,427],"debate)":[386],"independent":[388],"architecture.":[391,412],"consequence,":[393],"preference":[395],"parameters,":[400],"if":[401,556],"any,":[402],"not":[404,606,679],"select":[407],"other":[411],"Notice":[413],"position":[416],"itself":[418],"controversial,":[419],"been":[423],"often":[424],"argued":[425],"motor":[429],"representation":[430],"were":[431,636],"linked":[432],"topics":[433],"e.g.":[435,662],"Rosenblum":[436],"&":[437],"Saldana,":[438],"1998;":[439],"Whalen":[440],"several":[448],"times":[449],"advocated":[450],"recovering":[452],"vocal":[454],"tract":[455],"could":[457,561,605],"done":[459],"without":[460],"necessarily":[461],"calling":[462],"features":[465],"(Cathiard":[466,486],"1996),":[469],"\u201cshape":[473],"shading":[475],"movement\u201d":[477],"approach":[478],"line":[480],"neurophysiological":[483],"computational":[484],"2003).":[489,834],"1.2.":[490],"\u201cvery":[492,779],"early\u201d":[493,780],"share":[498],"assumption":[501,544],"independence":[503],"primitive":[506],"processing.":[508],"That":[509],"is,":[510,767],"would":[512],"first":[514],"extracted":[515,638],"separately":[516],"sensorial":[519],"channel":[520],"number":[527,654,808],"studies":[530],"detection":[533,560],"noise":[537,628,759],"serious":[540],"doubts":[541],"(since":[545],"Grant":[546],"Seitz,":[548],"2000).":[549],"Our":[550],"contribution":[552],"was":[553,629],"determine":[555],"gain":[558,565],"contribute":[562],"identification.":[567],"study":[571],"2004),":[575],"showed,":[577],"thanks":[578,639],"original":[581],"paradigm,":[582],"seeing":[584],"speaker\u2019s":[586],"lips":[587],"does":[588],"enable":[589,680],"better":[591,595,637],"hear":[592],"hence":[594],"understand.":[596],"stimuli":[598,625,697],"used":[599],"experiments":[604],"lipreading":[609],"per":[610],"se":[611],"since":[612],"corresponded":[614],"exactly":[616],"same":[618,649,728],"lip":[619,650,719],"gesture.":[620],"intelligibility":[622],"merged":[626],"improved":[630],"just":[631],"because":[632],"acoustic":[634],"cues":[635],"vision.":[641],"trick":[644],"consisted":[645],"dubbing":[647],"gesture":[651],"visually":[656],"similar":[657],"but":[658,685],"auditorily":[659],"different":[660],"configurations,":[661],"[y":[663],"u":[664],"ty":[665],"tu":[666],"ky":[667],"ku":[668],"dy":[669],"du":[670],"gy":[671],"gu]":[672],"French.":[674],"stimulus":[677],"did":[678],"identify":[682],"syllable,":[684],"temporal":[689,729],"cue":[690,717],"improving":[691],"audio":[693],"embedded":[698],"large":[701],"level":[702],"cocktail-party":[704],"noise,":[705],"particularly":[707],"plosive":[711],"voicing.":[712],"Replacing":[713],"(the":[718],"rounding":[720],"gesture)":[721],"non-speech":[724],"pattern":[730],"(a":[731],"red":[732],"bar":[733],"black":[736],"background,":[737],"increasing":[738],"decreasing":[740],"synchrony":[742],"lips)":[745],"removed":[746],"benefit.":[748],"Therefore,":[749],"cross-modal":[750],"can":[752],"early":[754,796],"enhance":[756],"improve":[761],"intelligibility.":[762],"This":[763,802],"indicates":[764],"whatever":[768],"architecture,":[770],"preliminary":[772],"interactions,":[775],"called":[778],"make":[782],"clear":[783],"correspond":[786],"distinguished":[794],"classical":[800],"sense.":[801],"likely":[804],"interesting":[810],"technological":[811],"counterparts":[812],"enhancement,":[817],"source":[818],"separation":[819],"audiovisual":[821],"scene":[822],"analysis":[823],"(e.g.":[824],"Girin":[825],"2001;":[828],"Sodoyer":[829],"2002;":[832],"Berthommier,":[833]},"counts_by_year":[{"year":2024,"cited_by_count":1}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
