{"id":"https://openalex.org/W2092776130","doi":"https://doi.org/10.1109/slt.2012.6424213","title":"Audio-visual feature integration based on piecewise linear transformation for noise robust automatic speech recognition","display_name":"Audio-visual feature integration based on piecewise linear transformation for noise robust automatic speech recognition","publication_year":2012,"publication_date":"2012-12-01","ids":{"openalex":"https://openalex.org/W2092776130","doi":"https://doi.org/10.1109/slt.2012.6424213","mag":"2092776130"},"language":"en","primary_location":{"id":"doi:10.1109/slt.2012.6424213","is_oa":false,"landing_page_url":"https://doi.org/10.1109/slt.2012.6424213","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2012 IEEE Spoken Language Technology Workshop (SLT)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5109644502","display_name":"Yosuke Kashiwagi","orcid":null},"institutions":[{"id":"https://openalex.org/I74801974","display_name":"The University of Tokyo","ror":"https://ror.org/057zh3y96","country_code":"JP","type":"education","lineage":["https://openalex.org/I74801974"]}],"countries":["JP"],"is_corresponding":true,"raw_author_name":"Yosuke Kashiwagi","raw_affiliation_strings":["Graduate School of Information Science and Technology, University of Tokyo, Japan","Graduate School of Information Science and Technology"],"affiliations":[{"raw_affiliation_string":"Graduate School of Information Science and Technology, University of Tokyo, Japan","institution_ids":["https://openalex.org/I74801974"]},{"raw_affiliation_string":"Graduate School of Information Science and Technology","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5028421346","display_name":"Masayuki Suzuki","orcid":"https://orcid.org/0000-0002-0436-1490"},"institutions":[{"id":"https://openalex.org/I74801974","display_name":"The University of Tokyo","ror":"https://ror.org/057zh3y96","country_code":"JP","type":"education","lineage":["https://openalex.org/I74801974"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Masayuki Suzuki","raw_affiliation_strings":["Graduate School of Engineering, University of Tokyo, Japan","Graduate school of Engineering, The University of Tokyo, Japan"],"affiliations":[{"raw_affiliation_string":"Graduate School of Engineering, University of Tokyo, Japan","institution_ids":["https://openalex.org/I74801974"]},{"raw_affiliation_string":"Graduate school of Engineering, The University of Tokyo, Japan","institution_ids":["https://openalex.org/I74801974"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5041213266","display_name":"Nobuaki Minematsu","orcid":"https://orcid.org/0000-0002-8778-9555"},"institutions":[{"id":"https://openalex.org/I74801974","display_name":"The University of Tokyo","ror":"https://ror.org/057zh3y96","country_code":"JP","type":"education","lineage":["https://openalex.org/I74801974"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Nobuaki Minematsu","raw_affiliation_strings":["Graduate School of Engineering, University of Tokyo, Japan","Graduate school of Engineering, The University of Tokyo, Japan"],"affiliations":[{"raw_affiliation_string":"Graduate School of Engineering, University of Tokyo, Japan","institution_ids":["https://openalex.org/I74801974"]},{"raw_affiliation_string":"Graduate school of Engineering, The University of Tokyo, Japan","institution_ids":["https://openalex.org/I74801974"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5108197272","display_name":"Keikichi Hirose","orcid":null},"institutions":[{"id":"https://openalex.org/I74801974","display_name":"The University of Tokyo","ror":"https://ror.org/057zh3y96","country_code":"JP","type":"education","lineage":["https://openalex.org/I74801974"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Keikichi Hirose","raw_affiliation_strings":["Graduate School of Information Science and Technology, University of Tokyo, Japan","Graduate School of Information Science and Technology"],"affiliations":[{"raw_affiliation_string":"Graduate School of Information Science and Technology, University of Tokyo, Japan","institution_ids":["https://openalex.org/I74801974"]},{"raw_affiliation_string":"Graduate School of Information Science and Technology","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5109644502"],"corresponding_institution_ids":["https://openalex.org/I74801974"],"apc_list":null,"apc_paid":null,"fwci":0.7432,"has_fulltext":false,"cited_by_count":7,"citation_normalized_percentile":{"value":0.74919452,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":96},"biblio":{"volume":null,"issue":null,"first_page":"149","last_page":"152"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9977999925613403,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9976999759674072,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7887939214706421},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.7107976675033569},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.6781864166259766},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.5412437915802002},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5308958888053894},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.5229388475418091},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.5160388946533203},{"id":"https://openalex.org/keywords/hidden-markov-model","display_name":"Hidden Markov model","score":0.4749733805656433},{"id":"https://openalex.org/keywords/audio-mining","display_name":"Audio mining","score":0.43816035985946655},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.3710958957672119},{"id":"https://openalex.org/keywords/acoustic-model","display_name":"Acoustic model","score":0.2623775601387024},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.12220478057861328}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7887939214706421},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.7107976675033569},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.6781864166259766},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.5412437915802002},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5308958888053894},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.5229388475418091},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.5160388946533203},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.4749733805656433},{"id":"https://openalex.org/C157968479","wikidata":"https://www.wikidata.org/wiki/Q3079876","display_name":"Audio mining","level":4,"score":0.43816035985946655},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.3710958957672119},{"id":"https://openalex.org/C155635449","wikidata":"https://www.wikidata.org/wiki/Q4674699","display_name":"Acoustic model","level":3,"score":0.2623775601387024},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.12220478057861328},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C104317684","wikidata":"https://www.wikidata.org/wiki/Q7187","display_name":"Gene","level":2,"score":0.0},{"id":"https://openalex.org/C55493867","wikidata":"https://www.wikidata.org/wiki/Q7094","display_name":"Biochemistry","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/slt.2012.6424213","is_oa":false,"landing_page_url":"https://doi.org/10.1109/slt.2012.6424213","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2012 IEEE Spoken Language Technology Workshop (SLT)","raw_type":"proceedings-article"},{"id":"pmh:oai:CiteSeerX.psu:10.1.1.376.5866","is_oa":false,"landing_page_url":"http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.376.5866","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"http://www.gavo.t.u-tokyo.ac.jp/~mine/paper/PDF/2012/SLT_p149-152_t2012-12.pdf","raw_type":"text"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions","score":0.7300000190734863}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":19,"referenced_works":["https://openalex.org/W82246966","https://openalex.org/W96504731","https://openalex.org/W1140471238","https://openalex.org/W1929897159","https://openalex.org/W2096391593","https://openalex.org/W2096703356","https://openalex.org/W2098693229","https://openalex.org/W2102863932","https://openalex.org/W2116529913","https://openalex.org/W2121486117","https://openalex.org/W2172803778","https://openalex.org/W2182517799","https://openalex.org/W2397904770","https://openalex.org/W4285719527","https://openalex.org/W6603377551","https://openalex.org/W6603922934","https://openalex.org/W6675533774","https://openalex.org/W6677131165","https://openalex.org/W6712444147"],"related_works":["https://openalex.org/W3159882232","https://openalex.org/W4241650944","https://openalex.org/W4243125559","https://openalex.org/W4241778367","https://openalex.org/W2752054555","https://openalex.org/W2527828870","https://openalex.org/W2184127972","https://openalex.org/W4238262908","https://openalex.org/W4214489098","https://openalex.org/W46679383"],"abstract_inverted_index":{"Multimodal":[0,24],"speech":[1,12,166],"recognition":[2,13,167],"is":[3,16,74,103,180],"a":[4,119,146,187],"promising":[5],"approach":[6],"to":[7,34,45,55,86,96,131,171,186],"realize":[8],"noise":[9,47,88,101],"robust":[10],"automatic":[11],"(ASR),":[14],"and":[15,71,79,90,105,160,173],"currently":[17],"gathering":[18],"the":[19,66,87,100,142],"attention":[20],"of":[21,69,77,148],"many":[22],"researchers.":[23],"ASR":[25],"utilizes":[26],"not":[27],"only":[28],"audio":[29,70,78],"features,":[30,58],"which":[31,153],"are":[32,60,168],"sensitive":[33],"background":[35],"noises,":[36],"but":[37,108],"also":[38],"non-audio":[39],"features":[40,73,81,89,98],"such":[41],"as":[42,184],"lip":[43],"shapes":[44],"achieve":[46],"robustness.":[48],"Although":[49],"various":[50],"methods":[51],"have":[52],"been":[53],"proposed":[54,137],"integrate":[56],"audio-visual":[57],"there":[59],"still":[61],"continuing":[62],"discussions":[63],"on":[64,122,145,157,164],"how":[65,109],"vest":[67],"integration":[68,134],"visual":[72,80,97],"realized.":[75],"Weights":[76],"should":[82],"be":[83,112],"decided":[84],"according":[85],"levels:":[91],"in":[92,126,182],"general,":[93],"larger":[94],"weights":[95],"when":[99],"level":[102],"low":[104],"vice":[106],"versa,":[107],"it":[110],"can":[111,139],"controlled?":[113],"In":[114,129],"this":[115],"paper,":[116],"we":[117],"propose":[118],"method":[120,138],"based":[121],"piecewise":[123],"linear":[124],"transformation":[125],"feature":[127,133],"integration.":[128],"contrast":[130],"other":[132],"methods,":[135],"our":[136],"appropriately":[140],"change":[141],"weight":[143],"depending":[144],"state":[147],"an":[149],"observed":[150],"noisy":[151,165],"feature,":[152],"has":[154],"information":[155],"both":[156],"uttered":[158],"phonemes":[159],"environmental":[161],"noise.":[162],"Experiments":[163],"conducted":[169],"following":[170],"CENSREC-1-AV,":[172],"word":[174],"error":[175],"reduction":[176],"rate":[177],"around":[178],"24%":[179],"realized":[181],"average":[183],"compared":[185],"decision":[188],"fusion":[189],"method.":[190]},"counts_by_year":[{"year":2017,"cited_by_count":2},{"year":2016,"cited_by_count":2},{"year":2015,"cited_by_count":1},{"year":2014,"cited_by_count":2}],"updated_date":"2026-04-05T17:49:38.594831","created_date":"2025-10-10T00:00:00"}
