{"id":"https://openalex.org/W4379116631","doi":"https://doi.org/10.1109/tmm.2023.3282488","title":"Explainability of Speech Recognition Transformers via Gradient-Based Attention Visualization","display_name":"Explainability of Speech Recognition Transformers via Gradient-Based Attention Visualization","publication_year":2023,"publication_date":"2023-06-02","ids":{"openalex":"https://openalex.org/W4379116631","doi":"https://doi.org/10.1109/tmm.2023.3282488"},"language":"en","primary_location":{"id":"doi:10.1109/tmm.2023.3282488","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tmm.2023.3282488","pdf_url":null,"source":{"id":"https://openalex.org/S137030581","display_name":"IEEE Transactions on Multimedia","issn_l":"1520-9210","issn":["1520-9210","1941-0077"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Multimedia","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5014852037","display_name":"Tianli Sun","orcid":null},"institutions":[{"id":"https://openalex.org/I116953780","display_name":"Tongji University","ror":"https://ror.org/03rc6as71","country_code":"CN","type":"education","lineage":["https://openalex.org/I116953780"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Tianli Sun","raw_affiliation_strings":["School of Electronic and Information Engineering, Tongji University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"School of Electronic and Information Engineering, Tongji University, Shanghai, China","institution_ids":["https://openalex.org/I116953780"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100767769","display_name":"Haonan Chen","orcid":"https://orcid.org/0000-0001-8885-5051"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Haonan Chen","raw_affiliation_strings":["Alibaba Group, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group, Hangzhou, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5075333422","display_name":"Guosheng Hu","orcid":"https://orcid.org/0000-0002-9448-9892"},"institutions":[{"id":"https://openalex.org/I4210116582","display_name":"Quadient (United Kingdom)","ror":"https://ror.org/01p5m7j86","country_code":"GB","type":"company","lineage":["https://openalex.org/I4210116582","https://openalex.org/I4210124977"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Guosheng Hu","raw_affiliation_strings":["Oosto, London, U.K"],"affiliations":[{"raw_affiliation_string":"Oosto, London, U.K","institution_ids":["https://openalex.org/I4210116582"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5091130234","display_name":"Lianghua He","orcid":"https://orcid.org/0000-0002-5250-170X"},"institutions":[{"id":"https://openalex.org/I116953780","display_name":"Tongji University","ror":"https://ror.org/03rc6as71","country_code":"CN","type":"education","lineage":["https://openalex.org/I116953780"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Lianghua He","raw_affiliation_strings":["School of Electronic and Information Engineering, Tongji University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"School of Electronic and Information Engineering, Tongji University, Shanghai, China","institution_ids":["https://openalex.org/I116953780"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5064237960","display_name":"Cairong Zhao","orcid":"https://orcid.org/0000-0001-6745-9674"},"institutions":[{"id":"https://openalex.org/I116953780","display_name":"Tongji University","ror":"https://ror.org/03rc6as71","country_code":"CN","type":"education","lineage":["https://openalex.org/I116953780"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Cairong Zhao","raw_affiliation_strings":["School of Electronic and Information Engineering, Tongji University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"School of Electronic and Information Engineering, Tongji University, Shanghai, China","institution_ids":["https://openalex.org/I116953780"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5014852037"],"corresponding_institution_ids":["https://openalex.org/I116953780"],"apc_list":null,"apc_paid":null,"fwci":1.0332,"has_fulltext":false,"cited_by_count":6,"citation_normalized_percentile":{"value":0.80736818,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":94,"max":98},"biblio":{"volume":"26","issue":null,"first_page":"1395","last_page":"1406"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.9991000294685364,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.9991000294685364,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.995199978351593,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9876999855041504,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8413988947868347},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.7801387310028076},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.7550941705703735},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5275715589523315},{"id":"https://openalex.org/keywords/sentence","display_name":"Sentence","score":0.5011096000671387},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.4910557270050049},{"id":"https://openalex.org/keywords/homophone","display_name":"Homophone","score":0.4298766851425171},{"id":"https://openalex.org/keywords/hidden-markov-model","display_name":"Hidden Markov model","score":0.42744946479797363},{"id":"https://openalex.org/keywords/preprocessor","display_name":"Preprocessor","score":0.4194405674934387},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.3862033486366272}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8413988947868347},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.7801387310028076},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.7550941705703735},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5275715589523315},{"id":"https://openalex.org/C2777530160","wikidata":"https://www.wikidata.org/wiki/Q41796","display_name":"Sentence","level":2,"score":0.5011096000671387},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.4910557270050049},{"id":"https://openalex.org/C160253069","wikidata":"https://www.wikidata.org/wiki/Q221079","display_name":"Homophone","level":2,"score":0.4298766851425171},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.42744946479797363},{"id":"https://openalex.org/C34736171","wikidata":"https://www.wikidata.org/wiki/Q918333","display_name":"Preprocessor","level":2,"score":0.4194405674934387},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3862033486366272},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tmm.2023.3282488","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tmm.2023.3282488","pdf_url":null,"source":{"id":"https://openalex.org/S137030581","display_name":"IEEE Transactions on Multimedia","issn_l":"1520-9210","issn":["1520-9210","1941-0077"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Multimedia","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.6600000262260437,"display_name":"Quality Education"}],"awards":[{"id":"https://openalex.org/G1733004084","display_name":null,"funder_award_id":"61673299","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G23458378","display_name":null,"funder_award_id":"61976160","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G3308686228","display_name":null,"funder_award_id":"22ZR1466700","funder_id":"https://openalex.org/F4320309612","funder_display_name":"Natural Science Foundation of Shanghai"},{"id":"https://openalex.org/G5036809106","display_name":null,"funder_award_id":"62076182","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G5686205310","display_name":null,"funder_award_id":"62076184","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320309612","display_name":"Natural Science Foundation of Shanghai","ror":null},{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320335787","display_name":"Fundamental Research Funds for the Central Universities","ror":null}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":33,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W1787224781","https://openalex.org/W1932968309","https://openalex.org/W2127141656","https://openalex.org/W2195388612","https://openalex.org/W2295107390","https://openalex.org/W2600144439","https://openalex.org/W2766219058","https://openalex.org/W2892009249","https://openalex.org/W2936774411","https://openalex.org/W2946794439","https://openalex.org/W2962778134","https://openalex.org/W2962858109","https://openalex.org/W2963242190","https://openalex.org/W2963250244","https://openalex.org/W2964110616","https://openalex.org/W2970726176","https://openalex.org/W3015412285","https://openalex.org/W3015537910","https://openalex.org/W3016010032","https://openalex.org/W3035422918","https://openalex.org/W3091150073","https://openalex.org/W3095173472","https://openalex.org/W3097777922","https://openalex.org/W3176196997","https://openalex.org/W3192884235","https://openalex.org/W3197478142","https://openalex.org/W4230405732","https://openalex.org/W4312820606","https://openalex.org/W6739901393","https://openalex.org/W6764072591","https://openalex.org/W6771467084","https://openalex.org/W6784333009"],"related_works":["https://openalex.org/W2358775429","https://openalex.org/W1567440624","https://openalex.org/W2364211260","https://openalex.org/W1977345729","https://openalex.org/W2105439214","https://openalex.org/W2079548008","https://openalex.org/W2030399736","https://openalex.org/W1985374210","https://openalex.org/W4245032792","https://openalex.org/W2371388421"],"abstract_inverted_index":{"In":[0,78,149],"vision":[1,57],"Transformers,":[2],"attention":[3,70,85,126,140,147,164,180,203],"visualization":[4,71,86,127,156],"methods":[5,72],"are":[6],"used":[7],"to":[8,51,93,167,173,230,238],"generate":[9],"heatmaps":[10],"highlighting":[11],"the":[12,23,65,98,102,107,116,119,139,153,186,192,202,209,214,227],"class-corresponding":[13],"areas":[14],"in":[15,75,88,101,158],"input":[16,49,66],"images,":[17],"which":[18,95,137,183],"offers":[19],"explanations":[20],"on":[21,204],"how":[22,226],"models":[24],"make":[25],"predictions.":[26],"However,":[27],"it":[28],"is":[29,131],"not":[30],"so":[31],"applicable":[32],"for":[33,47,64],"explaining":[34],"automatic":[35],"speech":[36],"recognition":[37],"(ASR)":[38],"Transformers.":[39,77],"An":[40],"ASR":[41,76,89,120,176],"Transformer":[42,58,125,146],"makes":[43,60],"a":[44,53,56,83],"particular":[45],"prediction":[46],"every":[48],"token":[50],"form":[52],"sentence,":[54],"but":[55],"only":[59],"an":[61,175],"overall":[62],"classification":[63,170],"data.":[67],"Therefore,":[68],"traditional":[69],"may":[73],"fail":[74],"this":[79],"work,":[80],"we":[81,110,151,224],"propose":[82],"novel":[84],"method":[87,130],"Transformers":[90],"and":[91,134,194,216],"try":[92],"explain":[94],"frames":[96],"of":[97,114,118,145,155,191],"audio":[99],"result":[100,157],"output":[103],"text.":[104],"Inspired":[105],"by":[106,211],"model":[108,177,193,210,228],"explainability,":[109],"also":[111],"explore":[112],"ways":[113],"improving":[115],"effectiveness":[117],"model.":[121],"Comparing":[122],"with":[123,165,178,236],"other":[124],"methods,":[128],"our":[129],"more":[132],"efficient":[133],"intuitively":[135],"understandable,":[136],"unravels":[138],"calculation":[141],"from":[142],"information":[143],"flow":[144],"modules.":[148],"addition,":[150],"demonstrate":[152],"utilization":[154],"three":[159],"ways:":[160],"(1)":[161],"We":[162,200],"visualize":[163,201],"respect":[166,237],"connectionist":[168],"temporal":[169],"(CTC)":[171],"loss":[172],"train":[174],"adversarial":[179],"erasing":[181],"regularization,":[182],"effectively":[184,212],"decreases":[185],"word":[187],"error":[188],"rate":[189],"(WER)":[190],"improves":[195],"its":[196],"generalization":[197],"capability.":[198],"(2)":[199],"some":[205],"specific":[206],"words,":[207],"interpreting":[208],"demonstrating":[213],"semantic":[215],"grammar":[217],"relationships":[218],"between":[219],"these":[220],"words.":[221],"(3)":[222],"Similarly,":[223],"analyze":[225],"manage":[229],"distinguish":[231],"homophones,":[232],"using":[233],"contrastive":[234],"explanation":[235],"homophones.":[239]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":3},{"year":2024,"cited_by_count":2}],"updated_date":"2026-04-14T08:04:32.555800","created_date":"2025-10-10T00:00:00"}
