{"id":"https://openalex.org/W4327852215","doi":"https://doi.org/10.1109/taffc.2023.3258900","title":"Applying Segment-Level Attention on Bi-Modal Transformer Encoder for Audio-Visual Emotion Recognition","display_name":"Applying Segment-Level Attention on Bi-Modal Transformer Encoder for Audio-Visual Emotion Recognition","publication_year":2023,"publication_date":"2023-03-17","ids":{"openalex":"https://openalex.org/W4327852215","doi":"https://doi.org/10.1109/taffc.2023.3258900"},"language":"en","primary_location":{"id":"doi:10.1109/taffc.2023.3258900","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taffc.2023.3258900","pdf_url":null,"source":{"id":"https://openalex.org/S104780363","display_name":"IEEE Transactions on Affective Computing","issn_l":"1949-3045","issn":["1949-3045","2371-9850"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Affective Computing","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5019460894","display_name":"Jia-Hao Hsu","orcid":"https://orcid.org/0009-0008-5548-2509"},"institutions":[{"id":"https://openalex.org/I91807558","display_name":"National Cheng Kung University","ror":"https://ror.org/01b8kcc49","country_code":"TW","type":"education","lineage":["https://openalex.org/I91807558"]}],"countries":["TW"],"is_corresponding":false,"raw_author_name":"Jia-Hao Hsu","raw_affiliation_strings":["Graduate Computer Science and Information Engineering, National Cheng Kung University, Tainan, Taiwan"],"raw_orcid":"https://orcid.org/0009-0008-5548-2509","affiliations":[{"raw_affiliation_string":"Graduate Computer Science and Information Engineering, National Cheng Kung University, Tainan, Taiwan","institution_ids":["https://openalex.org/I91807558"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5103251327","display_name":"Chung\u2010Hsien Wu","orcid":"https://orcid.org/0000-0002-3947-2123"},"institutions":[{"id":"https://openalex.org/I91807558","display_name":"National Cheng Kung University","ror":"https://ror.org/01b8kcc49","country_code":"TW","type":"education","lineage":["https://openalex.org/I91807558"]}],"countries":["TW"],"is_corresponding":false,"raw_author_name":"Chung-Hsien Wu","raw_affiliation_strings":["Department of Computer Science and Information Engineering, National Cheng Kung University, Tainan, Taiwan"],"raw_orcid":"https://orcid.org/0000-0002-3947-2123","affiliations":[{"raw_affiliation_string":"Department of Computer Science and Information Engineering, National Cheng Kung University, Tainan, Taiwan","institution_ids":["https://openalex.org/I91807558"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":7.7981,"has_fulltext":false,"cited_by_count":33,"citation_normalized_percentile":{"value":0.97917214,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":94,"max":100},"biblio":{"volume":"14","issue":"4","first_page":"3231","last_page":"3243"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.9951000213623047,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.9951000213623047,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9196000099182129,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6844302415847778},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6362612843513489},{"id":"https://openalex.org/keywords/modalities","display_name":"Modalities","score":0.5935851335525513},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.5501542687416077},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5475179553031921},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.4797990620136261},{"id":"https://openalex.org/keywords/facial-expression","display_name":"Facial expression","score":0.46006691455841064},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.4580376148223877}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6844302415847778},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6362612843513489},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.5935851335525513},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.5501542687416077},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5475179553031921},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.4797990620136261},{"id":"https://openalex.org/C195704467","wikidata":"https://www.wikidata.org/wiki/Q327968","display_name":"Facial expression","level":2,"score":0.46006691455841064},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.4580376148223877},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C144024400","wikidata":"https://www.wikidata.org/wiki/Q21201","display_name":"Sociology","level":0,"score":0.0},{"id":"https://openalex.org/C36289849","wikidata":"https://www.wikidata.org/wiki/Q34749","display_name":"Social science","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/taffc.2023.3258900","is_oa":false,"landing_page_url":"https://doi.org/10.1109/taffc.2023.3258900","pdf_url":null,"source":{"id":"https://openalex.org/S104780363","display_name":"IEEE Transactions on Affective Computing","issn_l":"1949-3045","issn":["1949-3045","2371-9850"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Affective Computing","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.5400000214576721}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":73,"referenced_works":["https://openalex.org/W207695133","https://openalex.org/W1686810756","https://openalex.org/W1766395648","https://openalex.org/W1963552095","https://openalex.org/W1971063881","https://openalex.org/W1981918162","https://openalex.org/W1992810401","https://openalex.org/W2037960784","https://openalex.org/W2038821742","https://openalex.org/W2045528981","https://openalex.org/W2085662862","https://openalex.org/W2105020570","https://openalex.org/W2146334809","https://openalex.org/W2150706391","https://openalex.org/W2153635508","https://openalex.org/W2155214670","https://openalex.org/W2156503193","https://openalex.org/W2159017231","https://openalex.org/W2168692779","https://openalex.org/W2314395941","https://openalex.org/W2329093554","https://openalex.org/W2336160298","https://openalex.org/W2564768500","https://openalex.org/W2585658440","https://openalex.org/W2592183475","https://openalex.org/W2738561771","https://openalex.org/W2767618761","https://openalex.org/W2774147932","https://openalex.org/W2786779322","https://openalex.org/W2805662932","https://openalex.org/W2883409523","https://openalex.org/W2889466822","https://openalex.org/W2899197626","https://openalex.org/W2904483377","https://openalex.org/W2907891676","https://openalex.org/W2921151609","https://openalex.org/W2938004196","https://openalex.org/W2962736520","https://openalex.org/W2963252191","https://openalex.org/W2964010806","https://openalex.org/W2964300796","https://openalex.org/W2971395480","https://openalex.org/W2976982466","https://openalex.org/W2996906606","https://openalex.org/W3007282427","https://openalex.org/W3008130537","https://openalex.org/W3015558147","https://openalex.org/W3036601975","https://openalex.org/W3045969489","https://openalex.org/W3093521632","https://openalex.org/W3097075955","https://openalex.org/W3103205982","https://openalex.org/W3127521573","https://openalex.org/W3137028092","https://openalex.org/W3143635792","https://openalex.org/W3158929093","https://openalex.org/W3197642003","https://openalex.org/W3211224152","https://openalex.org/W4205730971","https://openalex.org/W4205781455","https://openalex.org/W4206204855","https://openalex.org/W4224916778","https://openalex.org/W4225691633","https://openalex.org/W4225694800","https://openalex.org/W4226383576","https://openalex.org/W4230471314","https://openalex.org/W4299586661","https://openalex.org/W6637373629","https://openalex.org/W6638638512","https://openalex.org/W6696852441","https://openalex.org/W6780218876","https://openalex.org/W6803378298","https://openalex.org/W7066990553"],"related_works":["https://openalex.org/W2185469136","https://openalex.org/W4306353150","https://openalex.org/W2026860389","https://openalex.org/W2168054807","https://openalex.org/W2058990474","https://openalex.org/W2043363698","https://openalex.org/W3207883763","https://openalex.org/W4386977688","https://openalex.org/W2383394264","https://openalex.org/W3216879894"],"abstract_inverted_index":{"Emotions":[0],"can":[1],"be":[2],"expressed":[3],"through":[4],"multiple":[5],"complementary":[6],"modalities.":[7],"This":[8],"study":[9],"selected":[10],"speech":[11],"and":[12,84,88,118,174,197],"facial":[13],"expressions":[14],"as":[15],"modalities":[16,103],"by":[17],"which":[18,54,123,168,191],"to":[19,37,61,90,96,131],"recognize":[20],"emotions.":[21],"Current":[22],"audiovisual":[23,80],"emotion":[24,65,120,135,140,165,187],"recognition":[25,66,136],"models":[26,34],"perform":[27],"supervised":[28,47],"learning":[29,48],"using":[30,104,116],"signal-level":[31,59,117],"inputs.":[32],"Such":[33],"are":[35,55],"presumed":[36],"characterize":[38],"the":[39,97,101,129,142,147,156,163,180,185],"temporal":[40,124],"relationships":[41],"in":[42,122],"signals.":[43],"In":[44,74,138],"this":[45,75],"study,":[46,76],"was":[49,114,126],"performed":[50],"on":[51,162,171,184],"segment-level":[52,92,119],"signals,":[53,60],"more":[56],"granular":[57],"than":[58,155,179],"precisely":[62],"train":[63],"an":[64],"model.":[67],"Effectively":[68],"fusing":[69,159],"multimodal":[70,186],"signals":[71,81,130],"is":[72,169,192],"challenging.":[73],"sequential":[77],"segments":[78],"of":[79,100,158,194],"were":[82,86],"obtained,":[83],"features":[85],"extracted":[87],"applied":[89],"estimate":[91],"attention":[93],"weights":[94],"according":[95],"emotional":[98],"consistency":[99],"two":[102],"a":[105],"neural":[106],"tensor":[107],"network.":[108],"A":[109],"proposed":[110,148],"bimodal":[111,139],"Transformer":[112,182],"Encoder":[113],"trained":[115],"labels":[121],"context":[125],"incorporated":[127],"into":[128],"improve":[132],"upon":[133],"existing":[134],"models.":[137],"recognition,":[141],"experimental":[143],"results":[144],"demonstrated":[145],"that":[146],"method":[149,157],"achieved":[150],"74.31%":[151],"accuracy":[152,176],"(3.05%":[153],"higher":[154,178],"correlation":[160],"features)":[161],"audio-visual":[164],"dataset":[166],"BAUM-1,":[167],"based":[170],"fivefold":[172],"cross-validation,":[173],"76.81%":[175],"(2.57%":[177],"Multimodal":[181],"Encoder)":[183],"data":[188],"set":[189],"CMU-MOSEI,":[190],"composed":[193],"training,":[195],"validation,":[196],"testing":[198],"sets.":[199]},"counts_by_year":[{"year":2026,"cited_by_count":7},{"year":2025,"cited_by_count":12},{"year":2024,"cited_by_count":12},{"year":2023,"cited_by_count":2}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
