{"id":"https://openalex.org/W4416513039","doi":"https://doi.org/10.1109/access.2025.3636123","title":"EmoBridge: Aligning Speech and Language for Emotion Recognition via Q-Former","display_name":"EmoBridge: Aligning Speech and Language for Emotion Recognition via Q-Former","publication_year":2025,"publication_date":"2025-01-01","ids":{"openalex":"https://openalex.org/W4416513039","doi":"https://doi.org/10.1109/access.2025.3636123"},"language":null,"primary_location":{"id":"doi:10.1109/access.2025.3636123","is_oa":true,"landing_page_url":"https://doi.org/10.1109/access.2025.3636123","pdf_url":null,"source":{"id":"https://openalex.org/S2485537415","display_name":"IEEE Access","issn_l":"2169-3536","issn":["2169-3536"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Access","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","doaj"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1109/access.2025.3636123","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5113743545","display_name":"Yuntao Sun","orcid":"https://orcid.org/0009-0009-6348-8238"},"institutions":[{"id":"https://openalex.org/I4210154121","display_name":"Shandong Institute of Commerce & Technology","ror":"https://ror.org/03xk2yz39","country_code":"CN","type":"education","lineage":["https://openalex.org/I4210154121"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuntao Sun","raw_affiliation_strings":["Shandong College of Electronic Technology, Jinan, China","Shandong College of Electronic Technology, China"],"raw_orcid":"https://orcid.org/0009-0009-6348-8238","affiliations":[{"raw_affiliation_string":"Shandong College of Electronic Technology, Jinan, China","institution_ids":["https://openalex.org/I4210154121"]},{"raw_affiliation_string":"Shandong College of Electronic Technology, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5106615504","display_name":"Xin Zhang","orcid":"https://orcid.org/0009-0008-2861-633X"},"institutions":[{"id":"https://openalex.org/I4210154121","display_name":"Shandong Institute of Commerce & Technology","ror":"https://ror.org/03xk2yz39","country_code":"CN","type":"education","lineage":["https://openalex.org/I4210154121"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xuehua Zhang","raw_affiliation_strings":["Shandong College of Electronic Technology, Jinan, China","Shandong College of Electronic Technology, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Shandong College of Electronic Technology, Jinan, China","institution_ids":["https://openalex.org/I4210154121"]},{"raw_affiliation_string":"Shandong College of Electronic Technology, China","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100596664","display_name":"Yanting Sun","orcid":null},"institutions":[{"id":"https://openalex.org/I4210154121","display_name":"Shandong Institute of Commerce & Technology","ror":"https://ror.org/03xk2yz39","country_code":"CN","type":"education","lineage":["https://openalex.org/I4210154121"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yanting Sun","raw_affiliation_strings":["Shandong College of Electronic Technology, Jinan, China","Shandong College of Electronic Technology, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Shandong College of Electronic Technology, Jinan, China","institution_ids":["https://openalex.org/I4210154121"]},{"raw_affiliation_string":"Shandong College of Electronic Technology, China","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":{"value":1850,"currency":"USD","value_usd":1850},"apc_paid":{"value":1850,"currency":"USD","value_usd":1850},"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.33352843,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"13","issue":null,"first_page":"205601","last_page":"205611"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.9950000047683716,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.9950000047683716,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10664","display_name":"Sentiment Analysis and Opinion Mining","score":0.0008999999845400453,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.00039999998989515007,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/emotion-recognition","display_name":"Emotion recognition","score":0.54339998960495},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5123999714851379},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.4796000123023987},{"id":"https://openalex.org/keywords/weighting","display_name":"Weighting","score":0.43130001425743103},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.39469999074935913},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.3912999927997589},{"id":"https://openalex.org/keywords/mechanism","display_name":"Mechanism (biology)","score":0.3702999949455261},{"id":"https://openalex.org/keywords/multimodality","display_name":"Multimodality","score":0.3668000102043152}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.838100016117096},{"id":"https://openalex.org/C2777438025","wikidata":"https://www.wikidata.org/wiki/Q1339090","display_name":"Emotion recognition","level":2,"score":0.54339998960495},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5358999967575073},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5123999714851379},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.4796000123023987},{"id":"https://openalex.org/C183115368","wikidata":"https://www.wikidata.org/wiki/Q856577","display_name":"Weighting","level":2,"score":0.43130001425743103},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.39469999074935913},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.3912999927997589},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.384799987077713},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.37689998745918274},{"id":"https://openalex.org/C89611455","wikidata":"https://www.wikidata.org/wiki/Q6804646","display_name":"Mechanism (biology)","level":2,"score":0.3702999949455261},{"id":"https://openalex.org/C2780910867","wikidata":"https://www.wikidata.org/wiki/Q1952416","display_name":"Multimodality","level":2,"score":0.3668000102043152},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.3635999858379364},{"id":"https://openalex.org/C206310091","wikidata":"https://www.wikidata.org/wiki/Q750859","display_name":"Emotion classification","level":2,"score":0.359499990940094},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3203999996185303},{"id":"https://openalex.org/C2777212361","wikidata":"https://www.wikidata.org/wiki/Q5127848","display_name":"Class (philosophy)","level":2,"score":0.3140000104904175},{"id":"https://openalex.org/C2780660688","wikidata":"https://www.wikidata.org/wiki/Q25052564","display_name":"Multimodal learning","level":2,"score":0.30329999327659607},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.29580000042915344},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.2953999936580658},{"id":"https://openalex.org/C6438553","wikidata":"https://www.wikidata.org/wiki/Q1185804","display_name":"Affective computing","level":2,"score":0.2800999879837036},{"id":"https://openalex.org/C28006648","wikidata":"https://www.wikidata.org/wiki/Q6934509","display_name":"Multi-task learning","level":3,"score":0.2745000123977661},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.2644999921321869},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.2596000134944916},{"id":"https://openalex.org/C2983448237","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Language understanding","level":2,"score":0.25380000472068787},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.25189998745918274}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/access.2025.3636123","is_oa":true,"landing_page_url":"https://doi.org/10.1109/access.2025.3636123","pdf_url":null,"source":{"id":"https://openalex.org/S2485537415","display_name":"IEEE Access","issn_l":"2169-3536","issn":["2169-3536"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Access","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1109/access.2025.3636123","is_oa":true,"landing_page_url":"https://doi.org/10.1109/access.2025.3636123","pdf_url":null,"source":{"id":"https://openalex.org/S2485537415","display_name":"IEEE Access","issn_l":"2169-3536","issn":["2169-3536"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Access","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":37,"referenced_works":["https://openalex.org/W2146334809","https://openalex.org/W2584561145","https://openalex.org/W2619383789","https://openalex.org/W2787581402","https://openalex.org/W2964010806","https://openalex.org/W2964051877","https://openalex.org/W2965373594","https://openalex.org/W3209059054","https://openalex.org/W4221162872","https://openalex.org/W4313887688","https://openalex.org/W4375869114","https://openalex.org/W4377292302","https://openalex.org/W4384207713","https://openalex.org/W4385822811","https://openalex.org/W4385823311","https://openalex.org/W4385823394","https://openalex.org/W4386072325","https://openalex.org/W4388994251","https://openalex.org/W4389474295","https://openalex.org/W4389524500","https://openalex.org/W4389752793","https://openalex.org/W4391307921","https://openalex.org/W4392172995","https://openalex.org/W4392903514","https://openalex.org/W4392903618","https://openalex.org/W4400527853","https://openalex.org/W4402112107","https://openalex.org/W4402112352","https://openalex.org/W4402112382","https://openalex.org/W4402115910","https://openalex.org/W4402343736","https://openalex.org/W4404356490","https://openalex.org/W4405981067","https://openalex.org/W4407637543","https://openalex.org/W4408352120","https://openalex.org/W4408355820","https://openalex.org/W4415709309"],"related_works":[],"abstract_inverted_index":{"Speech":[0],"emotion":[1,47,116,137],"recognition":[2],"(SER)":[3],"remains":[4],"a":[5,61,69,74,103,124,148,163,196],"challenging":[6],"task":[7],"due":[8],"to":[9,85,93,139,154,210],"the":[10,18,82,131,159,174,208],"limited":[11],"affective":[12,39,64,169],"cues":[13],"in":[14,24,38,162,187],"unimodal":[15],"representations":[16],"and":[17,49,118,122,135,143,176,190,198,206],"difficulty":[19],"of":[20,133],"aligning":[21],"heterogeneous":[22],"features":[23],"multimodal":[25,28,63,156,203],"systems.":[26],"Although":[27],"large":[29,70],"language":[30,71,160],"models":[31,186],"(MLLMs)":[32],"have":[33],"recently":[34],"achieved":[35],"notable":[36],"progress":[37],"understanding,":[40],"they":[41],"still":[42],"suffer":[43],"from":[44],"hallucination,":[45],"weak":[46],"discrimination,":[48],"instability":[50],"during":[51],"optimization.":[52],"To":[53],"address":[54],"these":[55],"challenges,":[56],"this":[57],"paper":[58],"presents":[59],"EmoBridge,":[60],"novel":[62],"learning":[65,205],"framework":[66,194],"that":[67,180],"integrates":[68],"model":[72,86,161],"with":[73],"speech-aware":[75],"cross-modal":[76,109],"encoder,":[77],"termed":[78],"EmoBridge-Former,":[79],"which":[80,107,128],"extends":[81],"Q-Former":[83],"architecture":[84],"fine-grained":[87],"audio\u2013text":[88],"emotional":[89],"dependencies.":[90],"In":[91,146],"contrast":[92],"existing":[94],"methods,":[95],"EmoBridge":[96,181],"introduces":[97],"two":[98],"improved":[99],"optimization":[100],"objectives:":[101],"(1)":[102],"Prototypical":[104],"InfoNCE":[105],"loss,":[106],"enhances":[108],"alignment":[110],"by":[111],"pulling":[112],"samples":[113,138],"toward":[114],"their":[115],"prototypes":[117],"enlarging":[119],"inter-class":[120],"separability;":[121],"(2)":[123],"Dynamic":[125],"Focal":[126],"Loss,":[127],"adaptively":[129],"adjusts":[130],"weighting":[132],"hard":[134],"easy":[136],"mitigate":[140],"class":[141],"imbalance":[142],"gradient":[144],"domination.":[145],"addition,":[147],"soft-prompt":[149],"fusion":[150],"mechanism":[151],"is":[152],"employed":[153],"inject":[155],"embeddings":[157],"into":[158],"parameter-efficient":[164],"manner,":[165],"thereby":[166],"enabling":[167],"end-to-end":[168],"reasoning.":[170],"Extensive":[171],"experiments":[172],"on":[173],"IEMOCAP":[175],"MELD":[177],"benchmarks":[178],"demonstrate":[179],"consistently":[182],"outperforms":[183],"state-of-the-art":[184],"SER":[185],"both":[188],"accuracy":[189],"robustness.":[191],"The":[192],"proposed":[193],"provides":[195],"unified":[197],"scalable":[199],"paradigm":[200],"for":[201],"emotion-aware":[202],"representation":[204],"has":[207],"potential":[209],"advance":[211],"real-world":[212],"speech":[213],"understanding":[214],"applications.":[215]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-11-23T00:00:00"}
