{"id":"https://openalex.org/W4408354220","doi":"https://doi.org/10.1109/icassp49660.2025.10890415","title":"Efficient Fusion of Computationally Diverse Modalities Using Chunking and Cross-Attention","display_name":"Efficient Fusion of Computationally Diverse Modalities Using Chunking and Cross-Attention","publication_year":2025,"publication_date":"2025-03-12","ids":{"openalex":"https://openalex.org/W4408354220","doi":"https://doi.org/10.1109/icassp49660.2025.10890415"},"language":"en","primary_location":{"id":"doi:10.1109/icassp49660.2025.10890415","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49660.2025.10890415","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5027418162","display_name":"Christian Flores","orcid":"https://orcid.org/0000-0001-8301-5598"},"institutions":[{"id":"https://openalex.org/I162577319","display_name":"The University of Texas at Dallas","ror":"https://ror.org/049emcs32","country_code":"US","type":"education","lineage":["https://openalex.org/I162577319"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Christian Flores","raw_affiliation_strings":["The University of Texas at Dallas,Richardson,TX,USA,75080"],"affiliations":[{"raw_affiliation_string":"The University of Texas at Dallas,Richardson,TX,USA,75080","institution_ids":["https://openalex.org/I162577319"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5017966455","display_name":"Lucas Goncalves","orcid":"https://orcid.org/0000-0001-9613-1002"},"institutions":[{"id":"https://openalex.org/I162577319","display_name":"The University of Texas at Dallas","ror":"https://ror.org/049emcs32","country_code":"US","type":"education","lineage":["https://openalex.org/I162577319"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Lucas Goncalves","raw_affiliation_strings":["The University of Texas at Dallas,Richardson,TX,USA,75080"],"affiliations":[{"raw_affiliation_string":"The University of Texas at Dallas,Richardson,TX,USA,75080","institution_ids":["https://openalex.org/I162577319"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5040793194","display_name":"Carlos Busso","orcid":"https://orcid.org/0000-0002-4075-4072"},"institutions":[{"id":"https://openalex.org/I162577319","display_name":"The University of Texas at Dallas","ror":"https://ror.org/049emcs32","country_code":"US","type":"education","lineage":["https://openalex.org/I162577319"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Carlos Busso","raw_affiliation_strings":["The University of Texas at Dallas,Richardson,TX,USA,75080"],"affiliations":[{"raw_affiliation_string":"The University of Texas at Dallas,Richardson,TX,USA,75080","institution_ids":["https://openalex.org/I162577319"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5027418162"],"corresponding_institution_ids":["https://openalex.org/I162577319"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.01912959,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.8810999989509583,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.8810999989509583,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11398","display_name":"Hand Gesture Recognition Systems","score":0.8474000096321106,"subfield":{"id":"https://openalex.org/subfields/1709","display_name":"Human-Computer Interaction"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.8447999954223633,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/modalities","display_name":"Modalities","score":0.813910961151123},{"id":"https://openalex.org/keywords/chunking","display_name":"Chunking (psychology)","score":0.7744481563568115},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6813825368881226},{"id":"https://openalex.org/keywords/fusion","display_name":"Fusion","score":0.5089962482452393},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4863288998603821},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.3777385950088501}],"concepts":[{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.813910961151123},{"id":"https://openalex.org/C203357204","wikidata":"https://www.wikidata.org/wiki/Q1089605","display_name":"Chunking (psychology)","level":2,"score":0.7744481563568115},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6813825368881226},{"id":"https://openalex.org/C158525013","wikidata":"https://www.wikidata.org/wiki/Q2593739","display_name":"Fusion","level":2,"score":0.5089962482452393},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4863288998603821},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3777385950088501},{"id":"https://openalex.org/C36289849","wikidata":"https://www.wikidata.org/wiki/Q34749","display_name":"Social science","level":1,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C144024400","wikidata":"https://www.wikidata.org/wiki/Q21201","display_name":"Sociology","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icassp49660.2025.10890415","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp49660.2025.10890415","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":29,"referenced_works":["https://openalex.org/W2030931454","https://openalex.org/W2886564777","https://openalex.org/W2897067191","https://openalex.org/W2964051877","https://openalex.org/W2979826702","https://openalex.org/W3024979138","https://openalex.org/W3035299099","https://openalex.org/W3095118468","https://openalex.org/W3164582967","https://openalex.org/W3209984917","https://openalex.org/W4214612132","https://openalex.org/W4312976151","https://openalex.org/W4375869346","https://openalex.org/W4385245566","https://openalex.org/W4386076638","https://openalex.org/W4387421359","https://openalex.org/W4400975545","https://openalex.org/W4402671293","https://openalex.org/W6631190155","https://openalex.org/W6757817989","https://openalex.org/W6771626834","https://openalex.org/W6779163297","https://openalex.org/W6779709467","https://openalex.org/W6783944145","https://openalex.org/W6790307280","https://openalex.org/W6796581206","https://openalex.org/W6843069852","https://openalex.org/W6850957775","https://openalex.org/W6859619714"],"related_works":["https://openalex.org/W2961085424","https://openalex.org/W4306674287","https://openalex.org/W4387369504","https://openalex.org/W3046775127","https://openalex.org/W4394896187","https://openalex.org/W3170094116","https://openalex.org/W4386462264","https://openalex.org/W3107602296","https://openalex.org/W4364306694","https://openalex.org/W4312192474"],"abstract_inverted_index":{"Emotion":[0],"recognition":[1,148],"is":[2,42,111],"inherently":[3],"a":[4,16,43,51,89,145],"multimodal":[5,95],"problem.":[6],"Humans":[7],"use":[8,28],"both":[9],"audible":[10],"and":[11,32,118,155],"visual":[12,33,103,119],"cues":[13],"to":[14,29,60],"determine":[15],"person\u2019s":[17],"emotions.":[18],"There":[19],"has":[20],"been":[21],"extensive":[22],"improvement":[23],"in":[24,53,140],"the":[25,54,62,69,84,102,115,123,128,141,158],"methods":[26,74],"we":[27],"fuse":[30],"audio":[31,117],"representations":[34],"between":[35,114],"two":[36],"unimodal":[37,129],"deep-learning":[38],"models.":[39],"However,":[40],"there":[41],"lack":[44],"of":[45,56,65,122,127],"accommodation":[46],"for":[47,93,144],"modalities":[48],"that":[49],"have":[50],"disparity":[52],"amount":[55,64],"computational":[57,163],"resources":[58],"needed":[59],"provide":[61],"same":[63],"temporal":[66],"information.":[67],"As":[68],"sequence":[70,125],"length":[71],"increases,":[72],"current":[73],"often":[75],"make":[76],"simplifications":[77],"such":[78],"as":[79],"discarding":[80],"frames":[81],"or":[82],"cropping":[83],"sequence.":[85],"This":[86],"paper":[87],"introduces":[88],"chunking":[90],"methodology":[91],"designed":[92],"cross-attention-based":[94],"transformer":[96],"architectures.":[97],"The":[98],"approach":[99],"involves":[100],"segmenting":[101],"input\u2014the":[104],"more":[105],"computationally":[106],"demanding":[107],"modality\u2014into":[108],"chunks.":[109],"Cross-attention":[110],"then":[112],"performed":[113],"encoded":[116],"features":[120],"instead":[121],"original":[124],"lengths":[126],"backbones.":[130],"Our":[131],"method":[132],"achieves":[133],"significant":[134],"improvements":[135],"over":[136],"conventional":[137],"cross-attention":[138],"techniques":[139],"audio-visual":[142],"domain":[143],"six-class":[146],"emotional":[147],"problem,":[149],"demonstrating":[150],"better":[151],"F1":[152],"score,":[153],"precision,":[154],"recall":[156],"on":[157],"CREMA-D":[159],"database":[160],"while":[161],"reducing":[162],"overhead.":[164]},"counts_by_year":[],"updated_date":"2025-12-21T23:12:01.093139","created_date":"2025-10-10T00:00:00"}
