{"id":"https://openalex.org/W4387724935","doi":"https://doi.org/10.1145/3581783.3612848","title":"Advancing Audio Emotion and Intent Recognition with Large Pre-Trained Models and Bayesian Inference","display_name":"Advancing Audio Emotion and Intent Recognition with Large Pre-Trained Models and Bayesian Inference","publication_year":2023,"publication_date":"2023-10-26","ids":{"openalex":"https://openalex.org/W4387724935","doi":"https://doi.org/10.1145/3581783.3612848"},"language":"en","primary_location":{"id":"doi:10.1145/3581783.3612848","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3581783.3612848","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3581783.3612848","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 31st ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3581783.3612848","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5052292328","display_name":"Dejan Porjazovski","orcid":"https://orcid.org/0000-0002-7219-9042"},"institutions":[{"id":"https://openalex.org/I9927081","display_name":"Aalto University","ror":"https://ror.org/020hwjq30","country_code":"FI","type":"education","lineage":["https://openalex.org/I9927081"]}],"countries":["FI"],"is_corresponding":true,"raw_author_name":"Dejan Porjazovski","raw_affiliation_strings":["Aalto University, Espoo, Finland"],"raw_orcid":"https://orcid.org/0000-0002-7219-9042","affiliations":[{"raw_affiliation_string":"Aalto University, Espoo, Finland","institution_ids":["https://openalex.org/I9927081"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5082953089","display_name":"Yaroslav Getman","orcid":"https://orcid.org/0000-0003-4680-8294"},"institutions":[{"id":"https://openalex.org/I9927081","display_name":"Aalto University","ror":"https://ror.org/020hwjq30","country_code":"FI","type":"education","lineage":["https://openalex.org/I9927081"]}],"countries":["FI"],"is_corresponding":false,"raw_author_name":"Yaroslav Getman","raw_affiliation_strings":["Aalto University, Espoo, Finland"],"raw_orcid":"https://orcid.org/0000-0003-4680-8294","affiliations":[{"raw_affiliation_string":"Aalto University, Espoo, Finland","institution_ids":["https://openalex.org/I9927081"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5058277119","display_name":"Tam\u00e1s Gr\u00f3sz","orcid":"https://orcid.org/0000-0001-7918-9579"},"institutions":[{"id":"https://openalex.org/I9927081","display_name":"Aalto University","ror":"https://ror.org/020hwjq30","country_code":"FI","type":"education","lineage":["https://openalex.org/I9927081"]}],"countries":["FI"],"is_corresponding":false,"raw_author_name":"Tam\u00e1s Gr\u00f3sz","raw_affiliation_strings":["Aalto University, Espoo, Finland"],"raw_orcid":"https://orcid.org/0000-0001-7918-9579","affiliations":[{"raw_affiliation_string":"Aalto University, Espoo, Finland","institution_ids":["https://openalex.org/I9927081"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5043424064","display_name":"Mikko Kurimo","orcid":"https://orcid.org/0000-0001-5278-7974"},"institutions":[{"id":"https://openalex.org/I9927081","display_name":"Aalto University","ror":"https://ror.org/020hwjq30","country_code":"FI","type":"education","lineage":["https://openalex.org/I9927081"]}],"countries":["FI"],"is_corresponding":false,"raw_author_name":"Mikko Kurimo","raw_affiliation_strings":["Aalto University, Espoo, Finland"],"raw_orcid":"https://orcid.org/0000-0001-5278-7974","affiliations":[{"raw_affiliation_string":"Aalto University, Espoo, Finland","institution_ids":["https://openalex.org/I9927081"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5052292328"],"corresponding_institution_ids":["https://openalex.org/I9927081"],"apc_list":null,"apc_paid":null,"fwci":0.7676,"has_fulltext":true,"cited_by_count":4,"citation_normalized_percentile":{"value":0.71377283,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":94,"max":96},"biblio":{"volume":null,"issue":null,"first_page":"9477","last_page":"9481"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9983999729156494,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.7032681107521057},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6637911200523376},{"id":"https://openalex.org/keywords/bayesian-inference","display_name":"Bayesian inference","score":0.5689986944198608},{"id":"https://openalex.org/keywords/bayesian-probability","display_name":"Bayesian probability","score":0.5618804693222046},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.558036744594574},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5074941515922546},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.4699787199497223},{"id":"https://openalex.org/keywords/emotion-recognition","display_name":"Emotion recognition","score":0.46519649028778076}],"concepts":[{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.7032681107521057},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6637911200523376},{"id":"https://openalex.org/C160234255","wikidata":"https://www.wikidata.org/wiki/Q812535","display_name":"Bayesian inference","level":3,"score":0.5689986944198608},{"id":"https://openalex.org/C107673813","wikidata":"https://www.wikidata.org/wiki/Q812534","display_name":"Bayesian probability","level":2,"score":0.5618804693222046},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.558036744594574},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5074941515922546},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4699787199497223},{"id":"https://openalex.org/C2777438025","wikidata":"https://www.wikidata.org/wiki/Q1339090","display_name":"Emotion recognition","level":2,"score":0.46519649028778076}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1145/3581783.3612848","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3581783.3612848","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3581783.3612848","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 31st ACM International Conference on Multimedia","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2310.10179","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2310.10179","pdf_url":"https://arxiv.org/pdf/2310.10179","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"pmh:oai:aaltodoc.aalto.fi:123456789/124906","is_oa":true,"landing_page_url":"https://research.aalto.fi/en/publications/e1152904-9028-445a-80cc-ad69e20f89ba","pdf_url":null,"source":{"id":"https://openalex.org/S4306401662","display_name":"Aaltodoc (Aalto University)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I9927081","host_organization_name":"Aalto University","host_organization_lineage":["https://openalex.org/I9927081"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"publishedVersion"}],"best_oa_location":{"id":"doi:10.1145/3581783.3612848","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3581783.3612848","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3581783.3612848","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 31st ACM International Conference on Multimedia","raw_type":"proceedings-article"},"sustainable_development_goals":[{"score":0.7799999713897705,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[{"id":"https://openalex.org/G1536121294","display_name":"Common Language Resources and Technology Infrastructure","funder_award_id":"337073","funder_id":"https://openalex.org/F4320321108","funder_display_name":"Academy of Finland"},{"id":"https://openalex.org/G3490526257","display_name":null,"funder_award_id":"337073","funder_id":"https://openalex.org/F4320321108","funder_display_name":"Academy of Finland"},{"id":"https://openalex.org/G7154773589","display_name":null,"funder_award_id":"345790","funder_id":"https://openalex.org/F4320321108","funder_display_name":"Academy of Finland"},{"id":"https://openalex.org/G8125348452","display_name":null,"funder_award_id":"37073,345790","funder_id":"https://openalex.org/F4320321108","funder_display_name":"Academy of Finland"},{"id":"https://openalex.org/G8267031437","display_name":"Understanding speech and scene with ears and eyes","funder_award_id":"345790","funder_id":"https://openalex.org/F4320321108","funder_display_name":"Academy of Finland"}],"funders":[{"id":"https://openalex.org/F4320321108","display_name":"Academy of Finland","ror":"https://ror.org/05k73zm37"},{"id":"https://openalex.org/F4320321394","display_name":"Aalto-Yliopisto","ror":"https://ror.org/020hwjq30"}],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4387724935.pdf","grobid_xml":"https://content.openalex.org/works/W4387724935.grobid-xml"},"referenced_works_count":26,"referenced_works":["https://openalex.org/W2746419079","https://openalex.org/W2905841571","https://openalex.org/W2909021043","https://openalex.org/W2963194800","https://openalex.org/W2963288440","https://openalex.org/W2963341956","https://openalex.org/W2989982989","https://openalex.org/W3030356194","https://openalex.org/W3036601975","https://openalex.org/W3103187652","https://openalex.org/W3104574805","https://openalex.org/W3167533889","https://openalex.org/W3178946670","https://openalex.org/W3197642003","https://openalex.org/W3198429080","https://openalex.org/W3202370288","https://openalex.org/W4286841926","https://openalex.org/W4294562888","https://openalex.org/W4295312788","https://openalex.org/W4296068413","https://openalex.org/W4297841770","https://openalex.org/W4300485806","https://openalex.org/W4304087162","https://openalex.org/W4311000453","https://openalex.org/W4317672847","https://openalex.org/W4387968410"],"related_works":["https://openalex.org/W3126677997","https://openalex.org/W1610857240","https://openalex.org/W2372267530","https://openalex.org/W2969189870","https://openalex.org/W3015855446","https://openalex.org/W2965643117","https://openalex.org/W4303857162","https://openalex.org/W2407375987","https://openalex.org/W3049691116","https://openalex.org/W2505726097"],"abstract_inverted_index":{"Large":[0],"pre-trained":[1,24],"models":[2,25],"are":[3],"essential":[4],"in":[5,10,115],"paralinguistic":[6],"systems,":[7],"demonstrating":[8],"effectiveness":[9],"tasks":[11],"like":[12],"emotion":[13],"recognition":[14],"and":[15,36,43,48,91],"stuttering":[16],"detection.":[17],"In":[18],"this":[19,116],"paper,":[20],"we":[21,67,133],"employ":[22],"large":[23],"for":[26,98],"the":[27,34,56,59,63,76,99,104,125,141],"ACM":[28],"Multimedia":[29],"Computational":[30],"Paralinguistics":[31],"Challenge,":[32],"addressing":[33],"Requests":[35],"Emotion":[37,100],"Share":[38,101],"tasks.":[39],"We":[40],"explore":[41],"audio-only":[42,64],"hybrid":[44,60],"solutions":[45],"leveraging":[46],"audio":[47],"text":[49],"modalities.":[50],"Our":[51],"empirical":[52],"results":[53],"consistently":[54],"show":[55],"superiority":[57],"of":[58,108,127,140],"approaches":[61],"over":[62],"models.":[65],"Moreover,":[66,132],"introduce":[68],"a":[69],"Bayesian":[70,111],"layer":[71],"as":[72],"an":[73,86],"alternative":[74],"to":[75,120],"standard":[77],"linear":[78],"output":[79],"layer.":[80],"The":[81,95,110],"multimodal":[82],"fusion":[83],"approach":[84],"achieves":[85],"85.4%":[87],"UAR":[88],"on":[89,93],"HC-Requests":[90],"60.2%":[92],"HC-Complaints.":[94],"ensemble":[96],"model":[97],"task":[102],"yields":[103],"best":[105],"\u03c1":[106],"value":[107],".614.":[109],"wav2vec2":[112],"approach,":[113],"explored":[114],"study,":[117],"allows":[118],"us":[119],"easily":[121],"build":[122],"ensembles,":[123],"at":[124],"cost":[126],"fine-tuning":[128],"only":[129],"one":[130],"model.":[131],"can":[134],"have":[135],"usable":[136],"confidence":[137],"values":[138],"instead":[139],"usual":[142],"overconfident":[143],"posterior":[144],"probabilities.":[145]},"counts_by_year":[{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":2}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
