{"id":"https://openalex.org/W2896144051","doi":"https://doi.org/10.1145/3266302.3266306","title":"Learning an Arousal-Valence Speech Front-End Network using Media Data In-the-Wild for Emotion Recognition","display_name":"Learning an Arousal-Valence Speech Front-End Network using Media Data In-the-Wild for Emotion Recognition","publication_year":2018,"publication_date":"2018-10-15","ids":{"openalex":"https://openalex.org/W2896144051","doi":"https://doi.org/10.1145/3266302.3266306","mag":"2896144051"},"language":"en","primary_location":{"id":"doi:10.1145/3266302.3266306","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3266302.3266306","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2018 on Audio/Visual Emotion Challenge and Workshop","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5071511841","display_name":"Chih-Chuan Lu","orcid":null},"institutions":[{"id":"https://openalex.org/I25846049","display_name":"National Tsing Hua University","ror":"https://ror.org/00zdnkx70","country_code":"TW","type":"education","lineage":["https://openalex.org/I25846049"]}],"countries":["TW"],"is_corresponding":false,"raw_author_name":"Chih-Chuan Lu","raw_affiliation_strings":["National Tsing Hua University &amp; MOST Joint Research Center for AI Technology and All Vista Healthcare, Hsinchu City, Taiwan Roc"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"National Tsing Hua University &amp; MOST Joint Research Center for AI Technology and All Vista Healthcare, Hsinchu City, Taiwan Roc","institution_ids":["https://openalex.org/I25846049"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001894880","display_name":"Jeng-Lin Li","orcid":"https://orcid.org/0000-0002-9261-1524"},"institutions":[{"id":"https://openalex.org/I25846049","display_name":"National Tsing Hua University","ror":"https://ror.org/00zdnkx70","country_code":"TW","type":"education","lineage":["https://openalex.org/I25846049"]}],"countries":["TW"],"is_corresponding":false,"raw_author_name":"Jeng-Lin Li","raw_affiliation_strings":["National Tsing Hua University &amp; MOST Joint Research Center for AI Technology and All Vista Healthcare, Hsinchu City, Taiwan Roc"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"National Tsing Hua University &amp; MOST Joint Research Center for AI Technology and All Vista Healthcare, Hsinchu City, Taiwan Roc","institution_ids":["https://openalex.org/I25846049"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5086107623","display_name":"Chi-Chun Lee","orcid":"https://orcid.org/0000-0003-0186-4321"},"institutions":[{"id":"https://openalex.org/I25846049","display_name":"National Tsing Hua University","ror":"https://ror.org/00zdnkx70","country_code":"TW","type":"education","lineage":["https://openalex.org/I25846049"]}],"countries":["TW"],"is_corresponding":false,"raw_author_name":"Chi-Chun Lee","raw_affiliation_strings":["National Tsing Hua University &amp; MOST Joint Research Center for AI Technology and All Vista Healthcare, Hsinchu City, Taiwan Roc"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"National Tsing Hua University &amp; MOST Joint Research Center for AI Technology and All Vista Healthcare, Hsinchu City, Taiwan Roc","institution_ids":["https://openalex.org/I25846049"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":1,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I25846049"],"apc_list":null,"apc_paid":null,"fwci":1.5329,"has_fulltext":false,"cited_by_count":21,"citation_normalized_percentile":{"value":0.84006563,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"99","last_page":"105"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9991000294685364,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/initialization","display_name":"Initialization","score":0.8567192554473877},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7785789966583252},{"id":"https://openalex.org/keywords/valence","display_name":"Valence (chemistry)","score":0.6400626301765442},{"id":"https://openalex.org/keywords/annotation","display_name":"Annotation","score":0.5607819557189941},{"id":"https://openalex.org/keywords/emotion-recognition","display_name":"Emotion recognition","score":0.5571969151496887},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5553942322731018},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5243692398071289},{"id":"https://openalex.org/keywords/end-to-end-principle","display_name":"End-to-end principle","score":0.5214701890945435},{"id":"https://openalex.org/keywords/arousal","display_name":"Arousal","score":0.47842150926589966},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.44101226329803467},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.421608567237854},{"id":"https://openalex.org/keywords/front-and-back-ends","display_name":"Front and back ends","score":0.4173981249332428},{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.4136381447315216}],"concepts":[{"id":"https://openalex.org/C114466953","wikidata":"https://www.wikidata.org/wiki/Q6034165","display_name":"Initialization","level":2,"score":0.8567192554473877},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7785789966583252},{"id":"https://openalex.org/C168900304","wikidata":"https://www.wikidata.org/wiki/Q171407","display_name":"Valence (chemistry)","level":2,"score":0.6400626301765442},{"id":"https://openalex.org/C2776321320","wikidata":"https://www.wikidata.org/wiki/Q857525","display_name":"Annotation","level":2,"score":0.5607819557189941},{"id":"https://openalex.org/C2777438025","wikidata":"https://www.wikidata.org/wiki/Q1339090","display_name":"Emotion recognition","level":2,"score":0.5571969151496887},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5553942322731018},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5243692398071289},{"id":"https://openalex.org/C74296488","wikidata":"https://www.wikidata.org/wiki/Q2527392","display_name":"End-to-end principle","level":2,"score":0.5214701890945435},{"id":"https://openalex.org/C36951298","wikidata":"https://www.wikidata.org/wiki/Q379784","display_name":"Arousal","level":2,"score":0.47842150926589966},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.44101226329803467},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.421608567237854},{"id":"https://openalex.org/C53016008","wikidata":"https://www.wikidata.org/wiki/Q620167","display_name":"Front and back ends","level":2,"score":0.4173981249332428},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.4136381447315216},{"id":"https://openalex.org/C169760540","wikidata":"https://www.wikidata.org/wiki/Q207011","display_name":"Neuroscience","level":1,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3266302.3266306","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3266302.3266306","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2018 on Audio/Visual Emotion Challenge and Workshop","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.5199999809265137,"display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":45,"referenced_works":["https://openalex.org/W35931001","https://openalex.org/W149362470","https://openalex.org/W189922968","https://openalex.org/W1522301498","https://openalex.org/W1997817608","https://openalex.org/W2014399678","https://openalex.org/W2015645277","https://openalex.org/W2028899742","https://openalex.org/W2050752817","https://openalex.org/W2083738729","https://openalex.org/W2087054143","https://openalex.org/W2093174546","https://openalex.org/W2097726431","https://openalex.org/W2109074779","https://openalex.org/W2134670479","https://openalex.org/W2146334809","https://openalex.org/W2149933564","https://openalex.org/W2169493788","https://openalex.org/W2171683557","https://openalex.org/W2181741066","https://openalex.org/W2253429366","https://openalex.org/W2284846286","https://openalex.org/W2285811434","https://openalex.org/W2295001676","https://openalex.org/W2460586963","https://openalex.org/W2468785836","https://openalex.org/W2489406233","https://openalex.org/W2511566646","https://openalex.org/W2578895956","https://openalex.org/W2595663683","https://openalex.org/W2598207902","https://openalex.org/W2602034649","https://openalex.org/W2615063356","https://openalex.org/W2616400984","https://openalex.org/W2626312329","https://openalex.org/W2749459361","https://openalex.org/W2757184610","https://openalex.org/W2765998482","https://openalex.org/W2785952417","https://openalex.org/W2794426516","https://openalex.org/W2946138111","https://openalex.org/W2962835968","https://openalex.org/W3013377280","https://openalex.org/W3158317474","https://openalex.org/W4205184193"],"related_works":["https://openalex.org/W3204184292","https://openalex.org/W3176564347","https://openalex.org/W1985458517","https://openalex.org/W2355833770","https://openalex.org/W2593532750","https://openalex.org/W3207232378","https://openalex.org/W103414482","https://openalex.org/W2642467014","https://openalex.org/W2889523925","https://openalex.org/W3080080512"],"abstract_inverted_index":{"Recent":[0],"progress":[1],"in":[2,23],"speech":[3,65],"emotion":[4,24,112,128],"recognition":[5],"(SER)":[6],"technology":[7],"has":[8],"benefited":[9],"from":[10,84],"the":[11,64,91,106,110,130,134,145,153,164,176,189],"use":[12],"of":[13,33,114,123,152,175],"deep":[14],"learning":[15,63],"techniques.":[16],"However,":[17],"expensive":[18],"human":[19],"annotation":[20],"and":[21,86,133,143],"difficulty":[22],"database":[25],"collection":[26],"make":[27],"it":[28,169],"challenging":[29],"for":[30,109,125],"rapid":[31],"deployment":[32],"SER":[34,60,124],"across":[35],"diverse":[36],"application":[37],"domains.":[38],"An":[39],"initialization":[40,55,141],"-":[41],"fine-tuning":[42,187],"strategy":[43],"help":[44],"mitigate":[45],"these":[46],"technical":[47],"challenges.":[48],"In":[49],"this":[50],"work,":[51],"we":[52],"propose":[53],"an":[54],"network":[56,67,185],"that":[57,80,160],"gears":[58],"toward":[59],"applications":[61],"by":[62,162],"front-end":[66,167],"on":[68,121,183,188],"a":[69],"large":[70],"media":[71],"data":[72,178],"collected":[73],"in-the-wild":[74],"jointly":[75],"with":[76,105,186],"proxy":[77],"arousal-valence":[78],"labels":[79],"are":[81],"multimodally":[82],"derived":[83],"audio":[85],"text":[87],"information,":[88],"termed":[89],"as":[90,166,171,173],"Arousal-Valence":[92],"Speech":[93],"Front-End":[94],"Network":[95],"(AV-SpNET).":[96],"The":[97,137],"AV-SpNET":[98,120,138,165],"can":[99],"then":[100],"be":[101],"easily":[102],"stacked":[103],"simply":[104],"supervised":[107],"layers":[108],"target":[111],"corpus":[113],"interest.":[115],"We":[116,157],"evaluate":[117],"our":[118],"proposed":[119],"tasks":[122],"two":[126],"separate":[127],"corpora,":[129],"USC":[131],"IEMOCAP":[132],"NNIME":[135],"database.":[136],"outperforms":[139],"other":[140],"techniques":[142],"reach":[144],"best":[146],"overall":[147],"performances":[148],"requiring":[149],"only":[150],"75%":[151],"in-domain":[154],"annotated":[155],"data.":[156],"also":[158],"observe":[159],"generally,":[161],"using":[163],"network,":[168],"requires":[170],"little":[172],"50%":[174],"fine-tuned":[177],"to":[179],"surpass":[180],"method":[181],"based":[182],"randomly-initialized":[184],"complete":[190],"training":[191],"set.":[192]},"counts_by_year":[{"year":2025,"cited_by_count":4},{"year":2024,"cited_by_count":3},{"year":2023,"cited_by_count":5},{"year":2022,"cited_by_count":2},{"year":2021,"cited_by_count":3},{"year":2020,"cited_by_count":1},{"year":2019,"cited_by_count":3}],"updated_date":"2026-06-26T08:34:08.712188","created_date":"2025-10-10T00:00:00"}
