{"id":"https://openalex.org/W4400527620","doi":"https://doi.org/10.1109/fg59268.2024.10581940","title":"Social-MAE: A Transformer-Based Multimodal Autoencoder for Face and Voice","display_name":"Social-MAE: A Transformer-Based Multimodal Autoencoder for Face and Voice","publication_year":2024,"publication_date":"2024-05-27","ids":{"openalex":"https://openalex.org/W4400527620","doi":"https://doi.org/10.1109/fg59268.2024.10581940"},"language":"en","primary_location":{"id":"doi:10.1109/fg59268.2024.10581940","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/fg59268.2024.10581940","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE 18th International Conference on Automatic Face and Gesture Recognition (FG)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://orbi.umons.ac.be/bitstream/20.500.12907/51664/1/FG2024_SocialMAE_preprint.pdf","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5052633567","display_name":"Hugo Bohy","orcid":"https://orcid.org/0000-0003-2276-8494"},"institutions":[{"id":"https://openalex.org/I130929987","display_name":"University of Mons","ror":"https://ror.org/02qnnz951","country_code":"BE","type":"education","lineage":["https://openalex.org/I130929987"]}],"countries":["BE"],"is_corresponding":true,"raw_author_name":"Hugo Bohy","raw_affiliation_strings":["Numediart Institute, University of Mons,ISIA Lab,Mons,Belgium"],"affiliations":[{"raw_affiliation_string":"Numediart Institute, University of Mons,ISIA Lab,Mons,Belgium","institution_ids":["https://openalex.org/I130929987"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101612317","display_name":"Minh Tr\u1ea7n","orcid":"https://orcid.org/0000-0003-4637-6081"},"institutions":[{"id":"https://openalex.org/I4210087747","display_name":"Creative Technologies (United States)","ror":"https://ror.org/001qkb777","country_code":"US","type":"company","lineage":["https://openalex.org/I4210087747"]},{"id":"https://openalex.org/I1174212","display_name":"University of Southern California","ror":"https://ror.org/03taz7m60","country_code":"US","type":"education","lineage":["https://openalex.org/I1174212"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Minh Tran","raw_affiliation_strings":["Institute for Creative Technologies, University of Southern California,Los Angeles,CA,USA"],"affiliations":[{"raw_affiliation_string":"Institute for Creative Technologies, University of Southern California,Los Angeles,CA,USA","institution_ids":["https://openalex.org/I4210087747","https://openalex.org/I1174212"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5074305407","display_name":"Kevin El Haddad","orcid":"https://orcid.org/0000-0003-1465-6273"},"institutions":[{"id":"https://openalex.org/I130929987","display_name":"University of Mons","ror":"https://ror.org/02qnnz951","country_code":"BE","type":"education","lineage":["https://openalex.org/I130929987"]}],"countries":["BE"],"is_corresponding":false,"raw_author_name":"Kevin El Haddad","raw_affiliation_strings":["Numediart Institute, University of Mons,ISIA Lab,Mons,Belgium"],"affiliations":[{"raw_affiliation_string":"Numediart Institute, University of Mons,ISIA Lab,Mons,Belgium","institution_ids":["https://openalex.org/I130929987"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5042439255","display_name":"Thierry Dutoit","orcid":"https://orcid.org/0000-0001-7024-2150"},"institutions":[{"id":"https://openalex.org/I130929987","display_name":"University of Mons","ror":"https://ror.org/02qnnz951","country_code":"BE","type":"education","lineage":["https://openalex.org/I130929987"]}],"countries":["BE"],"is_corresponding":false,"raw_author_name":"Thierry Dutoit","raw_affiliation_strings":["Numediart Institute, University of Mons,ISIA Lab,Mons,Belgium"],"affiliations":[{"raw_affiliation_string":"Numediart Institute, University of Mons,ISIA Lab,Mons,Belgium","institution_ids":["https://openalex.org/I130929987"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5024169758","display_name":"Mohammad Soleymani","orcid":"https://orcid.org/0000-0002-5873-1434"},"institutions":[{"id":"https://openalex.org/I1174212","display_name":"University of Southern California","ror":"https://ror.org/03taz7m60","country_code":"US","type":"education","lineage":["https://openalex.org/I1174212"]},{"id":"https://openalex.org/I4210087747","display_name":"Creative Technologies (United States)","ror":"https://ror.org/001qkb777","country_code":"US","type":"company","lineage":["https://openalex.org/I4210087747"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Mohammad Soleymani","raw_affiliation_strings":["Institute for Creative Technologies, University of Southern California,Los Angeles,CA,USA"],"affiliations":[{"raw_affiliation_string":"Institute for Creative Technologies, University of Southern California,Los Angeles,CA,USA","institution_ids":["https://openalex.org/I4210087747","https://openalex.org/I1174212"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5052633567"],"corresponding_institution_ids":["https://openalex.org/I130929987"],"apc_list":null,"apc_paid":null,"fwci":0.2597,"has_fulltext":true,"cited_by_count":1,"citation_normalized_percentile":{"value":0.50743045,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.9646000266075134,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.9646000266075134,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9085000157356262,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9027000069618225,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/autoencoder","display_name":"Autoencoder","score":0.8166632652282715},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.6078511476516724},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.592948853969574},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5784142017364502},{"id":"https://openalex.org/keywords/face","display_name":"Face (sociological concept)","score":0.4758843183517456},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.2955739200115204},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.1692860722541809},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.13064178824424744},{"id":"https://openalex.org/keywords/electrical-engineering","display_name":"Electrical engineering","score":0.09576395153999329},{"id":"https://openalex.org/keywords/sociology","display_name":"Sociology","score":0.08074599504470825}],"concepts":[{"id":"https://openalex.org/C101738243","wikidata":"https://www.wikidata.org/wiki/Q786435","display_name":"Autoencoder","level":3,"score":0.8166632652282715},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6078511476516724},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.592948853969574},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5784142017364502},{"id":"https://openalex.org/C2779304628","wikidata":"https://www.wikidata.org/wiki/Q3503480","display_name":"Face (sociological concept)","level":2,"score":0.4758843183517456},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.2955739200115204},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.1692860722541809},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.13064178824424744},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.09576395153999329},{"id":"https://openalex.org/C144024400","wikidata":"https://www.wikidata.org/wiki/Q21201","display_name":"Sociology","level":0,"score":0.08074599504470825},{"id":"https://openalex.org/C36289849","wikidata":"https://www.wikidata.org/wiki/Q34749","display_name":"Social science","level":1,"score":0.0},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/fg59268.2024.10581940","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/fg59268.2024.10581940","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE 18th International Conference on Automatic Face and Gesture Recognition (FG)","raw_type":"proceedings-article"},{"id":"pmh:oai:orbi.umons.ac.be:20.500.12907/51664","is_oa":true,"landing_page_url":"https://orbi.umons.ac.be/handle/20.500.12907/51664","pdf_url":"https://orbi.umons.ac.be/bitstream/20.500.12907/51664/1/FG2024_SocialMAE_preprint.pdf","source":{"id":"https://openalex.org/S7407055454","display_name":"ORBi UMONS","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"2024 IEEE 18th International Conference on Automatic Face and Gesture Recognition, FG 2024, 1-5 (2024-05-27); 2024 IEEE 18th International Conference on Automatic Face and Gesture Recognition (FG), Istanbul, Tur [Tur], 27-05-2024 => 31-05-2024","raw_type":"peer reviewed"}],"best_oa_location":{"id":"pmh:oai:orbi.umons.ac.be:20.500.12907/51664","is_oa":true,"landing_page_url":"https://orbi.umons.ac.be/handle/20.500.12907/51664","pdf_url":"https://orbi.umons.ac.be/bitstream/20.500.12907/51664/1/FG2024_SocialMAE_preprint.pdf","source":{"id":"https://openalex.org/S7407055454","display_name":"ORBi UMONS","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"2024 IEEE 18th International Conference on Automatic Face and Gesture Recognition, FG 2024, 1-5 (2024-05-27); 2024 IEEE 18th International Conference on Automatic Face and Gesture Recognition (FG), Istanbul, Tur [Tur], 27-05-2024 => 31-05-2024","raw_type":"peer reviewed"},"sustainable_development_goals":[{"display_name":"Decent work and economic growth","id":"https://metadata.un.org/sdg/8","score":0.41999998688697815}],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4400527620.pdf","grobid_xml":"https://content.openalex.org/works/W4400527620.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2748952813","https://openalex.org/W3013693939","https://openalex.org/W2159052453","https://openalex.org/W2566616303","https://openalex.org/W3131327266","https://openalex.org/W2734887215","https://openalex.org/W4297051394","https://openalex.org/W2752972570","https://openalex.org/W2145836866"],"abstract_inverted_index":{"Human":[0],"social":[1,43,67,89],"behaviors":[2],"are":[3,133],"inherently":[4],"multi-modal":[5],"necessitating":[6],"the":[7,76,85,123],"development":[8],"of":[9,32,54,65,78,125],"powerful":[10],"audiovisual":[11,24,42],"models":[12],"for":[13,118],"their":[14],"perception.":[15],"In":[16],"this":[17,79],"paper,":[18],"we":[19,46],"present":[20],"Social-MAE,":[21],"our":[22],"pre-trained":[23,40],"Masked":[25,35],"Autoencoder":[26],"based":[27],"on":[28,41,61,87,108],"an":[29],"extended":[30],"version":[31],"Contrastive":[33],"Audio-Visual":[34],"Auto-Encoder":[36],"(CAV-MAE),":[37],"which":[38],"is":[39],"data.":[44],"Specifically,":[45],"modify":[47],"CAV-MAE":[48],"to":[49],"receive":[50],"a":[51,62,71],"larger":[52],"number":[53],"frames":[55],"as":[56],"input":[57],"and":[58,83,90,99,112,115,130],"pre-train":[59],"it":[60],"large":[63],"dataset":[64],"human":[66],"interaction":[68],"(VoxCeleb2)":[69],"in":[70],"self-supervised":[72,127],"manner.":[73],"We":[74],"demonstrate":[75],"effectiveness":[77,124],"model":[80,86,104,131],"by":[81],"fine-tuning":[82],"evaluating":[84],"different":[88],"affective":[91],"downstream":[92],"tasks,":[93],"namely,":[94],"emotion":[95,110],"recognition,":[96],"laughter":[97,113],"detection":[98],"apparent":[100,119],"personality":[101,120],"estimation.":[102],"The":[103],"achieves":[105],"state-of-the-art":[106],"results":[107,117],"multimodal":[109],"recognition":[111,114],"competitive":[116],"estimation,":[121],"demonstrating":[122],"in-domain":[126],"pre-training.":[128],"Code":[129],"weight":[132],"available":[134],"here":[135],"https://github.com/HuBohy/SocialMAE.":[136]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2026-03-10T16:38:18.471706","created_date":"2025-10-10T00:00:00"}
