{"id":"https://openalex.org/W7131071969","doi":"https://doi.org/10.1109/iccvw69036.2025.00281","title":"Learning Robust Aligned Representations Across Multiple Visual Modalities in Human Action Recognition","display_name":"Learning Robust Aligned Representations Across Multiple Visual Modalities in Human Action Recognition","publication_year":2025,"publication_date":"2025-10-19","ids":{"openalex":"https://openalex.org/W7131071969","doi":"https://doi.org/10.1109/iccvw69036.2025.00281"},"language":"en","primary_location":{"id":"doi:10.1109/iccvw69036.2025.00281","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iccvw69036.2025.00281","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/CVF International Conference on Computer Vision Workshops (ICCVW)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5102858494","display_name":"David Lerch","orcid":"https://orcid.org/0009-0008-6707-3059"},"institutions":[{"id":"https://openalex.org/I4210111500","display_name":"Fraunhofer Institute of Optronics, System Technologies and Image Exploitation","ror":"https://ror.org/01zx97922","country_code":"DE","type":"facility","lineage":["https://openalex.org/I4210111500","https://openalex.org/I4923324"]}],"countries":["DE"],"is_corresponding":true,"raw_author_name":"David J. Lerch","raw_affiliation_strings":["Fraunhofer IOSB,Karlsruhe,Germany"],"affiliations":[{"raw_affiliation_string":"Fraunhofer IOSB,Karlsruhe,Germany","institution_ids":["https://openalex.org/I4210111500"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125465742","display_name":"Bastian Rothenburger","orcid":null},"institutions":[{"id":"https://openalex.org/I114090438","display_name":"Goethe University Frankfurt","ror":"https://ror.org/04cvxnb49","country_code":"DE","type":"education","lineage":["https://openalex.org/I114090438"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Bastian Rothenburger","raw_affiliation_strings":["Goethe University Frankfurt,Frankfurt,Germany"],"affiliations":[{"raw_affiliation_string":"Goethe University Frankfurt,Frankfurt,Germany","institution_ids":["https://openalex.org/I114090438"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5025665858","display_name":"Zeyun Zhong","orcid":null},"institutions":[{"id":"https://openalex.org/I4210111500","display_name":"Fraunhofer Institute of Optronics, System Technologies and Image Exploitation","ror":"https://ror.org/01zx97922","country_code":"DE","type":"facility","lineage":["https://openalex.org/I4210111500","https://openalex.org/I4923324"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Zeyun Zhong","raw_affiliation_strings":["Fraunhofer IOSB,Karlsruhe,Germany"],"affiliations":[{"raw_affiliation_string":"Fraunhofer IOSB,Karlsruhe,Germany","institution_ids":["https://openalex.org/I4210111500"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039656710","display_name":"Manuel Gil Mart\u00edn","orcid":null},"institutions":[{"id":"https://openalex.org/I4210111500","display_name":"Fraunhofer Institute of Optronics, System Technologies and Image Exploitation","ror":"https://ror.org/01zx97922","country_code":"DE","type":"facility","lineage":["https://openalex.org/I4210111500","https://openalex.org/I4923324"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Manuel Martin","raw_affiliation_strings":["Fraunhofer IOSB,Karlsruhe,Germany"],"affiliations":[{"raw_affiliation_string":"Fraunhofer IOSB,Karlsruhe,Germany","institution_ids":["https://openalex.org/I4210111500"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5121799974","display_name":"Frederik Diederichs","orcid":null},"institutions":[{"id":"https://openalex.org/I4210111500","display_name":"Fraunhofer Institute of Optronics, System Technologies and Image Exploitation","ror":"https://ror.org/01zx97922","country_code":"DE","type":"facility","lineage":["https://openalex.org/I4210111500","https://openalex.org/I4923324"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Frederik Diederichs","raw_affiliation_strings":["Fraunhofer IOSB,Karlsruhe,Germany"],"affiliations":[{"raw_affiliation_string":"Fraunhofer IOSB,Karlsruhe,Germany","institution_ids":["https://openalex.org/I4210111500"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5126643275","display_name":"Rainer Stiefelhagen","orcid":null},"institutions":[{"id":"https://openalex.org/I102335020","display_name":"Karlsruhe Institute of Technology","ror":"https://ror.org/04t3en479","country_code":"DE","type":"education","lineage":["https://openalex.org/I102335020","https://openalex.org/I1305996414"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Rainer Stiefelhagen","raw_affiliation_strings":["Karlsruhe Institute of Technology (KIT),Karlsruhe,Germany"],"affiliations":[{"raw_affiliation_string":"Karlsruhe Institute of Technology (KIT),Karlsruhe,Germany","institution_ids":["https://openalex.org/I102335020"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5102858494"],"corresponding_institution_ids":["https://openalex.org/I4210111500"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.75044567,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"2700","last_page":"2710"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.920799970626831,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.920799970626831,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.031099999323487282,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11431","display_name":"Action Observation and Synchronization","score":0.006500000134110451,"subfield":{"id":"https://openalex.org/subfields/3207","display_name":"Social Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.607699990272522},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.6017000079154968},{"id":"https://openalex.org/keywords/action-recognition","display_name":"Action recognition","score":0.597100019454956},{"id":"https://openalex.org/keywords/pairwise-comparison","display_name":"Pairwise comparison","score":0.5073999762535095},{"id":"https://openalex.org/keywords/feature-learning","display_name":"Feature learning","score":0.5023000240325928},{"id":"https://openalex.org/keywords/modalities","display_name":"Modalities","score":0.48840001225471497},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.46149998903274536},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.4334999918937683},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.4124000072479248}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7835000157356262},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6809999942779541},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.607699990272522},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.6017000079154968},{"id":"https://openalex.org/C2987834672","wikidata":"https://www.wikidata.org/wiki/Q4677630","display_name":"Action recognition","level":3,"score":0.597100019454956},{"id":"https://openalex.org/C184898388","wikidata":"https://www.wikidata.org/wiki/Q1435712","display_name":"Pairwise comparison","level":2,"score":0.5073999762535095},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.5023000240325928},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.48840001225471497},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.46149998903274536},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.4334999918937683},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.4124000072479248},{"id":"https://openalex.org/C121687571","wikidata":"https://www.wikidata.org/wiki/Q4677630","display_name":"Activity recognition","level":2,"score":0.4065999984741211},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.38920000195503235},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.34950000047683716},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.326200008392334},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.30649998784065247},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.28200000524520874},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.2750999927520752},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.2712000012397766},{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.2709999978542328},{"id":"https://openalex.org/C4668613","wikidata":"https://www.wikidata.org/wiki/Q4116110","display_name":"Structural alignment","level":5,"score":0.2703999876976013},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.258899986743927},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.2556999921798706},{"id":"https://openalex.org/C64876066","wikidata":"https://www.wikidata.org/wiki/Q5141226","display_name":"Cognitive neuroscience of visual object recognition","level":3,"score":0.25440001487731934},{"id":"https://openalex.org/C2780660688","wikidata":"https://www.wikidata.org/wiki/Q25052564","display_name":"Multimodal learning","level":2,"score":0.2508000135421753}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/iccvw69036.2025.00281","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iccvw69036.2025.00281","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/CVF International Conference on Computer Vision Workshops (ICCVW)","raw_type":"proceedings-article"},{"id":"pmh:oai:publica.fraunhofer.de:publica/509090","is_oa":false,"landing_page_url":"https://publica.fraunhofer.de/handle/publica/509090","pdf_url":null,"source":{"id":"https://openalex.org/S4306400318","display_name":"Fraunhofer-Publica (Fraunhofer-Gesellschaft)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I4923324","host_organization_name":"Fraunhofer-Gesellschaft","host_organization_lineage":["https://openalex.org/I4923324"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"conference paper"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":58,"referenced_works":["https://openalex.org/W1522734439","https://openalex.org/W1950788856","https://openalex.org/W2020163092","https://openalex.org/W2105101328","https://openalex.org/W2146634731","https://openalex.org/W2326925005","https://openalex.org/W2559833261","https://openalex.org/W2613570903","https://openalex.org/W2736334449","https://openalex.org/W2770240892","https://openalex.org/W2776217220","https://openalex.org/W2782057162","https://openalex.org/W2802861902","https://openalex.org/W2944006115","https://openalex.org/W2951183276","https://openalex.org/W2963076818","https://openalex.org/W2963288541","https://openalex.org/W2963524571","https://openalex.org/W2963563276","https://openalex.org/W2964134613","https://openalex.org/W2986674040","https://openalex.org/W3002271958","https://openalex.org/W3009762334","https://openalex.org/W3034697730","https://openalex.org/W3034996364","https://openalex.org/W3082819019","https://openalex.org/W3126721948","https://openalex.org/W3205898195","https://openalex.org/W4247726808","https://openalex.org/W4282981352","https://openalex.org/W4285061168","https://openalex.org/W4285555510","https://openalex.org/W4285606530","https://openalex.org/W4312245820","https://openalex.org/W4312372834","https://openalex.org/W4312558481","https://openalex.org/W4312620624","https://openalex.org/W4312757522","https://openalex.org/W4312841534","https://openalex.org/W4319299787","https://openalex.org/W4319299930","https://openalex.org/W4319300817","https://openalex.org/W4382240189","https://openalex.org/W4385245566","https://openalex.org/W4385805103","https://openalex.org/W4386071707","https://openalex.org/W4386072441","https://openalex.org/W4386215080","https://openalex.org/W4387968449","https://openalex.org/W4390871931","https://openalex.org/W4399404947","https://openalex.org/W4402670813","https://openalex.org/W4402727764","https://openalex.org/W4403842392","https://openalex.org/W4405778915","https://openalex.org/W4406813996","https://openalex.org/W4408713136","https://openalex.org/W4411150224"],"related_works":[],"abstract_inverted_index":{"We":[0,72],"propose":[1,73],"Cross-Modal":[2],"Video":[3],"Representation":[4],"Alignment":[5],"(CMVRA),":[6],"a":[7,74,125],"novel":[8],"framework":[9,78],"for":[10,63,179,193],"human":[11],"action":[12,194],"recognition":[13,65,132,162],"that":[14,79,99,142],"leverages":[15],"multiple":[16],"visual":[17],"modalities\u2014RGB,":[18],"infrared":[19],"(IR),":[20],"depth,":[21,82],"and":[22,52,69,84,89,114,149,163,187,198],"skeleton":[23,85],"data\u2014to":[24],"learn":[25],"robust,":[26],"generalizable":[27],"representations":[28,55],"with":[29],"reduced":[30],"reliance":[31],"on":[32,111,133,139,207],"annotated":[33],"data.":[34],"By":[35],"employing":[36],"contrastive":[37,169,185],"learning,":[38],"CMVRA":[39,123,143],"effectively":[40],"aligns":[41,80],"these":[42],"modalities,":[43],"enhancing":[44,87],"the":[45,112,118,145,156,165,188,199],"model's":[46],"ability":[47],"to":[48],"integrate":[49],"complementary":[50],"information":[51],"capture":[53],"richer":[54],"across":[56,171],"domains.":[57],"This":[58],"multi-modal":[59,76,102,160,184],"alignment":[60,95,103],"is":[61],"crucial":[62],"improving":[64],"performance":[66],"in":[67,128,183],"diverse":[68,172],"challenging":[70],"contexts.":[71],"unified":[75],"embedding":[77],"RGB,":[81],"infrared,":[83],"data,":[86],"robustness":[88],"feature":[90],"diversity,":[91],"while":[92],"also":[93],"advancing":[94],"techniques":[96],"by":[97],"demonstrating":[98],"fully":[100],"integrated":[101],"outperforms":[104],"traditional":[105],"pairwise":[106],"strategies.":[107],"Extensive":[108],"experiments":[109],"conducted":[110],"NTU":[113,140],"Drive&Act":[115],"datasets":[116],"confirm":[117],"effectiveness":[119],"of":[120,158,167,190],"our":[121],"approach.":[122],"achieves":[124],"3.01%":[126],"improvement":[127],"3D":[129],"skeleton-based":[130],"activity":[131,161],"Drive&Act,":[134],"outperforming":[135],"state-of-the-art":[136],"methods.":[137,152],"Experiments":[138],"show":[141],"closes":[144],"gap":[146],"between":[147],"self-supervised":[148,159],"supervised":[150],"learning":[151,170,186],"These":[153],"results":[154],"highlight":[155],"potential":[157],"emphasize":[164],"benefits":[166],"leveraging":[168],"modalities.":[173],"Our":[174,196],"findings":[175],"suggest":[176],"promising":[177],"directions":[178],"future":[180],"research,":[181],"particularly":[182],"integration":[189],"vision-language":[191],"models":[192],"recognition.":[195],"code":[197],"generated":[200],"video":[201],"captions":[202],"will":[203],"be":[204],"made":[205],"available":[206],"GitHub.":[208]},"counts_by_year":[],"updated_date":"2026-03-12T08:34:05.389933","created_date":"2026-02-24T00:00:00"}
