{"id":"https://openalex.org/W4416214881","doi":"https://doi.org/10.1109/tmm.2025.3632670","title":"UniAlign: A Universal Cross-Modality Knowledge Alignment Framework for Fine-Grained Action Recognition","display_name":"UniAlign: A Universal Cross-Modality Knowledge Alignment Framework for Fine-Grained Action Recognition","publication_year":2025,"publication_date":"2025-11-14","ids":{"openalex":"https://openalex.org/W4416214881","doi":"https://doi.org/10.1109/tmm.2025.3632670"},"language":null,"primary_location":{"id":"doi:10.1109/tmm.2025.3632670","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tmm.2025.3632670","pdf_url":null,"source":{"id":"https://openalex.org/S137030581","display_name":"IEEE Transactions on Multimedia","issn_l":"1520-9210","issn":["1520-9210","1941-0077"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Multimedia","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100428935","display_name":"Yihan Wang","orcid":"https://orcid.org/0000-0002-5353-6250"},"institutions":[{"id":"https://openalex.org/I2801561730","display_name":"Liaoning University of International Business and Economics","ror":"https://ror.org/03m6hya42","country_code":"CN","type":"education","lineage":["https://openalex.org/I2801561730"]},{"id":"https://openalex.org/I27357992","display_name":"Dalian University of Technology","ror":"https://ror.org/023hj5876","country_code":"CN","type":"education","lineage":["https://openalex.org/I27357992"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Yihan Wang","raw_affiliation_strings":["DUT-RU International School of Information Science &amp; Engineering, Dalian University of Technology, Dalian, China","DUT-RU International School of Infor mation Science &amp; Engineering, Dalian University of Technology, Liaoning, China"],"affiliations":[{"raw_affiliation_string":"DUT-RU International School of Information Science &amp; Engineering, Dalian University of Technology, Dalian, China","institution_ids":["https://openalex.org/I27357992"]},{"raw_affiliation_string":"DUT-RU International School of Infor mation Science &amp; Engineering, Dalian University of Technology, Liaoning, China","institution_ids":["https://openalex.org/I27357992","https://openalex.org/I2801561730"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5020149992","display_name":"Baoli Sun","orcid":"https://orcid.org/0000-0002-2861-4288"},"institutions":[{"id":"https://openalex.org/I2801561730","display_name":"Liaoning University of International Business and Economics","ror":"https://ror.org/03m6hya42","country_code":"CN","type":"education","lineage":["https://openalex.org/I2801561730"]},{"id":"https://openalex.org/I27357992","display_name":"Dalian University of Technology","ror":"https://ror.org/023hj5876","country_code":"CN","type":"education","lineage":["https://openalex.org/I27357992"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Baoli Sun","raw_affiliation_strings":["DUT-RU International School of Information Science &amp; Engineering, Dalian University of Technology, Dalian, China","DUT-RU International School of Infor mation Science &amp; Engineering, Dalian University of Technology, Liaoning, China"],"affiliations":[{"raw_affiliation_string":"DUT-RU International School of Information Science &amp; Engineering, Dalian University of Technology, Dalian, China","institution_ids":["https://openalex.org/I27357992"]},{"raw_affiliation_string":"DUT-RU International School of Infor mation Science &amp; Engineering, Dalian University of Technology, Liaoning, China","institution_ids":["https://openalex.org/I27357992","https://openalex.org/I2801561730"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100616471","display_name":"Haojie Li","orcid":"https://orcid.org/0000-0003-3882-2205"},"institutions":[{"id":"https://openalex.org/I80143920","display_name":"Shandong University of Science and Technology","ror":"https://ror.org/04gtjhw98","country_code":"CN","type":"education","lineage":["https://openalex.org/I80143920"]},{"id":"https://openalex.org/I4210134794","display_name":"Shandong Institute of Quantum Science and Technology","ror":"https://ror.org/042pabj96","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210134794"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Haojie Li","raw_affiliation_strings":["Shandong Science and Technology, Qingdao, China","Shandong Science and Technology, China"],"affiliations":[{"raw_affiliation_string":"Shandong Science and Technology, Qingdao, China","institution_ids":["https://openalex.org/I80143920"]},{"raw_affiliation_string":"Shandong Science and Technology, China","institution_ids":["https://openalex.org/I4210134794"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5030101527","display_name":"Xinzhu Ma","orcid":"https://orcid.org/0000-0003-0504-0186"},"institutions":[{"id":"https://openalex.org/I82880672","display_name":"Beihang University","ror":"https://ror.org/00wk2mp56","country_code":"CN","type":"education","lineage":["https://openalex.org/I82880672"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xinzhu Ma","raw_affiliation_strings":["Beihang University, Beijing, China","Beihang University, China"],"affiliations":[{"raw_affiliation_string":"Beihang University, Beijing, China","institution_ids":["https://openalex.org/I82880672"]},{"raw_affiliation_string":"Beihang University, China","institution_ids":["https://openalex.org/I82880672"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5043228501","display_name":"Zhihui Wang","orcid":"https://orcid.org/0000-0002-8024-7753"},"institutions":[{"id":"https://openalex.org/I129604602","display_name":"University of Sydney","ror":"https://ror.org/0384j8v12","country_code":"AU","type":"education","lineage":["https://openalex.org/I129604602"]},{"id":"https://openalex.org/I27357992","display_name":"Dalian University of Technology","ror":"https://ror.org/023hj5876","country_code":"CN","type":"education","lineage":["https://openalex.org/I27357992"]}],"countries":["AU","CN"],"is_corresponding":false,"raw_author_name":"Zhihui Wang","raw_affiliation_strings":["DUT-RU International School of Information Science &amp; Engineering, Dalian University of Technology, Dalian, China","University of Sydney, Camperdown, NSW, Australia","University of Sydney, Australia"],"affiliations":[{"raw_affiliation_string":"DUT-RU International School of Information Science &amp; Engineering, Dalian University of Technology, Dalian, China","institution_ids":["https://openalex.org/I27357992"]},{"raw_affiliation_string":"University of Sydney, Camperdown, NSW, Australia","institution_ids":["https://openalex.org/I129604602"]},{"raw_affiliation_string":"University of Sydney, Australia","institution_ids":["https://openalex.org/I129604602"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5070657279","display_name":"Z X Wang","orcid":"https://orcid.org/0009-0001-8207-4294"},"institutions":[{"id":"https://openalex.org/I27357992","display_name":"Dalian University of Technology","ror":"https://ror.org/023hj5876","country_code":"CN","type":"education","lineage":["https://openalex.org/I27357992"]},{"id":"https://openalex.org/I2801561730","display_name":"Liaoning University of International Business and Economics","ror":"https://ror.org/03m6hya42","country_code":"CN","type":"education","lineage":["https://openalex.org/I2801561730"]},{"id":"https://openalex.org/I129604602","display_name":"University of Sydney","ror":"https://ror.org/0384j8v12","country_code":"AU","type":"education","lineage":["https://openalex.org/I129604602"]}],"countries":["AU","CN"],"is_corresponding":false,"raw_author_name":"Zhiyong Wang","raw_affiliation_strings":["DUT-RU International School of Information Science &amp; Engineering, Dalian University of Technology, Dalian, China","University of Sydney, Camperdown, NSW, Australia","DUT-RU International School of Infor mation Science &amp; Engineering, Dalian University of Technology, Liaoning, China"],"affiliations":[{"raw_affiliation_string":"DUT-RU International School of Information Science &amp; Engineering, Dalian University of Technology, Dalian, China","institution_ids":["https://openalex.org/I27357992"]},{"raw_affiliation_string":"University of Sydney, Camperdown, NSW, Australia","institution_ids":["https://openalex.org/I129604602"]},{"raw_affiliation_string":"DUT-RU International School of Infor mation Science &amp; Engineering, Dalian University of Technology, Liaoning, China","institution_ids":["https://openalex.org/I27357992","https://openalex.org/I2801561730"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5100428935"],"corresponding_institution_ids":["https://openalex.org/I27357992","https://openalex.org/I2801561730"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.37003571,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"28","issue":null,"first_page":"891","last_page":"901"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9825000166893005,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9825000166893005,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12290","display_name":"Human Motion and Animation","score":0.003000000026077032,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10653","display_name":"Robot Manipulation and Learning","score":0.002899999963119626,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.6222000122070312},{"id":"https://openalex.org/keywords/action-recognition","display_name":"Action recognition","score":0.6018999814987183},{"id":"https://openalex.org/keywords/discriminative-model","display_name":"Discriminative model","score":0.5807999968528748},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.4812000095844269},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.47839999198913574},{"id":"https://openalex.org/keywords/action","display_name":"Action (physics)","score":0.45500001311302185},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.4514000117778778},{"id":"https://openalex.org/keywords/pose","display_name":"Pose","score":0.44209998846054077},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.4406000077724457}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8752999901771545},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.708899974822998},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.6222000122070312},{"id":"https://openalex.org/C2987834672","wikidata":"https://www.wikidata.org/wiki/Q4677630","display_name":"Action recognition","level":3,"score":0.6018999814987183},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.5807999968528748},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.4812000095844269},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.47839999198913574},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.45500001311302185},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.4514000117778778},{"id":"https://openalex.org/C52102323","wikidata":"https://www.wikidata.org/wiki/Q1671968","display_name":"Pose","level":2,"score":0.44209998846054077},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.4406000077724457},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.43130001425743103},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.37229999899864197},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.3513999879360199},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3499000072479248},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.33500000834465027},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3240000009536743},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.3199000060558319},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.3156000077724457},{"id":"https://openalex.org/C94124525","wikidata":"https://www.wikidata.org/wiki/Q912550","display_name":"Categorization","level":2,"score":0.31360000371932983},{"id":"https://openalex.org/C2779903281","wikidata":"https://www.wikidata.org/wiki/Q6888026","display_name":"Modalities","level":2,"score":0.30979999899864197},{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.29159998893737793},{"id":"https://openalex.org/C2781122975","wikidata":"https://www.wikidata.org/wiki/Q16928266","display_name":"Semantic feature","level":2,"score":0.27149999141693115},{"id":"https://openalex.org/C83665646","wikidata":"https://www.wikidata.org/wiki/Q42139305","display_name":"Feature vector","level":2,"score":0.2694000005722046},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.26809999346733093},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.25540000200271606},{"id":"https://openalex.org/C64876066","wikidata":"https://www.wikidata.org/wiki/Q5141226","display_name":"Cognitive neuroscience of visual object recognition","level":3,"score":0.2524999976158142}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tmm.2025.3632670","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tmm.2025.3632670","pdf_url":null,"source":{"id":"https://openalex.org/S137030581","display_name":"IEEE Transactions on Multimedia","issn_l":"1520-9210","issn":["1520-9210","1941-0077"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Multimedia","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":54,"referenced_works":["https://openalex.org/W639708223","https://openalex.org/W1536680647","https://openalex.org/W2049125285","https://openalex.org/W2616247523","https://openalex.org/W2752401453","https://openalex.org/W2799211965","https://openalex.org/W2895243423","https://openalex.org/W2916798096","https://openalex.org/W2944006115","https://openalex.org/W2963076553","https://openalex.org/W2963315828","https://openalex.org/W2976818183","https://openalex.org/W2981851019","https://openalex.org/W2981923053","https://openalex.org/W2990152177","https://openalex.org/W2990503944","https://openalex.org/W3002271958","https://openalex.org/W3034399482","https://openalex.org/W3035180180","https://openalex.org/W3035225512","https://openalex.org/W3035303837","https://openalex.org/W3109392981","https://openalex.org/W3126721948","https://openalex.org/W3162565403","https://openalex.org/W3175624454","https://openalex.org/W3184346264","https://openalex.org/W3195639294","https://openalex.org/W3203634062","https://openalex.org/W3203711169","https://openalex.org/W3206930349","https://openalex.org/W3216270236","https://openalex.org/W4205605843","https://openalex.org/W4214612132","https://openalex.org/W4214614183","https://openalex.org/W4225414521","https://openalex.org/W4285606530","https://openalex.org/W4304080586","https://openalex.org/W4312245820","https://openalex.org/W4312257792","https://openalex.org/W4312558481","https://openalex.org/W4312560592","https://openalex.org/W4312614039","https://openalex.org/W4312658081","https://openalex.org/W4312982586","https://openalex.org/W4313482657","https://openalex.org/W4319299930","https://openalex.org/W4386072015","https://openalex.org/W4386075819","https://openalex.org/W4386076398","https://openalex.org/W4390872434","https://openalex.org/W4390874024","https://openalex.org/W4391383802","https://openalex.org/W4394593089","https://openalex.org/W4403842425"],"related_works":[],"abstract_inverted_index":{"The":[0],"key":[1],"to":[2,26,87,105,124,179,184],"fine-grained":[3,69,212],"video":[4,145],"action":[5,12,21,31,70,115,141,213],"recognition":[6,116,214],"is":[7],"identifying":[8],"subtle":[9],"differences":[10],"between":[11],"categories.":[13],"Relying":[14],"solely":[15],"on":[16,209],"visual":[17],"features":[18,127],"supervised":[19],"by":[20,195],"labels":[22],"makes":[23],"it":[24],"challenging":[25],"characterize":[27],"robust":[28,84],"and":[29,42,52,57,128,136,149,220],"discriminative":[30],"dynamics":[32],"from":[33,109],"videos.":[34],"With":[35],"significant":[36],"advancements":[37],"in":[38,68,76],"human":[39,54],"pose":[40,55,126,134,147],"estimation":[41],"the":[43,73,107,132,140,162,186,191,203,225,228],"powerful":[44],"capabilities":[45],"of":[46,164,227],"Vision-Language":[47],"Models":[48],"(VLMs),":[49],"obtaining":[50],"reliable":[51],"cost-free":[53],"data":[56],"textual":[58,129,150,187],"semantics":[59,130],"has":[60],"become":[61],"increasingly":[62],"feasible,":[63],"enabling":[64],"their":[65],"effective":[66],"use":[67],"recognition.":[71],"However,":[72],"inherent":[74],"disparities":[75],"feature":[77],"representations":[78,193],"across":[79],"different":[80,165],"modalities":[81],"necessitate":[82],"a":[83,97,154,175],"alignment":[85,101],"strategy":[86],"achieve":[88],"opti":[89],"mal":[90],"fusion.":[91],"To":[92,138],"address":[93],"this,":[94],"we":[95,152,173],"propose":[96,153],"universal":[98],"cross-modality":[99],"knowledge":[100,108],"framework,":[102],"namely":[103],"UniAlign,":[104],"transfer":[106],"such":[110],"pre-trained":[111,133],"multi-modal":[112],"models":[113],"into":[114],"models.":[117],"Specifically,":[118],"UniAlign":[119,230],"introduces":[120],"two":[121],"additional":[122],"branches":[123],"extract":[125],"with":[131],"encoder":[135],"VLM.":[137],"align":[139],"relevant":[142],"cues":[143,167],"among":[144],"features,":[146,148],"semantics,":[151,188],"Cross-Modality":[155],"Similarity":[156],"Aggregation":[157],"module":[158],"(CMSA)":[159],"that":[160,190],"utilizes":[161],"importance":[163],"modal":[166],"while":[168,199],"aggregating":[169],"cross-modal":[170],"similarities.":[171],"Additionally,":[172],"adopt":[174],"fine-tuning":[176],"mechanism":[177],"similar":[178],"Exponential":[180],"Moving":[181],"Average":[182],"(EMA)":[183],"refine":[185],"ensuring":[189],"semantic":[192],"encoded":[194],"VLMs":[196],"are":[197],"preserved":[198],"being":[200],"optimized":[201],"towards":[202],"specific":[204],"task":[205],"preferences.":[206],"Extensive":[207],"experiments":[208],"widely":[210],"used":[211],"benchmarks":[215],"(e.g.,":[216],"FineGym,":[217],"NTURGB-D,":[218],"Diving48)":[219],"coarse-grained":[221],"K400":[222],"dataset":[223],"demonstrate":[224],"effectiveness":[226],"proposed":[229],"method.":[231]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-11-14T00:00:00"}
