{"id":"https://openalex.org/W4389299497","doi":"https://doi.org/10.1145/3633781","title":"Efficient Video Transformers via Spatial-temporal Token Merging for Action Recognition","display_name":"Efficient Video Transformers via Spatial-temporal Token Merging for Action Recognition","publication_year":2023,"publication_date":"2023-12-04","ids":{"openalex":"https://openalex.org/W4389299497","doi":"https://doi.org/10.1145/3633781"},"language":"en","primary_location":{"id":"doi:10.1145/3633781","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3633781","pdf_url":null,"source":{"id":"https://openalex.org/S19610489","display_name":"ACM Transactions on Multimedia Computing Communications and Applications","issn_l":"1551-6857","issn":["1551-6857","1551-6865"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Multimedia Computing, Communications, and Applications","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101647310","display_name":"Zhanzhou Feng","orcid":"https://orcid.org/0009-0006-0071-454X"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Zhanzhou Feng","raw_affiliation_strings":["National Key Laboratory for Multimedia Information Processing, School of Computer Science, Peking University, China"],"affiliations":[{"raw_affiliation_string":"National Key Laboratory for Multimedia Information Processing, School of Computer Science, Peking University, China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101904779","display_name":"Jiaming Xu","orcid":"https://orcid.org/0009-0009-5142-9177"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jiaming Xu","raw_affiliation_strings":["School of Electronic Engineering and Computer Science, Peking University, China"],"affiliations":[{"raw_affiliation_string":"School of Electronic Engineering and Computer Science, Peking University, China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100462123","display_name":"\u041b\u0435\u0439 \u041c\u0430","orcid":"https://orcid.org/0000-0001-6024-3854"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]},{"id":"https://openalex.org/I4210100255","display_name":"Beijing Academy of Artificial Intelligence","ror":"https://ror.org/016a74861","country_code":"CN","type":"other","lineage":["https://openalex.org/I4210100255"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Lei Ma","raw_affiliation_strings":["National Biomedical Imaging Center, College of Future Technology, Peking University, National Key Laboratory for Multimedia Information Processing, School of Computer Science, Peking University, \rBeijing Academy of Artificial Intelligence, China"],"affiliations":[{"raw_affiliation_string":"National Biomedical Imaging Center, College of Future Technology, Peking University, National Key Laboratory for Multimedia Information Processing, School of Computer Science, Peking University, \rBeijing Academy of Artificial Intelligence, China","institution_ids":["https://openalex.org/I4210100255","https://openalex.org/I20231570"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5055433405","display_name":"Shiliang Zhang","orcid":"https://orcid.org/0000-0001-9053-9314"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shiliang Zhang","raw_affiliation_strings":["National Key Laboratory for Multimedia Information Processing, School of Computer Science, Peking University, China"],"affiliations":[{"raw_affiliation_string":"National Key Laboratory for Multimedia Information Processing, School of Computer Science, Peking University, China","institution_ids":["https://openalex.org/I20231570"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5101647310"],"corresponding_institution_ids":["https://openalex.org/I20231570"],"apc_list":null,"apc_paid":null,"fwci":1.4303,"has_fulltext":false,"cited_by_count":12,"citation_normalized_percentile":{"value":0.84425363,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":94,"max":99},"biblio":{"volume":"20","issue":"4","first_page":"1","last_page":"21"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12740","display_name":"Gait Recognition and Analysis","score":0.9939000010490417,"subfield":{"id":"https://openalex.org/subfields/2204","display_name":"Biomedical Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11227","display_name":"Diabetic Foot Ulcer Assessment and Management","score":0.9843999743461609,"subfield":{"id":"https://openalex.org/subfields/2712","display_name":"Endocrinology, Diabetes and Metabolism"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8200471997261047},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.8123538494110107},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.6159161329269409},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5882136225700378},{"id":"https://openalex.org/keywords/merge","display_name":"Merge (version control)","score":0.5489691495895386},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.5088789463043213},{"id":"https://openalex.org/keywords/action-recognition","display_name":"Action recognition","score":0.4664907455444336},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.425085186958313},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.4193662405014038},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.09674513339996338}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8200471997261047},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.8123538494110107},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.6159161329269409},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5882136225700378},{"id":"https://openalex.org/C197129107","wikidata":"https://www.wikidata.org/wiki/Q1921621","display_name":"Merge (version control)","level":2,"score":0.5489691495895386},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.5088789463043213},{"id":"https://openalex.org/C2987834672","wikidata":"https://www.wikidata.org/wiki/Q4677630","display_name":"Action recognition","level":3,"score":0.4664907455444336},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.425085186958313},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.4193662405014038},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.09674513339996338},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.0},{"id":"https://openalex.org/C2777212361","wikidata":"https://www.wikidata.org/wiki/Q5127848","display_name":"Class (philosophy)","level":2,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3633781","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3633781","pdf_url":null,"source":{"id":"https://openalex.org/S19610489","display_name":"ACM Transactions on Multimedia Computing Communications and Applications","issn_l":"1551-6857","issn":["1551-6857","1551-6865"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Multimedia Computing, Communications, and Applications","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1121271761","display_name":null,"funder_award_id":"Program","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G1231421488","display_name":null,"funder_award_id":"under","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G1743894166","display_name":null,"funder_award_id":"U20B2052, 61936011","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G2082596075","display_name":null,"funder_award_id":"2018YFE0118400","funder_id":"https://openalex.org/F4320335777","funder_display_name":"National Key Research and Development Program of China"},{"id":"https://openalex.org/G2087396116","display_name":null,"funder_award_id":"China","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G3317480652","display_name":null,"funder_award_id":"Science","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G3637388763","display_name":null,"funder_award_id":"U20B2052","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G5994120800","display_name":null,"funder_award_id":"Natural","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G6658023004","display_name":null,"funder_award_id":"61936011","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G7726157001","display_name":null,"funder_award_id":"Grant No.","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320335777","display_name":"National Key Research and Development Program of China","ror":null}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":59,"referenced_works":["https://openalex.org/W2625366777","https://openalex.org/W2896457183","https://openalex.org/W2899030863","https://openalex.org/W2961553857","https://openalex.org/W2963091558","https://openalex.org/W2963155035","https://openalex.org/W2963370182","https://openalex.org/W2963526497","https://openalex.org/W2974686944","https://openalex.org/W2976669726","https://openalex.org/W2981385151","https://openalex.org/W2990152177","https://openalex.org/W2990503944","https://openalex.org/W3004505825","https://openalex.org/W3034572008","https://openalex.org/W3035104321","https://openalex.org/W3035303837","https://openalex.org/W3092462694","https://openalex.org/W3094502228","https://openalex.org/W3096609285","https://openalex.org/W3121052081","https://openalex.org/W3131500599","https://openalex.org/W3168124404","https://openalex.org/W3170841864","https://openalex.org/W3172345956","https://openalex.org/W3172908893","https://openalex.org/W3173621652","https://openalex.org/W3177141386","https://openalex.org/W3205497712","https://openalex.org/W4214516465","https://openalex.org/W4214588794","https://openalex.org/W4214612132","https://openalex.org/W4221156361","https://openalex.org/W4226133174","https://openalex.org/W4281749424","https://openalex.org/W4287122452","https://openalex.org/W4287792756","https://openalex.org/W4304732370","https://openalex.org/W4306886919","https://openalex.org/W4307639905","https://openalex.org/W4310877696","https://openalex.org/W4311412445","https://openalex.org/W4311559518","https://openalex.org/W4312560592","https://openalex.org/W4313156423","https://openalex.org/W4313476633","https://openalex.org/W4315491145","https://openalex.org/W4319452844","https://openalex.org/W4322716158","https://openalex.org/W4323897056","https://openalex.org/W4376647170","https://openalex.org/W4384159680","https://openalex.org/W6777047548","https://openalex.org/W6794642395","https://openalex.org/W6796750486","https://openalex.org/W6803028189","https://openalex.org/W6803870738","https://openalex.org/W6846263370","https://openalex.org/W6846577953"],"related_works":["https://openalex.org/W4388335561","https://openalex.org/W2970530566","https://openalex.org/W2967478618","https://openalex.org/W4385009901","https://openalex.org/W4385572700","https://openalex.org/W4307309205","https://openalex.org/W3016124757","https://openalex.org/W2981757109","https://openalex.org/W4283332100","https://openalex.org/W4361193049"],"abstract_inverted_index":{"Transformer":[0,169],"has":[1],"exhibited":[2],"promising":[3,202],"performance":[4,219],"in":[5,16,37,107,120],"various":[6,187],"video":[7,29,83,188],"recognition":[8,33,195],"tasks":[9],"but":[10],"brings":[11,224],"a":[12,50,70,140,176,217],"huge":[13],"computational":[14,235],"cost":[15],"modeling":[17],"spatial-temporal":[18,130],"cues.":[19],"This":[20,154],"work":[21],"aims":[22],"to":[23,60,65,100,164,183],"boost":[24],"the":[25,62,66,82,102,121,126,158,173,193,208,233],"efficiency":[26],"of":[27,45,104,129,132,143,151,160,186,211,227],"existing":[28],"transformers":[30],"for":[31,116],"action":[32,194],"through":[34],"eliminating":[35],"redundancies":[36],"their":[38],"tokens":[39,63,91,94,106,133,144,161],"and":[40,52,92,98,110,148,171,198],"efficiently":[41],"learning":[42],"motion":[43,117,135,149],"cues":[44,118],"moving":[46,152],"objects.":[47,153],"We":[48],"propose":[49],"lightweight":[51],"plug-and-play":[53],"module,":[54,178],"namely":[55],"Spatial-temporal":[56],"Token":[57],"Merger":[58],"(STTM),":[59],"merge":[61],"belonging":[64],"same":[67,234],"object":[68,79],"into":[69],"more":[71],"compact":[72,141],"representation.":[73],"STTM":[74,137,179],"first":[75],"adaptively":[76],"identifies":[77],"crucial":[78],"clues":[80],"underlying":[81],"as":[84,134],"meta":[85,93],"tokens.":[86],"Similarity":[87],"scores":[88],"between":[89],"input":[90],"are":[95],"hence":[96,138],"computed":[97],"used":[99],"guide":[101],"fusion":[103],"similar":[105,218],"both":[108,146],"spatial":[109],"temporal":[111],"domains,":[112],"respectively.":[113],"To":[114],"compensate":[115],"lost":[119],"merging":[122],"procedure,":[123],"we":[124],"compute":[125],"linear":[127],"aggregation":[128],"positions":[131],"features.":[136],"outputs":[139],"set":[142],"fusing":[145],"appearance":[147],"features":[150],"procedure":[155],"substantially":[156],"decreases":[157],"number":[159],"that":[162],"need":[163],"be":[165,181],"processed":[166],"by":[167,213],"each":[168],"block":[170],"boosts":[172],"efficiency.":[174],"As":[175],"general":[177],"can":[180],"applied":[182],"different":[184],"layers":[185],"Transformers.":[189],"Extensive":[190],"experiments":[191],"on":[192,220,230],"datasets":[196],"Kinectics-400":[197],"SSv2":[199,231],"demonstrate":[200],"its":[201],"performance.":[203],"For":[204],"example,":[205],"it":[206],"reduces":[207],"computation":[209],"complexity":[210],"ViT":[212],"38%":[214],"while":[215],"maintaining":[216],"Kinectics-400.":[221],"It":[222],"also":[223],"1.7%":[225],"gains":[226],"top-1":[228],"accuracy":[229],"under":[232],"cost.":[236]},"counts_by_year":[{"year":2025,"cited_by_count":10},{"year":2024,"cited_by_count":2}],"updated_date":"2026-04-13T07:58:08.660418","created_date":"2025-10-10T00:00:00"}
