{"id":"https://openalex.org/W4414110875","doi":"https://doi.org/10.1109/tpami.2025.3608284","title":"H <sub>2</sub> OT: Hierarchical Hourglass Tokenizer for Efficient Video Pose Transformers","display_name":"H <sub>2</sub> OT: Hierarchical Hourglass Tokenizer for Efficient Video Pose Transformers","publication_year":2025,"publication_date":"2025-09-10","ids":{"openalex":"https://openalex.org/W4414110875","doi":"https://doi.org/10.1109/tpami.2025.3608284","pmid":"https://pubmed.ncbi.nlm.nih.gov/40928906"},"language":"en","primary_location":{"id":"doi:10.1109/tpami.2025.3608284","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpami.2025.3608284","pdf_url":null,"source":{"id":"https://openalex.org/S199944782","display_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","issn_l":"0162-8828","issn":["0162-8828","1939-3539","2160-9292"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320439","host_organization_name":"IEEE Computer Society","host_organization_lineage":["https://openalex.org/P4310320439","https://openalex.org/P4310319808"],"host_organization_lineage_names":["IEEE Computer Society","Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","pubmed"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://hdl.handle.net/11572/469011","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100362614","display_name":"Wenhao Li","orcid":"https://orcid.org/0000-0001-8048-2668"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Wenhao Li","raw_affiliation_strings":["State Key Laboratory of General Artificial Intelligence, Peking University, Shenzhen Graduate School, Shenzhen, China","State Key Laboratory of General Artificial Intelligence, Peking University, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"State Key Laboratory of General Artificial Intelligence, Peking University, Shenzhen Graduate School, Shenzhen, China","institution_ids":["https://openalex.org/I20231570"]},{"raw_affiliation_string":"State Key Laboratory of General Artificial Intelligence, Peking University, Shenzhen, China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Mengyuan Liu","orcid":"https://orcid.org/0000-0002-6332-8316"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Mengyuan Liu","raw_affiliation_strings":["State Key Laboratory of General Artificial Intelligence, Peking University, Shenzhen Graduate School, Shenzhen, China","State Key Laboratory of General Artificial Intelligence, Peking University, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"State Key Laboratory of General Artificial Intelligence, Peking University, Shenzhen Graduate School, Shenzhen, China","institution_ids":["https://openalex.org/I20231570"]},{"raw_affiliation_string":"State Key Laboratory of General Artificial Intelligence, Peking University, Shenzhen, China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Hong Liu","orcid":"https://orcid.org/0000-0002-7498-6541"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hong Liu","raw_affiliation_strings":["State Key Laboratory of General Artificial Intelligence, Peking University, Shenzhen Graduate School, Shenzhen, China","State Key Laboratory of General Artificial Intelligence, Peking University, Shenzhen, China"],"affiliations":[{"raw_affiliation_string":"State Key Laboratory of General Artificial Intelligence, Peking University, Shenzhen Graduate School, Shenzhen, China","institution_ids":["https://openalex.org/I20231570"]},{"raw_affiliation_string":"State Key Laboratory of General Artificial Intelligence, Peking University, Shenzhen, China","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5042680345","display_name":"Pichao Wang","orcid":"https://orcid.org/0000-0002-1430-0237"},"institutions":[{"id":"https://openalex.org/I1311688040","display_name":"Amazon (United States)","ror":"https://ror.org/04mv4n011","country_code":"US","type":"company","lineage":["https://openalex.org/I1311688040"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Pichao Wang","raw_affiliation_strings":["Amazon AGI, Seattle, WA, USA","Amazon Prime Video, USA"],"affiliations":[{"raw_affiliation_string":"Amazon AGI, Seattle, WA, USA","institution_ids":["https://openalex.org/I1311688040"]},{"raw_affiliation_string":"Amazon Prime Video, USA","institution_ids":["https://openalex.org/I1311688040"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5023507910","display_name":"Shijian Lu","orcid":"https://orcid.org/0000-0002-6766-2506"},"institutions":[{"id":"https://openalex.org/I172675005","display_name":"Nanyang Technological University","ror":"https://ror.org/02e7b5302","country_code":"SG","type":"education","lineage":["https://openalex.org/I172675005"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Shijian Lu","raw_affiliation_strings":["College of Computing and Data Science, Nanyang Technological University, Singapore","School of Computer Science and Engineering, Nanyang Technological University, Singapore"],"affiliations":[{"raw_affiliation_string":"College of Computing and Data Science, Nanyang Technological University, Singapore","institution_ids":["https://openalex.org/I172675005"]},{"raw_affiliation_string":"School of Computer Science and Engineering, Nanyang Technological University, Singapore","institution_ids":["https://openalex.org/I172675005"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5027171279","display_name":"Nicu Sebe","orcid":"https://orcid.org/0000-0002-6597-7248"},"institutions":[{"id":"https://openalex.org/I193223587","display_name":"University of Trento","ror":"https://ror.org/05trd4x28","country_code":"IT","type":"education","lineage":["https://openalex.org/I193223587"]}],"countries":["IT"],"is_corresponding":false,"raw_author_name":"Nicu Sebe","raw_affiliation_strings":["University of Trento, Trento, Italy","University of Trento, Italy"],"affiliations":[{"raw_affiliation_string":"University of Trento, Trento, Italy","institution_ids":["https://openalex.org/I193223587"]},{"raw_affiliation_string":"University of Trento, Italy","institution_ids":["https://openalex.org/I193223587"]}]}],"institutions":[],"countries_distinct_count":4,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5100362614"],"corresponding_institution_ids":["https://openalex.org/I20231570"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.22841531,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"48","issue":"1","first_page":"512","last_page":"526"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10531","display_name":"Advanced Vision and Imaging","score":0.998199999332428,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10531","display_name":"Advanced Vision and Imaging","score":0.998199999332428,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10914","display_name":"Tactile and Sensory Interactions","score":0.9962000250816345,"subfield":{"id":"https://openalex.org/subfields/2805","display_name":"Cognitive Neuroscience"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T11687","display_name":"Teleoperation and Haptic Systems","score":0.9951000213623047,"subfield":{"id":"https://openalex.org/subfields/2210","display_name":"Mechanical Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.7735999822616577},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.6917999982833862},{"id":"https://openalex.org/keywords/pose","display_name":"Pose","score":0.6255000233650208},{"id":"https://openalex.org/keywords/hourglass","display_name":"Hourglass","score":0.6195999979972839},{"id":"https://openalex.org/keywords/redundancy","display_name":"Redundancy (engineering)","score":0.524399995803833},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.33809998631477356},{"id":"https://openalex.org/keywords/pruning","display_name":"Pruning","score":0.3257000148296356}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8454999923706055},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.7735999822616577},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.6917999982833862},{"id":"https://openalex.org/C52102323","wikidata":"https://www.wikidata.org/wiki/Q1671968","display_name":"Pose","level":2,"score":0.6255000233650208},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6244999766349792},{"id":"https://openalex.org/C127532173","wikidata":"https://www.wikidata.org/wiki/Q179904","display_name":"Hourglass","level":2,"score":0.6195999979972839},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.5307999849319458},{"id":"https://openalex.org/C152124472","wikidata":"https://www.wikidata.org/wiki/Q1204361","display_name":"Redundancy (engineering)","level":2,"score":0.524399995803833},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.33809998631477356},{"id":"https://openalex.org/C108010975","wikidata":"https://www.wikidata.org/wiki/Q500094","display_name":"Pruning","level":2,"score":0.3257000148296356},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.3100999891757965},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.29760000109672546},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.29319998621940613},{"id":"https://openalex.org/C3020199158","wikidata":"https://www.wikidata.org/wiki/Q210521","display_name":"High resolution","level":2,"score":0.27959999442100525},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.25920000672340393},{"id":"https://openalex.org/C2776449333","wikidata":"https://www.wikidata.org/wiki/Q7928781","display_name":"View synthesis","level":3,"score":0.2572999894618988}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1109/tpami.2025.3608284","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpami.2025.3608284","pdf_url":null,"source":{"id":"https://openalex.org/S199944782","display_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","issn_l":"0162-8828","issn":["0162-8828","1939-3539","2160-9292"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320439","host_organization_name":"IEEE Computer Society","host_organization_lineage":["https://openalex.org/P4310320439","https://openalex.org/P4310319808"],"host_organization_lineage_names":["IEEE Computer Society","Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Pattern Analysis and Machine Intelligence","raw_type":"journal-article"},{"id":"pmid:40928906","is_oa":false,"landing_page_url":"https://pubmed.ncbi.nlm.nih.gov/40928906","pdf_url":null,"source":{"id":"https://openalex.org/S4306525036","display_name":"PubMed","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE transactions on pattern analysis and machine intelligence","raw_type":null},{"id":"pmh:oai:iris.unitn.it:11572/469011","is_oa":true,"landing_page_url":"https://hdl.handle.net/11572/469011","pdf_url":null,"source":{"id":"https://openalex.org/S4377196320","display_name":"Iris (University of Trento)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I193223587","host_organization_name":"University of Trento","host_organization_lineage":["https://openalex.org/I193223587"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"info:eu-repo/semantics/article"}],"best_oa_location":{"id":"pmh:oai:iris.unitn.it:11572/469011","is_oa":true,"landing_page_url":"https://hdl.handle.net/11572/469011","pdf_url":null,"source":{"id":"https://openalex.org/S4377196320","display_name":"Iris (University of Trento)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I193223587","host_organization_name":"University of Trento","host_organization_lineage":["https://openalex.org/I193223587"],"host_organization_lineage_names":[],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"info:eu-repo/semantics/article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G4935777162","display_name":null,"funder_award_id":"62373009","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https://openalex.org/G7230235002","display_name":null,"funder_award_id":"R24I6IR138","funder_id":"https://openalex.org/F4320320696","funder_display_name":"Agency for Science, Technology and Research"}],"funders":[{"id":"https://openalex.org/F4320320696","display_name":"Agency for Science, Technology and Research","ror":"https://ror.org/036wvzt09"},{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F7311471023","display_name":"NextGenerationEU","ror":null}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2772917594","https://openalex.org/W2036807459","https://openalex.org/W2058170566","https://openalex.org/W2755342338","https://openalex.org/W2166024367","https://openalex.org/W3116076068","https://openalex.org/W2229312674","https://openalex.org/W2951359407","https://openalex.org/W2079911747","https://openalex.org/W1969923398"],"abstract_inverted_index":{"Transformers":[0],"have":[1],"been":[2],"successfully":[3],"applied":[4],"in":[5,72,77],"the":[6,15,78,85,115,123,129,134,138,181,211,216],"field":[7],"of":[8,19,62,117,192,215],"video-based":[9],"3D":[10,49],"human":[11,50],"pose":[12,22,51,60,75,183,190],"estimation.":[13],"However,":[14],"high":[16,198],"computational":[17],"costs":[18],"these":[20],"video":[21,118],"transformers":[23],"(VPTs)":[24],"make":[25],"them":[26],"impractical":[27],"on":[28,128,159,205],"resource-constrained":[29],"devices.":[30],"In":[31,174],"this":[32],"paper,":[33],"we":[34],"present":[35],"a":[36,73,95,101,109,188],"hierarchical":[37],"plug-and-play":[38],"pruning-and-recovering":[39],"framework,":[40],"called":[41],"Hierarchical":[42],"Hourglass":[43],"Tokenizer":[44],"(H<sub>2</sub>OT),":[45],"for":[46,143],"efficient":[47],"transformer-based":[48],"estimation":[52,201],"from":[53],"videos.":[54],"H<sub>2</sub>OT":[55,177],"begins":[56],"with":[57,67,90],"progressively":[58],"pruning":[59,170],"tokens":[61,76,112,191],"redundant":[63],"frames":[64,194],"and":[65,82,100,162,171,187,200,213],"ends":[66],"recovering":[68],"full-length":[69,140],"sequences,":[70],"resulting":[71],"few":[74,110,189],"intermediate":[79],"transformer":[80],"blocks":[81],"thus":[83],"improving":[84],"model":[86],"efficiency.":[87],"It":[88],"works":[89],"two":[91],"key":[92],"modules,":[93],"namely,":[94],"Token":[96,102],"Pruning":[97],"Module":[98,104],"(TPM)":[99],"Recovering":[103],"(TRM).":[105],"TPM":[106],"dynamically":[107],"selects":[108],"representative":[111,193],"to":[113,137],"eliminate":[114],"redundancy":[116],"frames,":[119],"while":[120,165],"TRM":[121],"restores":[122],"detailed":[124],"spatio-temporal":[125],"information":[126],"based":[127],"selected":[130],"tokens,":[131],"thereby":[132],"expanding":[133],"network":[135],"output":[136],"original":[139],"temporal":[141],"resolution":[142],"fast":[144],"inference.":[145],"Our":[146],"method":[147],"is":[148,185],"general-purpose:":[149],"it":[150],"can":[151,195],"be":[152],"easily":[153],"incorporated":[154],"into":[155],"common":[156],"VPT":[157],"models":[158],"both":[160,197,210],"seq2seq":[161],"seq2frame":[163],"pipelines":[164],"effectively":[166],"accommodating":[167],"different":[168],"token":[169],"recovery":[172],"strategies.":[173],"addition,":[175],"our":[176],"reveals":[178],"that":[179],"maintaining":[180],"full":[182],"sequence":[184],"unnecessary,":[186],"achieve":[196],"efficiency":[199,214],"accuracy.":[202],"Extensive":[203],"experiments":[204],"multiple":[206],"benchmark":[207],"datasets":[208],"demonstrate":[209],"effectiveness":[212],"proposed":[217],"method.":[218]},"counts_by_year":[],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-10-10T00:00:00"}
