{"id":"https://openalex.org/W4304080227","doi":"https://doi.org/10.1145/3503161.3547871","title":"IVT: An End-to-End Instance-guided Video Transformer for 3D Pose Estimation","display_name":"IVT: An End-to-End Instance-guided Video Transformer for 3D Pose Estimation","publication_year":2022,"publication_date":"2022-10-10","ids":{"openalex":"https://openalex.org/W4304080227","doi":"https://doi.org/10.1145/3503161.3547871"},"language":"en","primary_location":{"id":"doi:10.1145/3503161.3547871","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3503161.3547871","pdf_url":null,"source":{"id":"https://openalex.org/S4363608757","display_name":"Proceedings of the 30th ACM International Conference on Multimedia","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 30th ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5086179961","display_name":"Zhongwei Qiu","orcid":null},"institutions":[{"id":"https://openalex.org/I92403157","display_name":"University of Science and Technology Beijing","ror":"https://ror.org/02egmk993","country_code":"CN","type":"education","lineage":["https://openalex.org/I92403157"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhongwei Qiu","raw_affiliation_strings":["University of Science and Technology Beijing, Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Science and Technology Beijing, Beijing, China","institution_ids":["https://openalex.org/I92403157"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5071036534","display_name":"Qiansheng Yang","orcid":null},"institutions":[{"id":"https://openalex.org/I98301712","display_name":"Baidu (China)","ror":"https://ror.org/03vs3wt56","country_code":"CN","type":"company","lineage":["https://openalex.org/I98301712"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Qiansheng Yang","raw_affiliation_strings":["Baidu, Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Baidu, Beijing, China","institution_ids":["https://openalex.org/I98301712"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100696824","display_name":"Jian Wang","orcid":"https://orcid.org/0000-0002-4316-932X"},"institutions":[{"id":"https://openalex.org/I98301712","display_name":"Baidu (China)","ror":"https://ror.org/03vs3wt56","country_code":"CN","type":"company","lineage":["https://openalex.org/I98301712"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jian Wang","raw_affiliation_strings":["Baidu, Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Baidu, Beijing, China","institution_ids":["https://openalex.org/I98301712"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5016681542","display_name":"Dongmei Fu","orcid":"https://orcid.org/0000-0003-3918-9448"},"institutions":[{"id":"https://openalex.org/I92403157","display_name":"University of Science and Technology Beijing","ror":"https://ror.org/02egmk993","country_code":"CN","type":"education","lineage":["https://openalex.org/I92403157"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Dongmei Fu","raw_affiliation_strings":["University of Science and Technology Beijing, Beijing, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Science and Technology Beijing, Beijing, China","institution_ids":["https://openalex.org/I92403157"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.3539,"has_fulltext":false,"cited_by_count":6,"citation_normalized_percentile":{"value":0.67154114,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"6174","last_page":"6182"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10531","display_name":"Advanced Vision and Imaging","score":0.9908999800682068,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11398","display_name":"Hand Gesture Recognition Systems","score":0.9908000230789185,"subfield":{"id":"https://openalex.org/subfields/1709","display_name":"Human-Computer Interaction"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8355427980422974},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.7147820591926575},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.6776925325393677},{"id":"https://openalex.org/keywords/pose","display_name":"Pose","score":0.6313852071762085},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.5746843218803406},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.5540329217910767},{"id":"https://openalex.org/keywords/action-recognition","display_name":"Action recognition","score":0.478549063205719},{"id":"https://openalex.org/keywords/feature-extraction","display_name":"Feature extraction","score":0.47788679599761963},{"id":"https://openalex.org/keywords/end-to-end-principle","display_name":"End-to-end principle","score":0.4252471923828125},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.41929566860198975},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.07814353704452515},{"id":"https://openalex.org/keywords/voltage","display_name":"Voltage","score":0.07712355256080627}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8355427980422974},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.7147820591926575},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.6776925325393677},{"id":"https://openalex.org/C52102323","wikidata":"https://www.wikidata.org/wiki/Q1671968","display_name":"Pose","level":2,"score":0.6313852071762085},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.5746843218803406},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.5540329217910767},{"id":"https://openalex.org/C2987834672","wikidata":"https://www.wikidata.org/wiki/Q4677630","display_name":"Action recognition","level":3,"score":0.478549063205719},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.47788679599761963},{"id":"https://openalex.org/C74296488","wikidata":"https://www.wikidata.org/wiki/Q2527392","display_name":"End-to-end principle","level":2,"score":0.4252471923828125},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.41929566860198975},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.07814353704452515},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.07712355256080627},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.0},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.0},{"id":"https://openalex.org/C2777212361","wikidata":"https://www.wikidata.org/wiki/Q5127848","display_name":"Class (philosophy)","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3503161.3547871","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3503161.3547871","pdf_url":null,"source":{"id":"https://openalex.org/S4363608757","display_name":"Proceedings of the 30th ACM International Conference on Multimedia","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 30th ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":37,"referenced_works":["https://openalex.org/W1861492603","https://openalex.org/W2032618685","https://openalex.org/W2101032778","https://openalex.org/W2548527721","https://openalex.org/W2771965516","https://openalex.org/W2795089319","https://openalex.org/W2798411580","https://openalex.org/W2809890486","https://openalex.org/W2895748257","https://openalex.org/W2956061722","https://openalex.org/W2962896489","https://openalex.org/W2965348796","https://openalex.org/W2972662547","https://openalex.org/W2975420824","https://openalex.org/W2981637078","https://openalex.org/W2990270790","https://openalex.org/W2997002052","https://openalex.org/W2998027150","https://openalex.org/W3014641072","https://openalex.org/W3034448411","https://openalex.org/W3035129432","https://openalex.org/W3035225512","https://openalex.org/W3106838237","https://openalex.org/W3106882556","https://openalex.org/W3107944344","https://openalex.org/W3116592456","https://openalex.org/W3124503747","https://openalex.org/W3126541466","https://openalex.org/W3136525061","https://openalex.org/W3175199633","https://openalex.org/W3176892444","https://openalex.org/W3189677397","https://openalex.org/W3205717647","https://openalex.org/W3206384369","https://openalex.org/W3207212285","https://openalex.org/W4214619583","https://openalex.org/W4312925317"],"related_works":["https://openalex.org/W2151749779","https://openalex.org/W4388335561","https://openalex.org/W2970530566","https://openalex.org/W4288261899","https://openalex.org/W4307309205","https://openalex.org/W2967478618","https://openalex.org/W4304700937","https://openalex.org/W3135266094","https://openalex.org/W4312069176","https://openalex.org/W4283332100"],"abstract_inverted_index":{"Video":[0,63],"3D":[1,9,79,107,169,186],"human":[2,12,111,131],"pose":[3,49,108,187],"estimation":[4,188],"aims":[5],"to":[6,133,159],"localize":[7],"the":[8,22,32,38,45,56,106,124,130,134,161,168,192],"coordinates":[10],"of":[11,47,95,104,109,126,171],"joints":[13],"from":[14,25,73,82,129,176],"videos.":[15],"Recent":[16],"transformer-based":[17],"approaches":[18],"focus":[19],"on":[20,183],"capturing":[21],"spatiotemporal":[23,69,147],"information":[24,72,118],"sequential":[26],"2D":[27,48],"poses,":[28],"which":[29,66],"cannot":[30],"model":[31],"contextual":[33,70,148],"depth":[34,40,71],"feature":[35],"effectively":[36,76],"since":[37,119],"visual":[39,74],"features":[41,75],"are":[42,121,141,174],"lost":[43],"in":[44,102],"step":[46],"estimation.":[50],"In":[51,85,150],"this":[52],"paper,":[53],"we":[54,87,152],"simplify":[55],"paradigm":[57],"into":[58,143],"an":[59],"end-to-end":[60],"framework,":[61],"Instance-guided":[62],"Transformer":[64],"(IVT),":[65],"enables":[67],"learning":[68,146],"and":[77,98],"predicts":[78],"poses":[80,170],"directly":[81],"video":[83,90],"frames.":[84],"particular,":[86],"firstly":[88],"formulate":[89],"frames":[91],"as":[92],"a":[93,110,154],"series":[94],"instance-guided":[96,156,177],"tokens":[97,114,140,178],"each":[99,172],"token":[100],"is":[101],"charge":[103],"predicting":[105],"instance.":[112],"These":[113],"contain":[115],"body":[116,136],"structure":[117],"they":[120],"extracted":[122],"by":[123,179],"guidance":[125],"joint":[127],"offsets":[128],"center":[132],"corresponding":[135],"joints.":[137],"Then,":[138],"these":[139],"sent":[142],"IVT":[144,194],"for":[145],"depth.":[149],"addition,":[151],"propose":[153],"cross-scale":[155],"attention":[157],"mechanism":[158],"handle":[160],"variational":[162],"scales":[163],"among":[164],"multiple":[165],"persons.":[166],"Finally,":[167],"person":[173],"decoded":[175],"coordinate":[180],"regression.":[181],"Experiments":[182],"three":[184],"widely-used":[185],"benchmarks":[189],"show":[190],"that":[191],"proposed":[193],"achieves":[195],"state-of-the-art":[196],"performances.":[197]},"counts_by_year":[{"year":2025,"cited_by_count":3},{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":2}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
