{"id":"https://openalex.org/W7138154496","doi":"https://doi.org/10.1609/aaai.v40i16.38318","title":"Learning Procedural-Aware Video Representations Through State-Grounded Hierarchy Unfolding","display_name":"Learning Procedural-Aware Video Representations Through State-Grounded Hierarchy Unfolding","publication_year":2026,"publication_date":"2026-03-14","ids":{"openalex":"https://openalex.org/W7138154496","doi":"https://doi.org/10.1609/aaai.v40i16.38318"},"language":null,"primary_location":{"id":"doi:10.1609/aaai.v40i16.38318","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i16.38318","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://doi.org/10.1609/aaai.v40i16.38318","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5129745878","display_name":"Jinghan Zhao","orcid":null},"institutions":[{"id":"https://openalex.org/I82880672","display_name":"Beihang University","ror":"https://ror.org/00wk2mp56","country_code":"CN","type":"education","lineage":["https://openalex.org/I82880672"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Jinghan Zhao","raw_affiliation_strings":["State Key Laboratory of VR Technology and Systems, School of CSE, Beihang University"],"affiliations":[{"raw_affiliation_string":"State Key Laboratory of VR Technology and Systems, School of CSE, Beihang University","institution_ids":["https://openalex.org/I82880672"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129751349","display_name":"Yifei Huang","orcid":null},"institutions":[{"id":"https://openalex.org/I74801974","display_name":"The University of Tokyo","ror":"https://ror.org/057zh3y96","country_code":"JP","type":"education","lineage":["https://openalex.org/I74801974"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Yifei Huang","raw_affiliation_strings":["The University of Tokyo, Japan"],"affiliations":[{"raw_affiliation_string":"The University of Tokyo, Japan","institution_ids":["https://openalex.org/I74801974"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5129748575","display_name":"Feng Lu","orcid":null},"institutions":[{"id":"https://openalex.org/I82880672","display_name":"Beihang University","ror":"https://ror.org/00wk2mp56","country_code":"CN","type":"education","lineage":["https://openalex.org/I82880672"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Feng Lu","raw_affiliation_strings":["State Key Laboratory of VR Technology and Systems, School of CSE, Beihang University"],"affiliations":[{"raw_affiliation_string":"State Key Laboratory of VR Technology and Systems, School of CSE, Beihang University","institution_ids":["https://openalex.org/I82880672"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5129745878"],"corresponding_institution_ids":["https://openalex.org/I82880672"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.47164179,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"40","issue":"16","first_page":"13172","last_page":"13180"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.7778000235557556,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.7778000235557556,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.08349999785423279,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.061900001019239426,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.6916000247001648},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.6481999754905701},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.6395999789237976},{"id":"https://openalex.org/keywords/hierarchy","display_name":"Hierarchy","score":0.6234999895095825},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.531000018119812},{"id":"https://openalex.org/keywords/forcing","display_name":"Forcing (mathematics)","score":0.46619999408721924},{"id":"https://openalex.org/keywords/baseline","display_name":"Baseline (sea)","score":0.46050000190734863},{"id":"https://openalex.org/keywords/state","display_name":"State (computer science)","score":0.42890000343322754},{"id":"https://openalex.org/keywords/task-analysis","display_name":"Task analysis","score":0.4244999885559082}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8382999897003174},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.6916000247001648},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.6481999754905701},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.6395999789237976},{"id":"https://openalex.org/C31170391","wikidata":"https://www.wikidata.org/wiki/Q188619","display_name":"Hierarchy","level":2,"score":0.6234999895095825},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5598000288009644},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.531000018119812},{"id":"https://openalex.org/C197115733","wikidata":"https://www.wikidata.org/wiki/Q1003136","display_name":"Forcing (mathematics)","level":2,"score":0.46619999408721924},{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.46050000190734863},{"id":"https://openalex.org/C48103436","wikidata":"https://www.wikidata.org/wiki/Q599031","display_name":"State (computer science)","level":2,"score":0.42890000343322754},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.4244999885559082},{"id":"https://openalex.org/C18555067","wikidata":"https://www.wikidata.org/wiki/Q8375051","display_name":"Joint (building)","level":2,"score":0.4230000078678131},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.40860000252723694},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.3321000039577484},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3271999955177307},{"id":"https://openalex.org/C2779439875","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Natural language understanding","level":3,"score":0.32100000977516174},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.3208000063896179},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.32010000944137573},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.31150001287460327},{"id":"https://openalex.org/C112972136","wikidata":"https://www.wikidata.org/wiki/Q7595718","display_name":"Stability (learning theory)","level":2,"score":0.3093999922275543},{"id":"https://openalex.org/C22367795","wikidata":"https://www.wikidata.org/wiki/Q7625208","display_name":"Structured prediction","level":2,"score":0.28999999165534973},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.2856999933719635},{"id":"https://openalex.org/C2779227376","wikidata":"https://www.wikidata.org/wiki/Q6505497","display_name":"Layer (electronics)","level":2,"score":0.28450000286102295},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.2671999931335449},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.2583000063896179},{"id":"https://openalex.org/C32848918","wikidata":"https://www.wikidata.org/wiki/Q845789","display_name":"Observable","level":2,"score":0.2567000091075897},{"id":"https://openalex.org/C104114177","wikidata":"https://www.wikidata.org/wiki/Q79782","display_name":"Motion (physics)","level":2,"score":0.2563999891281128},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.2558000087738037},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.2515000104904175}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1609/aaai.v40i16.38318","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i16.38318","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1609/aaai.v40i16.38318","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i16.38318","pdf_url":null,"source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"sustainable_development_goals":[{"score":0.6361529231071472,"display_name":"Peace, Justice and strong institutions","id":"https://metadata.un.org/sdg/16"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Learning":[0],"procedural-aware":[1],"video":[2,43],"representations":[3,142],"is":[4,189],"a":[5,60,84,94,104,127,190],"key":[6,191],"step":[7,36,176,180],"towards":[8],"building":[9],"agents":[10],"that":[11,88,115,131,163,185],"can":[12,96],"reason":[13],"about":[14],"and":[15,35,54,159,178],"execute":[16],"complex":[17],"tasks.":[18,153,198],"Existing":[19],"methods":[20],"typically":[21],"address":[22,72],"this":[23,101,123],"problem":[24],"by":[25],"aligning":[26],"visual":[27,69],"content":[28],"with":[29,63,148],"textual":[30,78],"descriptions":[31,56],"at":[32],"the":[33,64,133,137,157,215],"task":[34,174],"levels":[37],"to":[38,47,58,92,139],"inject":[39],"procedural":[40],"semantics":[41],"into":[42],"representations.":[44],"However,":[45],"due":[46],"their":[48],"high":[49],"level":[50],"of":[51,80,193],"abstraction,":[52],"\"task\"":[53],"\"step\"":[55],"fail":[57],"form":[59],"robust":[61],"alignment":[62],"concrete,":[65],"observable":[66,119],"details":[67],"in":[68,103,143],"data.":[70],"To":[71,121],"this,":[73],"we":[74,125],"introduce":[75],"\"states\",":[76],"i.e.,":[77],"snapshots":[79],"object":[81],"configurations,":[82],"as":[83,211],"visually-grounded":[85],"semantic":[86],"layer":[87],"anchors":[89],"abstract":[90],"procedures":[91],"what":[93],"model":[95,138],"actually":[97],"see.":[98],"We":[99],"formalize":[100],"insight":[102],"novel":[105],"Task-Step-State":[106],"(TSS)":[107],"framework,":[108],"where":[109],"tasks":[110],"are":[111],"achieved":[112],"via":[113],"steps":[114,149],"drive":[116],"transitions":[117],"between":[118],"states.":[120],"enforce":[122],"structure,":[124],"propose":[126],"progressive":[128,201],"pre-training":[129],"strategy":[130,203],"unfolds":[132],"TSS":[134],"hierarchy,":[135],"forcing":[136],"first":[140],"ground":[141],"states":[144],"before":[145],"associating":[146],"them":[147],"and,":[150],"ultimately,":[151],"high-level":[152],"Extensive":[154],"experiments":[155],"on":[156,169],"COIN":[158],"CrossTask":[160],"datasets":[161],"show":[162,184],"our":[164,200],"method":[165],"outperforms":[166],"baseline":[167],"models":[168],"multiple":[170],"downstream":[171],"tasks,":[172],"including":[173],"recognition,":[175,177],"next":[179],"prediction.":[181],"Ablation":[182],"studies":[183],"introducing":[186],"state":[187],"supervision":[188],"driver":[192],"performance":[194],"gains":[195],"across":[196],"all":[197],"Additionally,":[199],"pretraining":[202],"proves":[204],"more":[205],"effective":[206],"than":[207],"standard":[208],"joint":[209],"training,":[210],"it":[212],"better":[213],"enforces":[214],"intended":[216],"hierarchical":[217],"structure.":[218]},"counts_by_year":[],"updated_date":"2026-03-20T20:47:17.329874","created_date":"2026-03-18T00:00:00"}
