{"id":"https://openalex.org/W7160519668","doi":"https://doi.org/10.48550/arxiv.2605.05057","title":"ScriptHOI: Learning Scripted State Transitions for Open-Vocabulary Human-Object Interaction Detection","display_name":"ScriptHOI: Learning Scripted State Transitions for Open-Vocabulary Human-Object Interaction Detection","publication_year":2026,"publication_date":"2026-05-06","ids":{"openalex":"https://openalex.org/W7160519668","doi":"https://doi.org/10.48550/arxiv.2605.05057"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.05057","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.05057","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.05057","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5049946115","display_name":"Mai Nguyen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Nguyen, Minh Anh","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135635304","display_name":"Quang Huy Tran","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tran, Quang Huy","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135343905","display_name":"B\u1ea3o Ng\u1ecdc L\u00ea","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Le, Bao Ngoc","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134668971","display_name":"Suiyang Guang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guang, SuiYang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5025005935","display_name":"Tuan Kiet Pham","orcid":"https://orcid.org/0009-0009-6661-7138"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pham, Tuan Kiet","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5135614075","display_name":"Linh Chi Vo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Vo, Linh Chi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9549000263214111,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9549000263214111,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10653","display_name":"Robot Manipulation and Learning","score":0.013500000350177288,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.004800000227987766,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/phrase","display_name":"Phrase","score":0.6243000030517578},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.58160001039505},{"id":"https://openalex.org/keywords/class","display_name":"Class (philosophy)","score":0.5644999742507935},{"id":"https://openalex.org/keywords/affordance","display_name":"Affordance","score":0.5375000238418579},{"id":"https://openalex.org/keywords/state","display_name":"State (computer science)","score":0.5171999931335449},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.47760000824928284},{"id":"https://openalex.org/keywords/counterfactual-thinking","display_name":"Counterfactual thinking","score":0.41760000586509705},{"id":"https://openalex.org/keywords/interval","display_name":"Interval (graph theory)","score":0.4058000147342682}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7531999945640564},{"id":"https://openalex.org/C2776224158","wikidata":"https://www.wikidata.org/wiki/Q187931","display_name":"Phrase","level":2,"score":0.6243000030517578},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.58160001039505},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.569599986076355},{"id":"https://openalex.org/C2777212361","wikidata":"https://www.wikidata.org/wiki/Q5127848","display_name":"Class (philosophy)","level":2,"score":0.5644999742507935},{"id":"https://openalex.org/C194995250","wikidata":"https://www.wikidata.org/wiki/Q531136","display_name":"Affordance","level":2,"score":0.5375000238418579},{"id":"https://openalex.org/C48103436","wikidata":"https://www.wikidata.org/wiki/Q599031","display_name":"State (computer science)","level":2,"score":0.5171999931335449},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.47760000824928284},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4603999853134155},{"id":"https://openalex.org/C108650721","wikidata":"https://www.wikidata.org/wiki/Q1783253","display_name":"Counterfactual thinking","level":2,"score":0.41760000586509705},{"id":"https://openalex.org/C2778067643","wikidata":"https://www.wikidata.org/wiki/Q166507","display_name":"Interval (graph theory)","level":2,"score":0.4058000147342682},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.36550000309944153},{"id":"https://openalex.org/C150899416","wikidata":"https://www.wikidata.org/wiki/Q1820378","display_name":"Transfer of learning","level":2,"score":0.34779998660087585},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.33309999108314514},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3230000138282776},{"id":"https://openalex.org/C2776175482","wikidata":"https://www.wikidata.org/wiki/Q1195816","display_name":"Transfer (computing)","level":2,"score":0.31630000472068787},{"id":"https://openalex.org/C2776502983","wikidata":"https://www.wikidata.org/wiki/Q690182","display_name":"Contrast (vision)","level":2,"score":0.30329999327659607},{"id":"https://openalex.org/C94915269","wikidata":"https://www.wikidata.org/wiki/Q1834857","display_name":"Detector","level":2,"score":0.295199990272522},{"id":"https://openalex.org/C12713177","wikidata":"https://www.wikidata.org/wiki/Q1900281","display_name":"Perspective (graphical)","level":2,"score":0.2825999855995178},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.26249998807907104},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.26170000433921814},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.2533000111579895}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.05057","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.05057","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.05057","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.05057","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.6467233896255493,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Open-vocabulary":[0],"human-object":[1,27,124],"interaction":[2,7,87,215],"(HOI)":[3],"detection":[4],"requires":[5],"recognizing":[6],"phrases":[8],"that":[9,84,209],"may":[10,49],"not":[11],"appear":[12],"as":[13,89,100],"annotated":[14],"categories":[15],"during":[16],"training.":[17],"Recent":[18],"vision-language":[19],"HOI":[20,145,206],"detectors":[21],"improve":[22],"semantic":[23],"transfer":[24],"by":[25,38],"matching":[26],"features":[28],"with":[29,175],"text":[30],"embeddings,":[31],"but":[32,162],"their":[33],"predictions":[34],"are":[35],"often":[36],"dominated":[37],"object":[39,72],"affordance":[40],"and":[41,59,71,114,130,138,151,178,204,213],"phrase-level":[42],"co-occurrence.":[43],"As":[44],"a":[45,47,57,60,81,90,98,101,131],"result,":[46],"model":[48],"predict":[50],"\\textit{cut":[51],"cake}":[52],"from":[53],"the":[54,65,76],"presence":[55],"of":[56,183],"knife":[58],"cake":[61],"without":[62],"verifying":[63],"whether":[64],"hand,":[66],"tool,":[67],"target,":[68],"contact":[69],"pattern,":[70],"state":[73,93,119,128],"jointly":[74],"support":[75],"action.":[77],"We":[78],"propose":[79],"\\textbf{ScriptHOI},":[80],"structured":[82],"framework":[83],"represents":[85],"each":[86,122],"phrase":[88,99],"soft":[91],"scripted":[92],"transition.":[94],"Rather":[95],"than":[96],"treating":[97],"single":[102],"class":[103],"token,":[104],"ScriptHOI":[105,210],"decomposes":[106],"it":[107],"into":[108,126],"body-role,":[109],"contact,":[110],"geometry,":[111],"affordance,":[112],"motion,":[113],"object-state":[115],"slots.":[116],"A":[117,187],"visual":[118,149],"tokenizer":[120],"parses":[121],"detected":[123],"pair":[125],"corresponding":[127],"tokens,":[129],"slot-wise":[132],"matcher":[133],"estimates":[134],"both":[135],"script":[136,139,189,194],"coverage":[137],"conflict.":[140],"These":[141],"two":[142],"quantities":[143],"calibrate":[144],"logits,":[146],"expose":[147],"missing":[148],"evidence,":[150],"provide":[152],"training":[153],"constraints":[154],"for":[155],"incomplete":[156],"annotations.":[157],"To":[158],"avoid":[159],"suppressing":[160],"valid":[161],"unannotated":[163,173],"interactions,":[164],"we":[165],"further":[166],"introduce":[167],"interval":[168],"partial-label":[169],"learning,":[170],"which":[171],"constrains":[172],"candidates":[174],"script-derived":[176],"lower":[177],"upper":[179],"probability":[180],"bounds":[181],"instead":[182],"assigning":[184],"closed-world":[185],"negatives.":[186],"counterfactual":[188],"contrast":[190],"loss":[191],"swaps":[192],"individual":[193],"slots":[195],"to":[196],"discourage":[197],"object-only":[198],"shortcuts.":[199],"Experiments":[200],"on":[201],"HICO-DET,":[202],"V-COCO,":[203],"open-vocabulary":[205],"splits":[207],"show":[208],"improves":[211],"rare":[212],"unseen":[214],"recognition":[216],"while":[217],"substantially":[218],"reducing":[219],"affordance-conflict":[220],"false":[221],"positives.":[222]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-08T00:00:00"}
