{"id":"https://openalex.org/W7154587630","doi":"https://doi.org/10.48550/arxiv.2604.13448","title":"A Study of Failure Modes in Two-Stage Human-Object Interaction Detection","display_name":"A Study of Failure Modes in Two-Stage Human-Object Interaction Detection","publication_year":2026,"publication_date":"2026-04-15","ids":{"openalex":"https://openalex.org/W7154587630","doi":"https://doi.org/10.48550/arxiv.2604.13448"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.13448","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.13448","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.13448","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5057460156","display_name":"Lemeng Wang","orcid":"https://orcid.org/0009-0003-7043-4023"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Wang, Lemeng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102519894","display_name":"Qinqian Lei","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lei, Qinqian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5055176657","display_name":"Vidhi Bakshi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bakshi, Vidhi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5104189816","display_name":"Daniel Yi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yi, Daniel","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133740450","display_name":"Yifan Liu","orcid":"https://orcid.org/0009-0002-4026-672X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Yifan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133807105","display_name":"Jiacheng Hou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hou, Jiacheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133736826","display_name":"Asher Seng Hao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hao, Asher Seng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5066572795","display_name":"Zheda Mai","orcid":"https://orcid.org/0000-0003-3703-563X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mai, Zheda","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133782028","display_name":"Wei-Lun Chao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chao, Wei-Lun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133774110","display_name":"Robby T. Tan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tan, Robby T.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133736546","display_name":"Bo Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Bo","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":11,"corresponding_author_ids":["https://openalex.org/A5057460156"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9483000040054321,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9483000040054321,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11605","display_name":"Visual Attention and Saliency Detection","score":0.012600000016391277,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.007400000002235174,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.6675999760627747},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.558899998664856},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.49070000648498535},{"id":"https://openalex.org/keywords/object-detection","display_name":"Object detection","score":0.35269999504089355},{"id":"https://openalex.org/keywords/visual-reasoning","display_name":"Visual reasoning","score":0.32440000772476196},{"id":"https://openalex.org/keywords/multiple-models","display_name":"Multiple Models","score":0.3188000023365021}],"concepts":[{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.6675999760627747},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6625999808311462},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.558899998664856},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5407000184059143},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.49070000648498535},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4708999991416931},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.35269999504089355},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.32440000772476196},{"id":"https://openalex.org/C2779714256","wikidata":"https://www.wikidata.org/wiki/Q25305062","display_name":"Multiple Models","level":2,"score":0.3188000023365021},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.29089999198913574},{"id":"https://openalex.org/C12426560","wikidata":"https://www.wikidata.org/wiki/Q189569","display_name":"Basis (linear algebra)","level":2,"score":0.27959999442100525},{"id":"https://openalex.org/C66024118","wikidata":"https://www.wikidata.org/wiki/Q1122506","display_name":"Computational model","level":2,"score":0.2793000042438507},{"id":"https://openalex.org/C66283442","wikidata":"https://www.wikidata.org/wiki/Q1389268","display_name":"Failure mode and effects analysis","level":2,"score":0.272599995136261}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.13448","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.13448","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.13448","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.13448","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Human-object":[0],"interaction":[1,56],"(HOI)":[2],"detection":[3,83,95],"aims":[4],"to":[5,65,107,142,151],"detect":[6],"interactions":[7,131],"between":[8],"humans":[9],"and":[10,31,54,100,132,135,162,197],"objects":[11],"in":[12,48,203],"images.":[13],"While":[14],"recent":[15],"advances":[16],"have":[17],"improved":[18],"performance":[19,171],"on":[20,27],"existing":[21,122],"benchmarks,":[22],"their":[23,164],"evaluations":[24],"mainly":[25],"focus":[26],"overall":[28,169],"prediction":[29],"accuracy":[30],"provide":[32,188],"limited":[33],"insight":[34],"into":[35,96,191],"the":[36,68,77,192],"underlying":[37],"causes":[38],"of":[39,71,79,111,118,194],"model":[40,102,137],"failures.":[41],"In":[42,58],"particular,":[43],"modern":[44],"models":[45,156,196],"often":[46],"struggle":[47],"complex":[49],"scenes":[50],"involving":[51],"multiple":[52,97],"people":[53],"rare":[55],"combinations.":[57],"this":[59,185,204],"work,":[60],"we":[61,91],"present":[62],"a":[63,88,116],"study":[64,108,186],"better":[66],"understand":[67],"failure":[69,112,145],"modes":[70],"two-stage":[72],"HOI":[73,82,94,123,155,195],"models,":[74],"which":[75],"form":[76],"basis":[78],"many":[80],"current":[81],"approaches.":[84],"Rather":[85],"than":[86],"constructing":[87],"large-scale":[89],"benchmark,":[90],"instead":[92],"decompose":[93],"interpretable":[98],"perspectives":[99],"analyze":[101,136,152],"behavior":[103,138],"across":[104],"these":[105,140,154],"dimensions":[106],"different":[109,144,159],"types":[110],"patterns.":[113],"We":[114,182],"curate":[115],"subset":[117],"images":[119],"from":[120],"an":[121],"dataset":[124],"organized":[125],"by":[126],"human-object-interaction":[127],"configurations":[128,141],"(e.g.,":[129],"multi-person":[130],"object":[133],"sharing),":[134],"under":[139,158],"examine":[143],"modes.":[146],"This":[147],"design":[148],"allows":[149],"us":[150],"how":[153],"behave":[157],"scene":[160],"compositions":[161],"why":[163],"predictions":[165],"fail.":[166],"Importantly,":[167],"high":[168],"benchmark":[170],"does":[172],"not":[173],"necessarily":[174],"reflect":[175],"robust":[176],"visual":[177],"reasoning":[178],"about":[179],"human-object":[180],"relationships.":[181],"hope":[183],"that":[184],"can":[187],"useful":[189],"insights":[190],"limitations":[193],"offer":[198],"observations":[199],"for":[200],"future":[201],"research":[202],"area.":[205]},"counts_by_year":[],"updated_date":"2026-04-17T06:04:52.305304","created_date":"2026-04-17T00:00:00"}
