{"id":"https://openalex.org/W7164832842","doi":"https://doi.org/10.1145/3805622.3810614","title":"STeP-Net: A Spatio-Temporal Perception Network for Action Detection","display_name":"STeP-Net: A Spatio-Temporal Perception Network for Action Detection","publication_year":2026,"publication_date":"2026-06-15","ids":{"openalex":"https://openalex.org/W7164832842","doi":"https://doi.org/10.1145/3805622.3810614"},"language":null,"primary_location":{"id":"doi:10.1145/3805622.3810614","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810614","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3805622.3810614","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5138656189","display_name":"Kunfang Song","orcid":"https://orcid.org/0000-0002-8631-4240"},"institutions":[{"id":"https://openalex.org/I4210119942","display_name":"Wuhan Textile University","ror":"https://ror.org/02jgsf398","country_code":"CN","type":"education","lineage":["https://openalex.org/I4210119942"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Kunfang Song","raw_affiliation_strings":["School of Computer Science and Artificial Intelligence, Wuhan Textile University, Wuhan, China"],"raw_orcid":"https://orcid.org/0000-0002-8631-4240","affiliations":[{"raw_affiliation_string":"School of Computer Science and Artificial Intelligence, Wuhan Textile University, Wuhan, China","institution_ids":["https://openalex.org/I4210119942"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5138647195","display_name":"Guowei Yan","orcid":"https://orcid.org/0009-0000-3640-470X"},"institutions":[{"id":"https://openalex.org/I4210119942","display_name":"Wuhan Textile University","ror":"https://ror.org/02jgsf398","country_code":"CN","type":"education","lineage":["https://openalex.org/I4210119942"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Guowei Yan","raw_affiliation_strings":["School of Computer Science and Artificial Intelligence, Wuhan Textile University, Wuhan, China"],"raw_orcid":"https://orcid.org/0009-0000-3640-470X","affiliations":[{"raw_affiliation_string":"School of Computer Science and Artificial Intelligence, Wuhan Textile University, Wuhan, China","institution_ids":["https://openalex.org/I4210119942"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5015953715","display_name":"Jiaqing Wang","orcid":"https://orcid.org/0000-0002-0477-3516"},"institutions":[{"id":"https://openalex.org/I4210119942","display_name":"Wuhan Textile University","ror":"https://ror.org/02jgsf398","country_code":"CN","type":"education","lineage":["https://openalex.org/I4210119942"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jiaqing Wang","raw_affiliation_strings":["School of Computer Science and Artificial Intelligence, Wuhan Textile University, Wuhan, China"],"raw_orcid":"https://orcid.org/0009-0008-3796-6948","affiliations":[{"raw_affiliation_string":"School of Computer Science and Artificial Intelligence, Wuhan Textile University, Wuhan, China","institution_ids":["https://openalex.org/I4210119942"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120218404","display_name":"Shufen Ruan","orcid":"https://orcid.org/0009-0004-6445-7656"},"institutions":[{"id":"https://openalex.org/I4210119942","display_name":"Wuhan Textile University","ror":"https://ror.org/02jgsf398","country_code":"CN","type":"education","lineage":["https://openalex.org/I4210119942"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shufen Ruan","raw_affiliation_strings":["School of Mathematics and Statistics, Wuhan Textile University, Wuhan, China"],"raw_orcid":"https://orcid.org/0009-0004-6445-7656","affiliations":[{"raw_affiliation_string":"School of Mathematics and Statistics, Wuhan Textile University, Wuhan, China","institution_ids":["https://openalex.org/I4210119942"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5138673817","display_name":"Yanwen Wang","orcid":"https://orcid.org/0009-0002-3416-4356"},"institutions":[{"id":"https://openalex.org/I4210119942","display_name":"Wuhan Textile University","ror":"https://ror.org/02jgsf398","country_code":"CN","type":"education","lineage":["https://openalex.org/I4210119942"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yanwen Wang","raw_affiliation_strings":["School of Computer Science and Artificial Intelligence, Wuhan Textile University, Wuhan, China"],"raw_orcid":"https://orcid.org/0009-0002-3416-4356","affiliations":[{"raw_affiliation_string":"School of Computer Science and Artificial Intelligence, Wuhan Textile University, Wuhan, China","institution_ids":["https://openalex.org/I4210119942"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.93687934,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1102","last_page":"1110"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9463000297546387,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9463000297546387,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.010700000450015068,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12290","display_name":"Human Motion and Animation","score":0.00570000009611249,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.6915000081062317},{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.5170999765396118},{"id":"https://openalex.org/keywords/margin","display_name":"Margin (machine learning)","score":0.4966999888420105},{"id":"https://openalex.org/keywords/limiting","display_name":"Limiting","score":0.475600004196167},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.46230000257492065},{"id":"https://openalex.org/keywords/spatial-contextual-awareness","display_name":"Spatial contextual awareness","score":0.46230000257492065},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.44449999928474426},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.4268999993801117},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.4250999987125397}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7439000010490417},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.6915000081062317},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5788000226020813},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.5170999765396118},{"id":"https://openalex.org/C774472","wikidata":"https://www.wikidata.org/wiki/Q6760393","display_name":"Margin (machine learning)","level":2,"score":0.4966999888420105},{"id":"https://openalex.org/C188198153","wikidata":"https://www.wikidata.org/wiki/Q1613840","display_name":"Limiting","level":2,"score":0.475600004196167},{"id":"https://openalex.org/C64754055","wikidata":"https://www.wikidata.org/wiki/Q7574053","display_name":"Spatial contextual awareness","level":2,"score":0.46230000257492065},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.46230000257492065},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.44449999928474426},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.4268999993801117},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.4250999987125397},{"id":"https://openalex.org/C141353440","wikidata":"https://www.wikidata.org/wiki/Q182221","display_name":"Fuse (electrical)","level":2,"score":0.39899998903274536},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3806999921798706},{"id":"https://openalex.org/C77277458","wikidata":"https://www.wikidata.org/wiki/Q1969246","display_name":"Temporal database","level":2,"score":0.37059998512268066},{"id":"https://openalex.org/C159620131","wikidata":"https://www.wikidata.org/wiki/Q1938983","display_name":"Spatial analysis","level":2,"score":0.3686999976634979},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3433000147342682},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.3375000059604645},{"id":"https://openalex.org/C119666444","wikidata":"https://www.wikidata.org/wiki/Q5977280","display_name":"Temporal resolution","level":2,"score":0.32190001010894775},{"id":"https://openalex.org/C2776010242","wikidata":"https://www.wikidata.org/wiki/Q4677575","display_name":"Active perception","level":3,"score":0.31850001215934753},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.3010999858379364},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.30090001225471497},{"id":"https://openalex.org/C183322885","wikidata":"https://www.wikidata.org/wiki/Q17007702","display_name":"Context model","level":3,"score":0.295199990272522},{"id":"https://openalex.org/C9652623","wikidata":"https://www.wikidata.org/wiki/Q190109","display_name":"Field (mathematics)","level":2,"score":0.29339998960494995},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.2913999855518341},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.29100000858306885},{"id":"https://openalex.org/C2987834672","wikidata":"https://www.wikidata.org/wiki/Q4677630","display_name":"Action recognition","level":3,"score":0.26840001344680786},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.26649999618530273},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.2632000148296356},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.26269999146461487}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3805622.3810614","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810614","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3805622.3810614","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810614","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":36,"referenced_works":["https://openalex.org/W1522734439","https://openalex.org/W1923332106","https://openalex.org/W2342662179","https://openalex.org/W2618799552","https://openalex.org/W2883275382","https://openalex.org/W2947454602","https://openalex.org/W2955874753","https://openalex.org/W2962790054","https://openalex.org/W2963524571","https://openalex.org/W2963529931","https://openalex.org/W2990503944","https://openalex.org/W3034572008","https://openalex.org/W3096824106","https://openalex.org/W3109173645","https://openalex.org/W4214612132","https://openalex.org/W4214614183","https://openalex.org/W4312509322","https://openalex.org/W4312560592","https://openalex.org/W4319300079","https://openalex.org/W4380986300","https://openalex.org/W4386072300","https://openalex.org/W4388492046","https://openalex.org/W4390190620","https://openalex.org/W4390873033","https://openalex.org/W4391011752","https://openalex.org/W4393135371","https://openalex.org/W4400579958","https://openalex.org/W4401596686","https://openalex.org/W4402705429","https://openalex.org/W4405653032","https://openalex.org/W4408250502","https://openalex.org/W4409366837","https://openalex.org/W4409643763","https://openalex.org/W4412939856","https://openalex.org/W4414197656","https://openalex.org/W7133239857"],"related_works":[],"abstract_inverted_index":{"Spatio-temporal":[0],"action":[1],"detection":[2],"remains":[3],"a":[4,57,82,102,124,186],"challenging":[5],"task":[6],"due":[7],"to":[8,88,110,130,160],"the":[9,90,154,162,191],"inherent":[10],"coupling":[11],"between":[12],"precise":[13],"spatial":[14,31,94,104,114,119],"localization":[15],"and":[16,37,153,174,178],"effective":[17],"temporal":[18,35,42,99,126,133,138],"modeling.":[19],"Although":[20],"recent":[21],"methods":[22],"have":[23],"achieved":[24],"promising":[25],"performance,":[26],"they":[27],"often":[28],"suffer":[29],"from":[30],"feature":[32,84],"degradation":[33],"during":[34],"aggregation":[36],"insufficient":[38],"modeling":[39],"of":[40,46,92,140,169,188],"heterogeneous":[41,125],"dynamics":[43],"across":[44,197],"actions":[45],"varying":[47],"durations.":[48],"Moreover,":[49],"most":[50],"existing":[51],"approaches":[52],"fuse":[53],"spatio-temporal":[54,74],"information":[55],"in":[56,63],"coarse":[58],"manner,":[59],"limiting":[60],"their":[61],"robustness":[62],"complex":[64],"scenes.":[65],"To":[66],"address":[67],"these":[68],"challenges,":[69],"we":[70,80,122],"propose":[71,123],"an":[72],"efficient":[73],"perception":[75],"network,":[76],"termed":[77],"STeP-Net.":[78],"Specifically,":[79],"introduce":[81],"frequency-aware":[83],"reconstruction":[85],"module":[86,106,128],"(FARM)":[87],"mitigate":[89],"loss":[91],"fine-grained":[93],"details":[95],"caused":[96],"by":[97],"multi-scale":[98],"processing.":[100],"Meanwhile,":[101],"directional":[103],"context":[105],"(DSCM)":[107],"is":[108],"designed":[109],"explicitly":[111],"model":[112],"local-global":[113],"dependencies":[115,134],"for":[116],"more":[117],"robust":[118],"representation.":[120],"Furthermore,":[121],"reasoning":[127],"(HTRM)":[129],"capture":[131],"long-range":[132],"while":[135],"accommodating":[136],"diverse":[137],"patterns":[139],"different":[141],"actions.":[142],"Extensive":[143],"experiments":[144],"on":[145,171,176,183],"three":[146],"benchmark":[147],"datasets,":[148],"including":[149],"AVA":[150,172],"2.2,":[151],"UCF101-24,":[152],"self-constructed":[155],"AVA-E":[156],"dataset,":[157],"are":[158],"conducted":[159],"evaluate":[161],"proposed":[163],"method.":[164],"STeP-Net":[165],"achieves":[166],"average":[167,181],"mAPs":[168],"18.2%":[170],"2.2":[173],"72.55%":[175],"AVA-E,":[177],"reaches":[179],"86.97%":[180],"mAP":[182],"UCF101-24":[184],"with":[185],"margin":[187],"1.87%":[189],"over":[190],"second-best":[192],"result,":[193],"indicating":[194],"favorable":[195],"performance":[196],"multiple":[198],"benchmarks.":[199]},"counts_by_year":[],"updated_date":"2026-06-16T07:37:23.134862","created_date":"2026-06-16T00:00:00"}
