{"id":"https://openalex.org/W7129627662","doi":"https://doi.org/10.1109/icipw68931.2025.11385906","title":"Semanticbox: Bounding Box-Guided Caption Enhanced Action Recognition for Instructional Videos","display_name":"Semanticbox: Bounding Box-Guided Caption Enhanced Action Recognition for Instructional Videos","publication_year":2025,"publication_date":"2025-09-14","ids":{"openalex":"https://openalex.org/W7129627662","doi":"https://doi.org/10.1109/icipw68931.2025.11385906"},"language":null,"primary_location":{"id":"doi:10.1109/icipw68931.2025.11385906","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icipw68931.2025.11385906","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Image Processing Workshops (ICIPW)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5126186530","display_name":"Jonathan McGee","orcid":null},"institutions":[{"id":"https://openalex.org/I175594653","display_name":"John Brown University","ror":"https://ror.org/02ct41q97","country_code":"US","type":"education","lineage":["https://openalex.org/I175594653"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Jonathan McGee","raw_affiliation_strings":["University of Virginia,C.L. Brown,Dept. of Electrical and Computer Engineering,VA,USA"],"affiliations":[{"raw_affiliation_string":"University of Virginia,C.L. Brown,Dept. of Electrical and Computer Engineering,VA,USA","institution_ids":["https://openalex.org/I175594653"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5032685001","display_name":"Chongyu He","orcid":null},"institutions":[{"id":"https://openalex.org/I175594653","display_name":"John Brown University","ror":"https://ror.org/02ct41q97","country_code":"US","type":"education","lineage":["https://openalex.org/I175594653"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Chongyu He","raw_affiliation_strings":["University of Virginia,C.L. Brown,Dept. of Electrical and Computer Engineering,VA,USA"],"affiliations":[{"raw_affiliation_string":"University of Virginia,C.L. Brown,Dept. of Electrical and Computer Engineering,VA,USA","institution_ids":["https://openalex.org/I175594653"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5081562360","display_name":"Peter Youngs","orcid":"https://orcid.org/0000-0002-1711-1749"},"institutions":[{"id":"https://openalex.org/I1296638804","display_name":"Virginia Department of Education","ror":"https://ror.org/05jnjmr07","country_code":"US","type":"government","lineage":["https://openalex.org/I1296638804"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Peter Youngs","raw_affiliation_strings":["University of Virginia,Department of Curriculum, Instruction, and Special Education,VA,USA"],"affiliations":[{"raw_affiliation_string":"University of Virginia,Department of Curriculum, Instruction, and Special Education,VA,USA","institution_ids":["https://openalex.org/I1296638804"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126240518","display_name":"Scott T. Acton","orcid":null},"institutions":[{"id":"https://openalex.org/I175594653","display_name":"John Brown University","ror":"https://ror.org/02ct41q97","country_code":"US","type":"education","lineage":["https://openalex.org/I175594653"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Scott T. Acton","raw_affiliation_strings":["University of Virginia,C.L. Brown,Dept. of Electrical and Computer Engineering,VA,USA"],"affiliations":[{"raw_affiliation_string":"University of Virginia,C.L. Brown,Dept. of Electrical and Computer Engineering,VA,USA","institution_ids":["https://openalex.org/I175594653"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5041300663","display_name":"Matthew Korban","orcid":"https://orcid.org/0009-0008-4610-9390"},"institutions":[{"id":"https://openalex.org/I175594653","display_name":"John Brown University","ror":"https://ror.org/02ct41q97","country_code":"US","type":"education","lineage":["https://openalex.org/I175594653"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Matthew Korban","raw_affiliation_strings":["University of Virginia,C.L. Brown,Dept. of Electrical and Computer Engineering,VA,USA"],"affiliations":[{"raw_affiliation_string":"University of Virginia,C.L. Brown,Dept. of Electrical and Computer Engineering,VA,USA","institution_ids":["https://openalex.org/I175594653"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5126186530"],"corresponding_institution_ids":["https://openalex.org/I175594653"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.74339756,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"333","last_page":"338"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.880299985408783,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.880299985408783,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.08060000091791153,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11398","display_name":"Hand Gesture Recognition Systems","score":0.004399999976158142,"subfield":{"id":"https://openalex.org/subfields/1709","display_name":"Human-Computer Interaction"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/action-recognition","display_name":"Action recognition","score":0.7214999794960022},{"id":"https://openalex.org/keywords/bounding-overwatch","display_name":"Bounding overwatch","score":0.6758999824523926},{"id":"https://openalex.org/keywords/classifier","display_name":"Classifier (UML)","score":0.6302000284194946},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.5134000182151794},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.5020999908447266},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.492000013589859},{"id":"https://openalex.org/keywords/frame","display_name":"Frame (networking)","score":0.41029998660087585},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.4092000126838684},{"id":"https://openalex.org/keywords/action","display_name":"Action (physics)","score":0.40400001406669617}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8212000131607056},{"id":"https://openalex.org/C2987834672","wikidata":"https://www.wikidata.org/wiki/Q4677630","display_name":"Action recognition","level":3,"score":0.7214999794960022},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6937999725341797},{"id":"https://openalex.org/C63584917","wikidata":"https://www.wikidata.org/wiki/Q333286","display_name":"Bounding overwatch","level":2,"score":0.6758999824523926},{"id":"https://openalex.org/C95623464","wikidata":"https://www.wikidata.org/wiki/Q1096149","display_name":"Classifier (UML)","level":2,"score":0.6302000284194946},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.5134000182151794},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.5020999908447266},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.492000013589859},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.41029998660087585},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.4092000126838684},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.40400001406669617},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.3919999897480011},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.38510000705718994},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.3637999892234802},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.33959999680519104},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.31850001215934753},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.31060001254081726},{"id":"https://openalex.org/C121687571","wikidata":"https://www.wikidata.org/wiki/Q4677630","display_name":"Activity recognition","level":2,"score":0.3075000047683716},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.2978000044822693},{"id":"https://openalex.org/C147037132","wikidata":"https://www.wikidata.org/wiki/Q6865426","display_name":"Minimum bounding box","level":3,"score":0.29649999737739563},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.296099990606308},{"id":"https://openalex.org/C2780910867","wikidata":"https://www.wikidata.org/wiki/Q1952416","display_name":"Multimodality","level":2,"score":0.2913999855518341},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.2851000130176544},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.25839999318122864},{"id":"https://openalex.org/C159437735","wikidata":"https://www.wikidata.org/wiki/Q1519524","display_name":"Gesture recognition","level":3,"score":0.2572000026702881},{"id":"https://openalex.org/C2780660688","wikidata":"https://www.wikidata.org/wiki/Q25052564","display_name":"Multimodal learning","level":2,"score":0.2535000145435333},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.2513999938964844}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icipw68931.2025.11385906","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icipw68931.2025.11385906","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Image Processing Workshops (ICIPW)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","score":0.8375077843666077,"id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":13,"referenced_works":["https://openalex.org/W2611596598","https://openalex.org/W2962677013","https://openalex.org/W3175227519","https://openalex.org/W3203086442","https://openalex.org/W4304014690","https://openalex.org/W4312376880","https://openalex.org/W4312480274","https://openalex.org/W4312509322","https://openalex.org/W4386072441","https://openalex.org/W4392827338","https://openalex.org/W4393148669","https://openalex.org/W4402660140","https://openalex.org/W4407056550"],"related_works":[],"abstract_inverted_index":{"Multimodal":[0],"action":[1,35,65],"recognition":[2,36,66],"within":[3],"complex":[4,108],"scenes":[5],"requires":[6],"a":[7,30,51,80,107],"comprehensive":[8],"understanding":[9],"of":[10,21],"the":[11,16,22,56,63,75,92],"entire":[12],"scene,":[13],"encompassing":[14],"both":[15],"visual":[17,71],"and":[18,42,96,123,128,133],"audio":[19],"aspects":[20],"video.":[23],"Contrastive":[24],"Learning":[25],"Image":[26],"Pretraining":[27],"(CLIP)":[28],"is":[29,84],"well-known":[31],"backbone":[32],"for":[33],"multi-modal":[34],"tasks":[37],"as":[38],"seen":[39],"in":[40,126],"ActionCLIP":[41],"its":[43],"variants.":[44],"However,":[45],"these":[46],"models":[47,122],"are":[48],"subject":[49],"to":[50,69,86,119],"major":[52],"weakness:":[53],"overemphasis":[54],"on":[55,106],"background.":[57],"SemanticBox":[58,102],"integrates":[59],"bounding":[60],"boxes":[61],"into":[62],"video":[64,110],"CLIP-style":[67],"paradigm":[68],"add":[70],"clues":[72],"that":[73],"boost":[74],"model\u2019s":[76],"classification":[77],"performance.":[78],"Additionally,":[79],"pretrained":[81],"generative":[82],"classifier":[83],"added":[85],"provide":[87],"rich":[88],"frame":[89],"descriptions,":[90],"enhancing":[91],"textual":[93],"feature":[94],"semantics":[95],"offering":[97],"an":[98],"additional":[99],"performance":[100,105],"boost.":[101],"achieves":[103],"impressive":[104],"instructional":[109],"dataset":[111],"characterized":[112],"by":[113],"background":[114],"clutter,":[115],"achieving":[116],"comparable":[117],"Recall@2":[118],"state-of-the-art":[120],"CLIP-based":[121],"outperforming":[124],"them":[125],"Top-1":[127],"Top-2":[129],"accuracy,":[130],"F1":[131],"score,":[132],"mean":[134],"average":[135],"precision":[136],"(mAP).":[137]},"counts_by_year":[],"updated_date":"2026-02-19T06:27:42.648592","created_date":"2026-02-18T00:00:00"}
