{"id":"https://openalex.org/W4414938683","doi":"https://doi.org/10.1145/3728423.3759411","title":"Towards Robust Identity Incorporation in Sports Video Captioning Systems","display_name":"Towards Robust Identity Incorporation in Sports Video Captioning Systems","publication_year":2025,"publication_date":"2025-10-08","ids":{"openalex":"https://openalex.org/W4414938683","doi":"https://doi.org/10.1145/3728423.3759411"},"language":"en","primary_location":{"id":"doi:10.1145/3728423.3759411","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3728423.3759411","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 8th International ACM Workshop on Multimedia Content Analysis in Sports","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3728423.3759411","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5114733935","display_name":"Karol Wojtulewicz","orcid":"https://orcid.org/0009-0007-0396-8921"},"institutions":[{"id":"https://openalex.org/I102134673","display_name":"Link\u00f6ping University","ror":"https://ror.org/05ynxx418","country_code":"SE","type":"education","lineage":["https://openalex.org/I102134673"]}],"countries":["SE"],"is_corresponding":true,"raw_author_name":"Karol Wojtulewicz","raw_affiliation_strings":["Link\u00f6ping University, Link\u00f6ping, Sweden"],"raw_orcid":"https://orcid.org/0009-0007-0396-8921","affiliations":[{"raw_affiliation_string":"Link\u00f6ping University, Link\u00f6ping, Sweden","institution_ids":["https://openalex.org/I102134673"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5077044865","display_name":"Niklas Carlsson","orcid":"https://orcid.org/0000-0003-1367-1594"},"institutions":[{"id":"https://openalex.org/I102134673","display_name":"Link\u00f6ping University","ror":"https://ror.org/05ynxx418","country_code":"SE","type":"education","lineage":["https://openalex.org/I102134673"]}],"countries":["SE"],"is_corresponding":false,"raw_author_name":"Niklas Carlsson","raw_affiliation_strings":["Link\u00f6ping University, Link\u00f6ping, Sweden"],"raw_orcid":"https://orcid.org/0000-0003-1367-1594","affiliations":[{"raw_affiliation_string":"Link\u00f6ping University, Link\u00f6ping, Sweden","institution_ids":["https://openalex.org/I102134673"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5114733935"],"corresponding_institution_ids":["https://openalex.org/I102134673"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.2586397,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"18","last_page":"30"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9988999962806702,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.8363000154495239},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.5874000191688538},{"id":"https://openalex.org/keywords/identity","display_name":"Identity (music)","score":0.4832000136375427},{"id":"https://openalex.org/keywords/action","display_name":"Action (physics)","score":0.45829999446868896},{"id":"https://openalex.org/keywords/modality","display_name":"Modality (human\u2013computer interaction)","score":0.4032000005245209},{"id":"https://openalex.org/keywords/comprehension","display_name":"Comprehension","score":0.367900013923645},{"id":"https://openalex.org/keywords/identifier","display_name":"Identifier","score":0.336899995803833},{"id":"https://openalex.org/keywords/interpretation","display_name":"Interpretation (philosophy)","score":0.33500000834465027}],"concepts":[{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.8363000154495239},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7660999894142151},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.5874000191688538},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.5023999810218811},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.484499990940094},{"id":"https://openalex.org/C2778355321","wikidata":"https://www.wikidata.org/wiki/Q17079427","display_name":"Identity (music)","level":2,"score":0.4832000136375427},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.45829999446868896},{"id":"https://openalex.org/C2780226545","wikidata":"https://www.wikidata.org/wiki/Q6888030","display_name":"Modality (human\u2013computer interaction)","level":2,"score":0.4032000005245209},{"id":"https://openalex.org/C511192102","wikidata":"https://www.wikidata.org/wiki/Q5156948","display_name":"Comprehension","level":2,"score":0.367900013923645},{"id":"https://openalex.org/C154504017","wikidata":"https://www.wikidata.org/wiki/Q853614","display_name":"Identifier","level":2,"score":0.336899995803833},{"id":"https://openalex.org/C527412718","wikidata":"https://www.wikidata.org/wiki/Q855395","display_name":"Interpretation (philosophy)","level":2,"score":0.33500000834465027},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.33329999446868896},{"id":"https://openalex.org/C2987834672","wikidata":"https://www.wikidata.org/wiki/Q4677630","display_name":"Action recognition","level":3,"score":0.31060001254081726},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.31049999594688416},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.29829999804496765},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.2897000014781952},{"id":"https://openalex.org/C199033989","wikidata":"https://www.wikidata.org/wiki/Q1318295","display_name":"Narrative","level":2,"score":0.28760001063346863},{"id":"https://openalex.org/C2777096784","wikidata":"https://www.wikidata.org/wiki/Q3826351","display_name":"Referent","level":2,"score":0.28130000829696655},{"id":"https://openalex.org/C202474056","wikidata":"https://www.wikidata.org/wiki/Q1931635","display_name":"Video tracking","level":3,"score":0.2800999879837036},{"id":"https://openalex.org/C194995250","wikidata":"https://www.wikidata.org/wiki/Q531136","display_name":"Affordance","level":2,"score":0.26899999380111694},{"id":"https://openalex.org/C56461940","wikidata":"https://www.wikidata.org/wiki/Q970687","display_name":"Eye tracking","level":2,"score":0.26579999923706055},{"id":"https://openalex.org/C2777402240","wikidata":"https://www.wikidata.org/wiki/Q6783436","display_name":"Masking (illustration)","level":2,"score":0.25279998779296875}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3728423.3759411","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3728423.3759411","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 8th International ACM Workshop on Multimedia Content Analysis in Sports","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3728423.3759411","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3728423.3759411","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 8th International ACM Workshop on Multimedia Content Analysis in Sports","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":20,"referenced_works":["https://openalex.org/W1586939924","https://openalex.org/W2089515781","https://openalex.org/W2798793675","https://openalex.org/W2899463613","https://openalex.org/W2914699769","https://openalex.org/W2935104927","https://openalex.org/W2963073614","https://openalex.org/W2963524571","https://openalex.org/W3010593057","https://openalex.org/W3022778813","https://openalex.org/W3035635319","https://openalex.org/W3053202119","https://openalex.org/W3109395208","https://openalex.org/W3207126306","https://openalex.org/W4288089799","https://openalex.org/W4312425089","https://openalex.org/W4312508181","https://openalex.org/W4312956563","https://openalex.org/W4386057769","https://openalex.org/W4402727919"],"related_works":[],"abstract_inverted_index":{"Sports":[0],"video":[1,38,157],"analysis":[2],"is":[3],"a":[4,28,183],"rapidly":[5],"growing":[6],"field.":[7],"Yet,":[8],"identity-aware,":[9],"temporally":[10],"grounded":[11],"captioning,":[12,41],"linking":[13],"global":[14,76],"player":[15,33,45,72,124,165],"identities":[16],"across":[17],"fast,":[18],"multi-agent":[19],"interactions,":[20],"remains":[21],"underexplored.":[22],"We":[23],"address":[24],"this":[25],"gap":[26],"with":[27],"novel":[29],"method":[30,145],"for":[31,75,155,163,186],"integrating":[32],"identity":[34,77],"into":[35,175],"multimodal":[36,136],"sports":[37,193],"models,":[39],"improving":[40],"action":[42,64],"understanding,":[43],"and":[44,66,89,105,110,123,160,172,189],"recognition.":[46],"Our":[47],"approach":[48],"(1)":[49],"employs":[50],"triangular":[51],"attention":[52],"masking":[53],"within":[54],"modality":[55],"encoders,":[56],"capturing":[57],"temporal":[58,132],"inductive":[59],"biases":[60],"to":[61,82,86,99,118,169],"better":[62],"model":[63,81],"sequences":[65],"their":[67],"causal":[68],"flow,":[69],"(2)":[70],"proposes":[71],"token":[73],"injection":[74],"grounding,":[78],"enabling":[79],"the":[80,101,131,176],"connect":[83],"visual":[84],"observations":[85],"named":[87],"individuals,":[88],"(3)":[90],"simulates":[91],"ball":[92],"possession":[93],"sequences,":[94],"mimicking":[95],"real-world":[96],"tracking":[97],"data":[98],"strengthen":[100],"link":[102],"between":[103],"actions":[104],"involved":[106],"players.":[107],"Both":[108],"combined":[109],"individually,":[111],"each":[112],"of":[113,191],"these":[114],"components":[115],"allows":[116],"us":[117],"significantly":[119],"improve":[120],"caption":[121],"quality":[122],"classification":[125],"accuracy,":[126],"as":[127,129],"well":[128],"enhance":[130],"comprehension":[133],"in":[134],"our":[135,144],"data.":[137],"Using":[138],"extensive":[139],"experimentation,":[140],"we":[141],"show":[142],"that":[143],"achieves":[146],"substantial":[147],"improvements":[148],"over":[149,161],"prior":[150],"work":[151],"(e.g.,":[152],"up-to":[153],"225%":[154],"some":[156,164],"captioning":[158],"tasks":[159],"14\u00d7":[162],"recognition":[166],"tasks),":[167],"generalize":[168],"other":[170],"domains,":[171],"provide":[173],"insights":[174],"best":[177],"design":[178],"tradeoffs.":[179],"The":[180],"results":[181],"highlight":[182],"promising":[184],"avenue":[185],"automated":[187],"understanding":[188],"interpretation":[190],"dynamic":[192],"content.":[194]},"counts_by_year":[],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
