{"id":"https://openalex.org/W7150854764","doi":"https://doi.org/10.1145/3793853.3795749","title":"MARs: Multi-Scale Convolution-Attention residual Fusion for Video Summarization","display_name":"MARs: Multi-Scale Convolution-Attention residual Fusion for Video Summarization","publication_year":2026,"publication_date":"2026-04-04","ids":{"openalex":"https://openalex.org/W7150854764","doi":"https://doi.org/10.1145/3793853.3795749"},"language":null,"primary_location":{"id":"doi:10.1145/3793853.3795749","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3793853.3795749","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM Multimedia Systems Conference 2026","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3793853.3795749","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5133058177","display_name":"Joon-Seok Song","orcid":null},"institutions":[{"id":"https://openalex.org/I848706","display_name":"Sungkyunkwan University","ror":"https://ror.org/04q78tk20","country_code":"KR","type":"education","lineage":["https://openalex.org/I848706"]}],"countries":["KR"],"is_corresponding":true,"raw_author_name":"Joon-Seok Song","raw_affiliation_strings":["Sungkyunkwan University, Seoul, Republic of Korea"],"raw_orcid":"https://orcid.org/0009-0006-8905-7063","affiliations":[{"raw_affiliation_string":"Sungkyunkwan University, Seoul, Republic of Korea","institution_ids":["https://openalex.org/I848706"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5024822335","display_name":"Juyeob Lee","orcid":"https://orcid.org/0000-0002-0686-1712"},"institutions":[{"id":"https://openalex.org/I848706","display_name":"Sungkyunkwan University","ror":"https://ror.org/04q78tk20","country_code":"KR","type":"education","lineage":["https://openalex.org/I848706"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Juyeob Lee","raw_affiliation_strings":["Sungkyunkwan University, Seoul, Republic of Korea"],"raw_orcid":"https://orcid.org/0000-0002-0686-1712","affiliations":[{"raw_affiliation_string":"Sungkyunkwan University, Seoul, Republic of Korea","institution_ids":["https://openalex.org/I848706"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5047279790","display_name":"Eunil Park","orcid":"https://orcid.org/0000-0002-3177-3538"},"institutions":[{"id":"https://openalex.org/I848706","display_name":"Sungkyunkwan University","ror":"https://ror.org/04q78tk20","country_code":"KR","type":"education","lineage":["https://openalex.org/I848706"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Eunil Park","raw_affiliation_strings":["Sungkyunkwan University, Seoul, Republic of Korea"],"raw_orcid":"https://orcid.org/0000-0002-3177-3538","affiliations":[{"raw_affiliation_string":"Sungkyunkwan University, Seoul, Republic of Korea","institution_ids":["https://openalex.org/I848706"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5133058177"],"corresponding_institution_ids":["https://openalex.org/I848706"],"apc_list":null,"apc_paid":null,"fwci":32.4097,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.99556141,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":96,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"84","last_page":"95"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9556000232696533,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9556000232696533,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.006899999920278788,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.0066999997943639755,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/automatic-summarization","display_name":"Automatic summarization","score":0.9085999727249146},{"id":"https://openalex.org/keywords/convolutional-neural-network","display_name":"Convolutional neural network","score":0.6794999837875366},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.6546000242233276},{"id":"https://openalex.org/keywords/residual","display_name":"Residual","score":0.6182000041007996},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.4478999972343445},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.40849998593330383},{"id":"https://openalex.org/keywords/feature-extraction","display_name":"Feature extraction","score":0.3959999978542328}],"concepts":[{"id":"https://openalex.org/C170858558","wikidata":"https://www.wikidata.org/wiki/Q1394144","display_name":"Automatic summarization","level":2,"score":0.9085999727249146},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8220999836921692},{"id":"https://openalex.org/C81363708","wikidata":"https://www.wikidata.org/wiki/Q17084460","display_name":"Convolutional neural network","level":2,"score":0.6794999837875366},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6687999963760376},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.6546000242233276},{"id":"https://openalex.org/C155512373","wikidata":"https://www.wikidata.org/wiki/Q287450","display_name":"Residual","level":2,"score":0.6182000041007996},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.4478999972343445},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.40849998593330383},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.3959999978542328},{"id":"https://openalex.org/C188198153","wikidata":"https://www.wikidata.org/wiki/Q1613840","display_name":"Limiting","level":2,"score":0.35740000009536743},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.3359000086784363},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.32850000262260437},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3098999857902527},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.29339998960494995},{"id":"https://openalex.org/C94124525","wikidata":"https://www.wikidata.org/wiki/Q912550","display_name":"Categorization","level":2,"score":0.2919999957084656},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.25839999318122864},{"id":"https://openalex.org/C157899210","wikidata":"https://www.wikidata.org/wiki/Q1395022","display_name":"Convolutional code","level":3,"score":0.2540000081062317},{"id":"https://openalex.org/C202474056","wikidata":"https://www.wikidata.org/wiki/Q1931635","display_name":"Video tracking","level":3,"score":0.2515000104904175}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3793853.3795749","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3793853.3795749","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM Multimedia Systems Conference 2026","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3793853.3795749","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3793853.3795749","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM Multimedia Systems Conference 2026","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":11,"referenced_works":["https://openalex.org/W1979354511","https://openalex.org/W3099156605","https://openalex.org/W3123845245","https://openalex.org/W3210592264","https://openalex.org/W4225769600","https://openalex.org/W4376607835","https://openalex.org/W4386453740","https://openalex.org/W4398223129","https://openalex.org/W4401866943","https://openalex.org/W4408889520","https://openalex.org/W4414112109"],"related_works":[],"abstract_inverted_index":{"Video":[0],"summarization":[1,65,72,131],"aims":[2],"to":[3,31,40,50,62,128],"selectively":[4],"extract":[5],"important":[6],"events":[7],"and":[8,14,94,123],"contexts":[9],"embedded":[10],"in":[11],"a":[12,18,69],"video":[13,20,71,110],"reconstruct":[15],"them":[16],"into":[17],"concise":[19],"while":[21,101],"preserving":[22],"the":[23,74,108,130,141,151,157,160],"original":[24],"content.":[25],"Previous":[26],"studies":[27,148],"employed":[28],"attention-based":[29,112],"approaches":[30,48],"capture":[32],"long-term":[33],"global":[34,55,104],"dependencies":[35],"or":[36,56],"convolutional":[37,82,99],"neural":[38],"networks":[39],"learn":[41],"local":[42,57,96],"frame-level":[43],"patterns":[44],"separately.":[45],"However,":[46],"these":[47],"tend":[49],"be":[51],"biased":[52],"toward":[53],"either":[54],"information,":[58],"limiting":[59],"their":[60],"ability":[61],"achieve":[63],"high":[64],"performance.":[66,132],"We":[67],"propose":[68],"new":[70],"architecture,":[73],"Multi-Scale":[75],"Convolution-Attention":[76],"Residual":[77],"Fusion":[78],"(MARs),":[79],"which":[80],"integrates":[81],"modules":[83,100],"with":[84],"multi-head":[85],"self-attention":[86],"mechanisms.":[87],"The":[88],"proposed":[89,142,161],"model":[90,143],"captures":[91],"inter-frame":[92],"variations":[93],"fine-grained":[95],"features":[97],"through":[98],"simultaneously":[102],"learning":[103],"contextual":[105],"information":[106],"across":[107],"entire":[109],"using":[111],"modules.":[113],"Furthermore,":[114],"additional":[115],"designs,":[116],"such":[117],"as":[118],"temporal-difference":[119],"embedding,":[120],"multi-scale":[121],"convolution,":[122],"positional":[124],"encoding,":[125],"are":[126,164],"incorporated":[127],"enhance":[129],"Experimental":[133],"results":[134],"on":[135],"two":[136],"benchmark":[137],"datasets":[138],"demonstrate":[139],"that":[140],"outperforms":[144],"existing":[145],"methods.":[146],"Ablation":[147],"further":[149],"validate":[150],"contribution":[152],"of":[153,159],"each":[154],"module,":[155],"confirming":[156],"effectiveness":[158],"architecture.":[162],"Codes":[163],"available":[165],"at":[166],"https://github.com/dxlabskku/MARs.":[167]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-04-07T06:06:30.997549","created_date":"2026-04-07T00:00:00"}
