{"id":"https://openalex.org/W4415708506","doi":"https://doi.org/10.1109/icme59968.2025.11210092","title":"Trans-Diff:Transformer-based Video Summarization with Diffusion","display_name":"Trans-Diff:Transformer-based Video Summarization with Diffusion","publication_year":2025,"publication_date":"2025-06-30","ids":{"openalex":"https://openalex.org/W4415708506","doi":"https://doi.org/10.1109/icme59968.2025.11210092"},"language":null,"primary_location":{"id":"doi:10.1109/icme59968.2025.11210092","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icme59968.2025.11210092","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Multimedia and Expo (ICME)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5110522186","display_name":"Pan Cai","orcid":null},"institutions":[{"id":"https://openalex.org/I40963666","display_name":"Central China Normal University","ror":"https://ror.org/03x1jna21","country_code":"CN","type":"education","lineage":["https://openalex.org/I40963666"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Cai Pan","raw_affiliation_strings":["Central China Normal University,School of Computer Science,Wuhan,China"],"affiliations":[{"raw_affiliation_string":"Central China Normal University,School of Computer Science,Wuhan,China","institution_ids":["https://openalex.org/I40963666"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100350872","display_name":"Guowei Zhang","orcid":"https://orcid.org/0000-0001-6059-1947"},"institutions":[{"id":"https://openalex.org/I40963666","display_name":"Central China Normal University","ror":"https://ror.org/03x1jna21","country_code":"CN","type":"education","lineage":["https://openalex.org/I40963666"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Guowei Zhang","raw_affiliation_strings":["Central China Normal University,School of Computer Science,Wuhan,China"],"affiliations":[{"raw_affiliation_string":"Central China Normal University,School of Computer Science,Wuhan,China","institution_ids":["https://openalex.org/I40963666"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101752928","display_name":"Rui Zhong","orcid":"https://orcid.org/0000-0003-0126-7543"},"institutions":[{"id":"https://openalex.org/I40963666","display_name":"Central China Normal University","ror":"https://ror.org/03x1jna21","country_code":"CN","type":"education","lineage":["https://openalex.org/I40963666"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Rui Zhong","raw_affiliation_strings":["Central China Normal University,School of Computer Science,Wuhan,China"],"affiliations":[{"raw_affiliation_string":"Central China Normal University,School of Computer Science,Wuhan,China","institution_ids":["https://openalex.org/I40963666"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5110522186"],"corresponding_institution_ids":["https://openalex.org/I40963666"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.31103686,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"6"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.4381999969482422,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.4381999969482422,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.13279999792575836,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.12520000338554382,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/automatic-summarization","display_name":"Automatic summarization","score":0.8313000202178955},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.6450999975204468},{"id":"https://openalex.org/keywords/salient","display_name":"Salient","score":0.5882999897003174},{"id":"https://openalex.org/keywords/diffusion","display_name":"Diffusion","score":0.5612999796867371},{"id":"https://openalex.org/keywords/coherence","display_name":"Coherence (philosophical gambling strategy)","score":0.5382999777793884},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.49869999289512634},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.4925999939441681},{"id":"https://openalex.org/keywords/limiting","display_name":"Limiting","score":0.4153999984264374},{"id":"https://openalex.org/keywords/diffusion-map","display_name":"Diffusion map","score":0.3659000098705292}],"concepts":[{"id":"https://openalex.org/C170858558","wikidata":"https://www.wikidata.org/wiki/Q1394144","display_name":"Automatic summarization","level":2,"score":0.8313000202178955},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7814000248908997},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.6450999975204468},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6434999704360962},{"id":"https://openalex.org/C2780719617","wikidata":"https://www.wikidata.org/wiki/Q1030752","display_name":"Salient","level":2,"score":0.5882999897003174},{"id":"https://openalex.org/C69357855","wikidata":"https://www.wikidata.org/wiki/Q163214","display_name":"Diffusion","level":2,"score":0.5612999796867371},{"id":"https://openalex.org/C2781181686","wikidata":"https://www.wikidata.org/wiki/Q4226068","display_name":"Coherence (philosophical gambling strategy)","level":2,"score":0.5382999777793884},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.49869999289512634},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.4925999939441681},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.4677000045776367},{"id":"https://openalex.org/C188198153","wikidata":"https://www.wikidata.org/wiki/Q1613840","display_name":"Limiting","level":2,"score":0.4153999984264374},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.4146000146865845},{"id":"https://openalex.org/C55128770","wikidata":"https://www.wikidata.org/wiki/Q5275440","display_name":"Diffusion map","level":4,"score":0.3659000098705292},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.3441999852657318},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.31310001015663147},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.30889999866485596},{"id":"https://openalex.org/C202474056","wikidata":"https://www.wikidata.org/wiki/Q1931635","display_name":"Video tracking","level":3,"score":0.30880001187324524},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.28790000081062317},{"id":"https://openalex.org/C101738243","wikidata":"https://www.wikidata.org/wiki/Q786435","display_name":"Autoencoder","level":3,"score":0.2784000039100647},{"id":"https://openalex.org/C159620131","wikidata":"https://www.wikidata.org/wiki/Q1938983","display_name":"Spatial analysis","level":2,"score":0.27570000290870667},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.2718000113964081},{"id":"https://openalex.org/C2776151529","wikidata":"https://www.wikidata.org/wiki/Q3045304","display_name":"Object detection","level":3,"score":0.26820001006126404},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.25360000133514404},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.2535000145435333},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.25279998779296875},{"id":"https://openalex.org/C12713177","wikidata":"https://www.wikidata.org/wiki/Q1900281","display_name":"Perspective (graphical)","level":2,"score":0.25270000100135803}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icme59968.2025.11210092","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icme59968.2025.11210092","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Multimedia and Expo (ICME)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":28,"referenced_works":["https://openalex.org/W2013035813","https://openalex.org/W2529272619","https://openalex.org/W2737677090","https://openalex.org/W2766630207","https://openalex.org/W2781922022","https://openalex.org/W2798970487","https://openalex.org/W2963919999","https://openalex.org/W2971495686","https://openalex.org/W2973229164","https://openalex.org/W3010790568","https://openalex.org/W3138217142","https://openalex.org/W3210314917","https://openalex.org/W4225769600","https://openalex.org/W4225925355","https://openalex.org/W4283399854","https://openalex.org/W4295037258","https://openalex.org/W4312933868","https://openalex.org/W4376607835","https://openalex.org/W4384161695","https://openalex.org/W4385245566","https://openalex.org/W4386065705","https://openalex.org/W4386257881","https://openalex.org/W4386453740","https://openalex.org/W4386553691","https://openalex.org/W4386596942","https://openalex.org/W4386598561","https://openalex.org/W4392215335","https://openalex.org/W4396680553"],"related_works":[],"abstract_inverted_index":{"Current":[0],"deep":[1],"neural":[2],"network-based":[3],"methods":[4],"often":[5],"struggle":[6],"to":[7,35,92],"handle":[8],"video-level":[9,38],"signals":[10,39],"comprehensively":[11,36],"and":[12,57,111,128],"establish":[13],"long-term":[14,52,109],"dependencies":[15,53],"within":[16],"videos,":[17],"consequently":[18],"limiting":[19],"the":[20,23,46,68,94,98,104,119,122,134],"quality":[21],"of":[22,71,136],"generated":[24,123],"summaries.":[25],"To":[26],"address":[27],"these":[28],"challenges,":[29],"we":[30,77],"utilize":[31],"transformer-based":[32,47,80],"temporal-spatial":[33,43,48,120],"encoder":[34,49],"represent":[37],"via":[40],"globally":[41],"combining":[42,118],"features.":[44],"Furthermore,":[45],"can":[50],"capture":[51],"between":[54],"consecutive":[55],"frames":[56],"obtain":[58],"salient":[59],"object":[60],"features":[61],"for":[62],"individual":[63],"frames,":[64],"respectively.":[65],"However,":[66],"considering":[67],"potential":[69],"oversight":[70],"local":[72],"information":[73,115],"during":[74,103],"global":[75],"modeling,":[76],"propose":[78],"a":[79,87],"video":[81],"summarization":[82],"with":[83],"diffusion":[84,88,99],"(Trans-Diff)":[85],"where":[86],"model":[89,100],"is":[90],"used":[91],"reconstruct":[93],"spatial":[95,114],"details.":[96],"Concurrently,":[97],"infers":[101],"frame-by-frame":[102],"reverse":[105],"process,":[106],"further":[107],"enhancing":[108],"dependencies,":[110],"effectively":[112],"handling":[113],"correlations.":[116],"By":[117],"features,":[121],"summaries":[124],"exhibit":[125],"enhanced":[126],"coherence":[127],"contextual":[129],"completeness.":[130],"Extensive":[131],"experiments":[132],"demonstrate":[133],"superiority":[135],"this":[137],"approach":[138],"over":[139],"state-of-the-art":[140],"methods.":[141]},"counts_by_year":[],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-10-30T00:00:00"}
