{"id":"https://openalex.org/W4415537327","doi":"https://doi.org/10.1145/3746027.3754808","title":"M2PE-Diff: Music-to-Pose Encoder for Dance Video Generation Leveraging Latent Diffusion Framework","display_name":"M2PE-Diff: Music-to-Pose Encoder for Dance Video Generation Leveraging Latent Diffusion Framework","publication_year":2025,"publication_date":"2025-10-25","ids":{"openalex":"https://openalex.org/W4415537327","doi":"https://doi.org/10.1145/3746027.3754808"},"language":null,"primary_location":{"id":"doi:10.1145/3746027.3754808","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3754808","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5026245163","display_name":"Nokap Park","orcid":"https://orcid.org/0000-0001-7971-9461"},"institutions":[{"id":"https://openalex.org/I4210149944","display_name":"Korea Telecom (South Korea)","ror":null,"country_code":"KR","type":null,"lineage":["https://openalex.org/I4210149944"]}],"countries":["KR"],"is_corresponding":true,"raw_author_name":"Nokap Tony Park","raw_affiliation_strings":["SK Telecom, Seoul, Seoul, Republic of Korea"],"affiliations":[{"raw_affiliation_string":"SK Telecom, Seoul, Seoul, Republic of Korea","institution_ids":["https://openalex.org/I4210149944"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":1,"corresponding_author_ids":["https://openalex.org/A5026245163"],"corresponding_institution_ids":["https://openalex.org/I4210149944"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.38588916,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"2419","last_page":"2428"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12290","display_name":"Human Motion and Animation","score":0.9976999759674072,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12290","display_name":"Human Motion and Animation","score":0.9976999759674072,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9958000183105469,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.9950000047683716,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/dance","display_name":"Dance","score":0.6685000061988831},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.6668999791145325},{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.5834000110626221},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.5257999897003174},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.4925999939441681},{"id":"https://openalex.org/keywords/encoding","display_name":"Encoding (memory)","score":0.4893999993801117},{"id":"https://openalex.org/keywords/motion-capture","display_name":"Motion capture","score":0.4011000096797943},{"id":"https://openalex.org/keywords/sequence","display_name":"Sequence (biology)","score":0.39969998598098755}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7883999943733215},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6844000220298767},{"id":"https://openalex.org/C147446459","wikidata":"https://www.wikidata.org/wiki/Q11639","display_name":"Dance","level":2,"score":0.6685000061988831},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.6668999791145325},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.632099986076355},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.5834000110626221},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.5257999897003174},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.4925999939441681},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.4893999993801117},{"id":"https://openalex.org/C48007421","wikidata":"https://www.wikidata.org/wiki/Q676252","display_name":"Motion capture","level":3,"score":0.4011000096797943},{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.39969998598098755},{"id":"https://openalex.org/C183920142","wikidata":"https://www.wikidata.org/wiki/Q180856","display_name":"Choreography","level":3,"score":0.361299991607666},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3587000072002411},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.34619998931884766},{"id":"https://openalex.org/C104114177","wikidata":"https://www.wikidata.org/wiki/Q79782","display_name":"Motion (physics)","level":2,"score":0.3456000089645386},{"id":"https://openalex.org/C207347870","wikidata":"https://www.wikidata.org/wiki/Q371174","display_name":"Gesture","level":2,"score":0.3190000057220459},{"id":"https://openalex.org/C139807058","wikidata":"https://www.wikidata.org/wiki/Q352374","display_name":"Adaptation (eye)","level":2,"score":0.28790000081062317},{"id":"https://openalex.org/C82990744","wikidata":"https://www.wikidata.org/wiki/Q166194","display_name":"RGB color model","level":2,"score":0.27549999952316284},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.2599000036716461},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.25519999861717224},{"id":"https://openalex.org/C34413123","wikidata":"https://www.wikidata.org/wiki/Q170978","display_name":"Robotics","level":3,"score":0.2515000104904175},{"id":"https://openalex.org/C160920958","wikidata":"https://www.wikidata.org/wiki/Q7662746","display_name":"Synthetic data","level":2,"score":0.25060001015663147}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3746027.3754808","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3754808","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G5913837872","display_name":null,"funder_award_id":"RS-2024-00398536","funder_id":"https://openalex.org/F4320323890","funder_display_name":"Korea Creative Content Agency"}],"funders":[{"id":"https://openalex.org/F4320323890","display_name":"Korea Creative Content Agency","ror":"https://ror.org/036vyg793"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":17,"referenced_works":["https://openalex.org/W1967554269","https://openalex.org/W2133665775","https://openalex.org/W2908510526","https://openalex.org/W2962785568","https://openalex.org/W2963524571","https://openalex.org/W2963845150","https://openalex.org/W3189935790","https://openalex.org/W4206861281","https://openalex.org/W4287802874","https://openalex.org/W4309797367","https://openalex.org/W4312933868","https://openalex.org/W4321337794","https://openalex.org/W4385473553","https://openalex.org/W4388964370","https://openalex.org/W4393147998","https://openalex.org/W4399400925","https://openalex.org/W6947998850"],"related_works":[],"abstract_inverted_index":{"Automated":[0],"choreography":[1],"generation,":[2],"which":[3,63],"aims":[4],"to":[5],"seamlessly":[6],"harmonize":[7],"human":[8,31,76],"movements":[9],"with":[10,55,83,91],"music,":[11],"is":[12],"a":[13,26,36,43,50,56,92,96,157],"multifaceted":[14],"challenge":[15],"demanding":[16],"both":[17],"technical":[18],"precision":[19],"and":[20,39,71,78,109,141],"artistic":[21],"expressiveness.":[22],"We":[23],"present":[24],"M2PE-DIFF,":[25],"novel":[27,57],"framework":[28],"for":[29],"generating":[30],"dance":[32,112,134],"videos":[33,135],"conditioned":[34],"on":[35,123,156],"reference":[37,93,120],"image":[38,94],"music":[40],"sequence":[41],"using":[42],"latent":[44,102],"diffusion":[45,103],"model.":[46],"Our":[47],"approach":[48],"integrates":[49],"Music-to-Pose":[51],"Encoder":[52],"(M2PEnc),":[53],"trained":[54],"synthetic":[58],"dataset":[59],"generation":[60],"pipeline":[61],"(SDGPip),":[62],"maps":[64],"audio":[65],"features":[66],"into":[67],"structured":[68],"3D":[69],"pose":[70,139],"shape":[72],"parameters":[73,90],"that":[74,127,136],"capture":[75],"geometry":[77],"dynamic":[79],"motion":[80],"patterns":[81],"synchronized":[82,111],"musical":[84],"input.":[85],"By":[86],"combining":[87],"these":[88],"encoded":[89],"through":[95],"multi-level":[97],"attention":[98],"mechanism":[99],"within":[100],"the":[101,118],"framework,":[104],"we":[105],"synthesize":[106],"visually":[107],"coherent":[108],"rhythmically":[110],"animations":[113],"of":[114],"individuals":[115],"depicted":[116],"in":[117],"given":[119],"image.":[121],"Experiments":[122],"benchmark":[124],"datasets":[125],"demonstrate":[126],"M2PE-DIFF":[128],"achieves":[129],"state-of-the-art":[130],"performance,":[131],"producing":[132],"high-quality":[133],"accurately":[137],"reflect":[138],"diversity":[140],"temporal":[142],"consistency.":[143],"Additionally,":[144],"our":[145],"method":[146],"exhibits":[147],"robust":[148],"generalization":[149],"capabilities,":[150],"validated":[151],"by":[152],"its":[153],"strong":[154],"performance":[155],"newly":[158],"introduced":[159],"in-the-wild":[160],"dataset.":[161]},"counts_by_year":[],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-10-25T00:00:00"}
