{"id":"https://openalex.org/W7160280113","doi":"https://doi.org/10.48550/arxiv.2605.01809","title":"TMD-Bench: A Multi-Level Evaluation Paradigm for Music-Dance Co-Generation","display_name":"TMD-Bench: A Multi-Level Evaluation Paradigm for Music-Dance Co-Generation","publication_year":2026,"publication_date":"2026-05-03","ids":{"openalex":"https://openalex.org/W7160280113","doi":"https://doi.org/10.48550/arxiv.2605.01809"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.01809","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.01809","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"public-domain","license_id":"https://openalex.org/licenses/public-domain","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.01809","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5135411250","display_name":"Xiaoda Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Xiaoda","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5036840817","display_name":"Majun Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Majun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103228451","display_name":"Changhao Pan","orcid":"https://orcid.org/0009-0004-6023-1764"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pan, Changhao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135413400","display_name":"Nick Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Nick","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5106663302","display_name":"\u6768\u5b87\u5149 Yang Yuguang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuguang, Yang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101036892","display_name":"Fan Zhuo","orcid":"https://orcid.org/0009-0009-2324-0975"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhuo, Fan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135349327","display_name":"Pengfei Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Pengfei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135287998","display_name":"Jin Zhou","orcid":"https://orcid.org/0000-0003-2993-6235"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Jin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101198358","display_name":"Sizhe Shan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shan, Sizhe","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135315289","display_name":"Shan Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Shan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135402710","display_name":"Miles Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Miles","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135316582","display_name":"Yang You","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"You, Yang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5135417661","display_name":"Zhou Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Zhou","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":13,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.3625999987125397,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.3625999987125397,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.34220001101493835,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12290","display_name":"Human Motion and Animation","score":0.08730000257492065,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.7912999987602234},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.6258999705314636},{"id":"https://openalex.org/keywords/rhythm","display_name":"Rhythm","score":0.6104000210762024},{"id":"https://openalex.org/keywords/baseline","display_name":"Baseline (sea)","score":0.569100022315979},{"id":"https://openalex.org/keywords/synchronization","display_name":"Synchronization (alternating current)","score":0.5674999952316284},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5654000043869019},{"id":"https://openalex.org/keywords/motion-capture","display_name":"Motion capture","score":0.4251999855041504},{"id":"https://openalex.org/keywords/production","display_name":"Production (economics)","score":0.36160001158714294}],"concepts":[{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.7912999987602234},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6547999978065491},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.6258999705314636},{"id":"https://openalex.org/C135343436","wikidata":"https://www.wikidata.org/wiki/Q170406","display_name":"Rhythm","level":2,"score":0.6104000210762024},{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.569100022315979},{"id":"https://openalex.org/C2778562939","wikidata":"https://www.wikidata.org/wiki/Q1298791","display_name":"Synchronization (alternating current)","level":3,"score":0.5674999952316284},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5654000043869019},{"id":"https://openalex.org/C48007421","wikidata":"https://www.wikidata.org/wiki/Q676252","display_name":"Motion capture","level":3,"score":0.4251999855041504},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3840999901294708},{"id":"https://openalex.org/C2778348673","wikidata":"https://www.wikidata.org/wiki/Q739302","display_name":"Production (economics)","level":2,"score":0.36160001158714294},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.359499990940094},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.3222000002861023},{"id":"https://openalex.org/C104114177","wikidata":"https://www.wikidata.org/wiki/Q79782","display_name":"Motion (physics)","level":2,"score":0.3221000134944916},{"id":"https://openalex.org/C131584629","wikidata":"https://www.wikidata.org/wiki/Q4308705","display_name":"Coupling (piping)","level":2,"score":0.3158999979496002},{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.30820000171661377},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3061000108718872},{"id":"https://openalex.org/C558565934","wikidata":"https://www.wikidata.org/wiki/Q2743","display_name":"Musical","level":2,"score":0.2953999936580658},{"id":"https://openalex.org/C2778334786","wikidata":"https://www.wikidata.org/wiki/Q1586270","display_name":"Variation (astronomy)","level":2,"score":0.2928999960422516},{"id":"https://openalex.org/C2777946086","wikidata":"https://www.wikidata.org/wiki/Q1163335","display_name":"Music information retrieval","level":3,"score":0.28780001401901245},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.2806999981403351},{"id":"https://openalex.org/C45493050","wikidata":"https://www.wikidata.org/wiki/Q7884934","display_name":"Unified Model","level":2,"score":0.2768999934196472},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.27649998664855957},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.27090001106262207},{"id":"https://openalex.org/C2781215313","wikidata":"https://www.wikidata.org/wiki/Q3493345","display_name":"SPARK (programming language)","level":2,"score":0.2648000121116638}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.01809","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.01809","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"public-domain","license_id":"https://openalex.org/licenses/public-domain","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.01809","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.01809","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"public-domain","license_id":"https://openalex.org/licenses/public-domain","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Unified":[0],"audio-visual":[1,123],"generation":[2,80],"is":[3,50,99],"rapidly":[4],"gaining":[5],"industrial":[6],"and":[7,15,36,46,84,98,107,129,135,144,149,180],"creative":[8],"relevance,":[9],"enabling":[10],"applications":[11],"in":[12,62],"virtual":[13],"production":[14],"interactive":[16],"media.":[17],"However,":[18],"when":[19],"moving":[20],"from":[21],"general":[22],"audio-video":[23],"synthesis":[24],"to":[25],"music-dance":[26,73,105,174],"co-generation,":[27],"the":[28],"task":[29],"becomes":[30],"substantially":[31],"harder:":[32],"musical":[33],"rhythm,":[34],"phrasing,":[35],"accents":[37],"must":[38],"drive":[39],"choreographic":[40],"motion":[41],"at":[42],"fine":[43],"temporal":[44],"resolution,":[45],"such":[47,125],"rhythmic":[48,86,138,179],"coupling":[49,139],"not":[51],"captured":[52],"by":[53,101],"unimodal":[54,79,166],"metrics":[55,93],"or":[56],"generic":[57],"audiovisual":[58],"consistency":[59],"scores":[60],"used":[61],"current":[63],"evaluation":[64],"practice.":[65],"We":[66],"introduce":[67],"TMD-Bench,":[68],"a":[69,102,108],"benchmark":[70,89],"for":[71,112,147,171],"text-driven":[72],"co-generation":[74],"that":[75,119,176],"assesses":[76],"systems":[77],"across":[78],"quality,":[81],"instruction":[82],"adherence,":[83],"cross-modal":[85],"alignment.":[87],"The":[88],"integrates":[90],"computable":[91],"physical":[92],"with":[94],"perceptual":[95],"multimodal":[96],"judgments,":[97],"supported":[100],"curated":[103],"rhythm-aligned":[104,157],"dataset":[106],"fine-grained":[109],"Music":[110],"Captioner":[111],"structured":[113],"music":[114,134],"semantics.":[115],"TMD-Bench":[116],"further":[117],"reveals":[118],"(i)":[120],"modern":[121],"commercial":[122],"models,":[124],"as":[126],"Veo":[127],"3":[128],"Sora":[130],"2,":[131],"produce":[132],"high-quality":[133],"video,":[136],"while":[137,163],"remains":[140],"less":[141],"consistently":[142],"optimized":[143],"leaves":[145],"room":[146],"improvement,":[148],"(ii)":[150],"our":[151],"unified":[152],"baseline":[153],"RhyJAM":[154],"trained":[155],"on":[156],"data":[158],"achieves":[159],"competitive":[160,165],"beat-level":[161],"synchronization":[162],"maintaining":[164],"fidelity.":[167],"This":[168],"presents":[169],"prospects":[170],"building":[172],"next-generation":[173],"models":[175],"explicitly":[177],"optimize":[178],"kinetic":[181],"coherence.":[182]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-06T00:00:00"}
