{"id":"https://openalex.org/W4414539782","doi":"https://doi.org/10.23919/mva65244.2025.11175126","title":"Temporal Conditioning for Realistic Performance Video Generation from Instrumental Sounds","display_name":"Temporal Conditioning for Realistic Performance Video Generation from Instrumental Sounds","publication_year":2025,"publication_date":"2025-07-26","ids":{"openalex":"https://openalex.org/W4414539782","doi":"https://doi.org/10.23919/mva65244.2025.11175126"},"language":"en","primary_location":{"id":"doi:10.23919/mva65244.2025.11175126","is_oa":false,"landing_page_url":"https://doi.org/10.23919/mva65244.2025.11175126","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 19th International Conference on Machine Vision and Applications (MVA)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5119739901","display_name":"Yusaku Nakanose","orcid":null},"institutions":[{"id":"https://openalex.org/I4387152983","display_name":"Osaka Metropolitan University","ror":"https://ror.org/01hvx5h04","country_code":null,"type":"education","lineage":["https://openalex.org/I4387152983"]},{"id":"https://openalex.org/I69740276","display_name":"Tokyo Metropolitan University","ror":"https://ror.org/00ws30h19","country_code":"JP","type":"education","lineage":["https://openalex.org/I69740276"]}],"countries":["JP"],"is_corresponding":true,"raw_author_name":"Yusaku Nakanose","raw_affiliation_strings":["Osaka Metropolitan University,Graduate School of Informatics,Osaka,Japan"],"affiliations":[{"raw_affiliation_string":"Osaka Metropolitan University,Graduate School of Informatics,Osaka,Japan","institution_ids":["https://openalex.org/I69740276","https://openalex.org/I4387152983"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5035539005","display_name":"C. Renato C. Nakagawa","orcid":"https://orcid.org/0000-0002-0432-4294"},"institutions":[{"id":"https://openalex.org/I69740276","display_name":"Tokyo Metropolitan University","ror":"https://ror.org/00ws30h19","country_code":"JP","type":"education","lineage":["https://openalex.org/I69740276"]},{"id":"https://openalex.org/I4387152983","display_name":"Osaka Metropolitan University","ror":"https://ror.org/01hvx5h04","country_code":null,"type":"education","lineage":["https://openalex.org/I4387152983"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Chito Nakagawa","raw_affiliation_strings":["Osaka Metropolitan University,Graduate School of Informatics,Osaka,Japan"],"affiliations":[{"raw_affiliation_string":"Osaka Metropolitan University,Graduate School of Informatics,Osaka,Japan","institution_ids":["https://openalex.org/I69740276","https://openalex.org/I4387152983"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5027332963","display_name":"Katsufumi Inoue","orcid":"https://orcid.org/0000-0001-6073-7264"},"institutions":[{"id":"https://openalex.org/I4387152983","display_name":"Osaka Metropolitan University","ror":"https://ror.org/01hvx5h04","country_code":null,"type":"education","lineage":["https://openalex.org/I4387152983"]},{"id":"https://openalex.org/I69740276","display_name":"Tokyo Metropolitan University","ror":"https://ror.org/00ws30h19","country_code":"JP","type":"education","lineage":["https://openalex.org/I69740276"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Katsufumi Inoue","raw_affiliation_strings":["Osaka Metropolitan University,Graduate School of Informatics,Osaka,Japan"],"affiliations":[{"raw_affiliation_string":"Osaka Metropolitan University,Graduate School of Informatics,Osaka,Japan","institution_ids":["https://openalex.org/I69740276","https://openalex.org/I4387152983"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5108603127","display_name":"Michifumi Yoshioka","orcid":null},"institutions":[{"id":"https://openalex.org/I69740276","display_name":"Tokyo Metropolitan University","ror":"https://ror.org/00ws30h19","country_code":"JP","type":"education","lineage":["https://openalex.org/I69740276"]},{"id":"https://openalex.org/I4387152983","display_name":"Osaka Metropolitan University","ror":"https://ror.org/01hvx5h04","country_code":null,"type":"education","lineage":["https://openalex.org/I4387152983"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Michifumi Yoshioka","raw_affiliation_strings":["Osaka Metropolitan University,Graduate School of Informatics,Osaka,Japan"],"affiliations":[{"raw_affiliation_string":"Osaka Metropolitan University,Graduate School of Informatics,Osaka,Japan","institution_ids":["https://openalex.org/I69740276","https://openalex.org/I4387152983"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5119739901"],"corresponding_institution_ids":["https://openalex.org/I4387152983","https://openalex.org/I69740276"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.34640214,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9979000091552734,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9979000091552734,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11349","display_name":"Music Technology and Sound Studies","score":0.9977999925613403,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12290","display_name":"Human Motion and Animation","score":0.9805999994277954,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.4860000014305115},{"id":"https://openalex.org/keywords/temporal-resolution","display_name":"Temporal resolution","score":0.450300008058548},{"id":"https://openalex.org/keywords/dynamics","display_name":"Dynamics (music)","score":0.4059000015258789},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.3849000036716461},{"id":"https://openalex.org/keywords/binary-number","display_name":"Binary number","score":0.3846000134944916},{"id":"https://openalex.org/keywords/temporal-database","display_name":"Temporal database","score":0.3732999861240387},{"id":"https://openalex.org/keywords/musical","display_name":"Musical","score":0.3621000051498413},{"id":"https://openalex.org/keywords/musical-instrument","display_name":"Musical instrument","score":0.34790000319480896}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7570000290870667},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.569599986076355},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5026999711990356},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.4860000014305115},{"id":"https://openalex.org/C119666444","wikidata":"https://www.wikidata.org/wiki/Q5977280","display_name":"Temporal resolution","level":2,"score":0.450300008058548},{"id":"https://openalex.org/C145912823","wikidata":"https://www.wikidata.org/wiki/Q113558","display_name":"Dynamics (music)","level":2,"score":0.4059000015258789},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.3849000036716461},{"id":"https://openalex.org/C48372109","wikidata":"https://www.wikidata.org/wiki/Q3913","display_name":"Binary number","level":2,"score":0.3846000134944916},{"id":"https://openalex.org/C77277458","wikidata":"https://www.wikidata.org/wiki/Q1969246","display_name":"Temporal database","level":2,"score":0.3732999861240387},{"id":"https://openalex.org/C558565934","wikidata":"https://www.wikidata.org/wiki/Q2743","display_name":"Musical","level":2,"score":0.3621000051498413},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.36079999804496765},{"id":"https://openalex.org/C2983311337","wikidata":"https://www.wikidata.org/wiki/Q34379","display_name":"Musical instrument","level":2,"score":0.34790000319480896},{"id":"https://openalex.org/C45262634","wikidata":"https://www.wikidata.org/wiki/Q5159291","display_name":"Conditioning","level":2,"score":0.334199994802475},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.30059999227523804},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.2874999940395355},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.28630000352859497},{"id":"https://openalex.org/C88485024","wikidata":"https://www.wikidata.org/wiki/Q1054571","display_name":"Cepstrum","level":2,"score":0.28200000524520874},{"id":"https://openalex.org/C25016198","wikidata":"https://www.wikidata.org/wiki/Q781833","display_name":"Temporal logic","level":2,"score":0.272599995136261},{"id":"https://openalex.org/C172081034","wikidata":"https://www.wikidata.org/wiki/Q185961","display_name":"Time perception","level":3,"score":0.2694999873638153},{"id":"https://openalex.org/C177454536","wikidata":"https://www.wikidata.org/wiki/Q578290","display_name":"Emphasis (telecommunications)","level":2,"score":0.2667999863624573},{"id":"https://openalex.org/C2776449333","wikidata":"https://www.wikidata.org/wiki/Q7928781","display_name":"View synthesis","level":3,"score":0.26669999957084656}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.23919/mva65244.2025.11175126","is_oa":false,"landing_page_url":"https://doi.org/10.23919/mva65244.2025.11175126","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 19th International Conference on Machine Vision and Applications (MVA)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":8,"referenced_works":["https://openalex.org/W1901129140","https://openalex.org/W2759171953","https://openalex.org/W2963066677","https://openalex.org/W2963073614","https://openalex.org/W2963092440","https://openalex.org/W3097792222","https://openalex.org/W3115255359","https://openalex.org/W3160431667"],"related_works":[],"abstract_inverted_index":{"This":[0],"paper":[1],"presents":[2],"a":[3,75],"novel":[4],"approach":[5,80,98],"for":[6,16],"generating":[7],"realistic":[8,119],"performance":[9,120],"videos":[10,121],"from":[11],"instrumental":[12],"sounds.":[13],"Previous":[14],"methods":[15,102],"audio-to-visual":[17],"generation":[18,77],"only":[19],"produce":[20],"static":[21],"images,":[22],"failing":[23],"to":[24],"capture":[25],"the":[26,60,92,116,125],"temporal":[27,41,57,66,88,109],"dynamics":[28],"inherent":[29],"in":[30,59,103],"musical":[31,129],"performances.":[32,130],"We":[33],"address":[34],"this":[35],"limitation":[36],"by":[37],"introducing":[38],"two":[39],"complementary":[40],"conditioning":[42],"mechanisms:":[43],"Time":[44,49],"Segment":[45,50],"Emphasis":[46],"(TSE)":[47],"and":[48,108],"Label":[51],"(TSL).":[52],"TSE":[53],"selectively":[54],"amplifies":[55],"critical":[56],"features":[58],"mel-spectrogram,":[61],"while":[62],"TSL":[63],"provides":[64],"explicit":[65],"guidance":[67],"through":[68],"binary":[69],"masks.":[70],"By":[71],"extending":[72],"CAR-GAN":[73],"into":[74],"multi-frame":[76],"system,":[78],"our":[79,97],"simultaneously":[81],"produces":[82],"four":[83],"consecutive":[84],"frames":[85],"that":[86,96,122],"maintain":[87],"coherence.":[89],"Experiments":[90],"on":[91],"URMP":[93],"dataset":[94],"demonstrate":[95],"significantly":[99],"outperforms":[100],"existing":[101],"both":[104],"visual":[105],"quality":[106],"(FID)":[107],"consistency":[110],"(FVD)":[111],"across":[112],"various":[113],"instruments,":[114],"enabling":[115],"synthesis":[117],"of":[118,128],"accurately":[123],"reflect":[124],"dynamic":[126],"nature":[127]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-10-10T00:00:00"}
