{"id":"https://openalex.org/W7164807634","doi":"https://doi.org/10.1145/3805622.3810789","title":"From Physics to Representation: Audio Learning with Synthetic Pre-training via Procedural Generation","display_name":"From Physics to Representation: Audio Learning with Synthetic Pre-training via Procedural Generation","publication_year":2026,"publication_date":"2026-06-15","ids":{"openalex":"https://openalex.org/W7164807634","doi":"https://doi.org/10.1145/3805622.3810789"},"language":null,"primary_location":{"id":"doi:10.1145/3805622.3810789","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810789","pdf_url":null,"source":null,"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"type":"article","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3805622.3810789","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5027115115","display_name":"F Liu","orcid":null},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]},{"id":"https://openalex.org/I21945476","display_name":"Shanghai Normal University","ror":"https://ror.org/01cxqmw89","country_code":"CN","type":"education","lineage":["https://openalex.org/I21945476"]},{"id":"https://openalex.org/I66867065","display_name":"East China Normal University","ror":"https://ror.org/02n96ep67","country_code":"CN","type":"education","lineage":["https://openalex.org/I66867065"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Fengrui Liu","raw_affiliation_strings":["School of Computer Science and Technology, East China Normal University, shanghai, shanghai, China","School of Psychology, Shanghai Jiao Tong University, shanghai, shanghai, China"],"raw_orcid":"https://orcid.org/0009-0006-5646-990X","affiliations":[{"raw_affiliation_string":"School of Computer Science and Technology, East China Normal University, shanghai, shanghai, China","institution_ids":["https://openalex.org/I66867065","https://openalex.org/I21945476"]},{"raw_affiliation_string":"School of Psychology, Shanghai Jiao Tong University, shanghai, shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5138683783","display_name":"Ruiyang Huang","orcid":"https://orcid.org/0009-0007-0178-8572"},"institutions":[{"id":"https://openalex.org/I4210090971","display_name":"Southeast University","ror":"https://ror.org/00cf0ab87","country_code":"BD","type":"education","lineage":["https://openalex.org/I4210090971"]},{"id":"https://openalex.org/I76569877","display_name":"Southeast University","ror":"https://ror.org/04ct4d772","country_code":"CN","type":"education","lineage":["https://openalex.org/I76569877"]}],"countries":["BD","CN"],"is_corresponding":false,"raw_author_name":"Ruiyang Huang","raw_affiliation_strings":["School of Cyber Science and Engineering, Southeast University, nanjing, China"],"raw_orcid":"https://orcid.org/0009-0007-0178-8572","affiliations":[{"raw_affiliation_string":"School of Cyber Science and Engineering, Southeast University, nanjing, China","institution_ids":["https://openalex.org/I4210090971","https://openalex.org/I76569877"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5138645921","display_name":"Qijian Zheng","orcid":"https://orcid.org/0000-0002-6462-5400"},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Qijian Zheng","raw_affiliation_strings":["College of Computer Science and Artificial Intelligence, Fudan University, shanghai, shanghai, China"],"raw_orcid":"https://orcid.org/0000-0002-6462-5400","affiliations":[{"raw_affiliation_string":"College of Computer Science and Artificial Intelligence, Fudan University, shanghai, shanghai, China","institution_ids":["https://openalex.org/I24943067"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5077451740","display_name":"Yuanfang Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuanfang Wang","raw_affiliation_strings":["Global College, Shanghai Jiao Tong University, shanghai, shanghai, China"],"raw_orcid":"https://orcid.org/0009-0006-8774-4732","affiliations":[{"raw_affiliation_string":"Global College, Shanghai Jiao Tong University, shanghai, shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5085891781","display_name":"Feng Liu","orcid":"https://orcid.org/0000-0002-5289-5761"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]},{"id":"https://openalex.org/I21945476","display_name":"Shanghai Normal University","ror":"https://ror.org/01cxqmw89","country_code":"CN","type":"education","lineage":["https://openalex.org/I21945476"]},{"id":"https://openalex.org/I66867065","display_name":"East China Normal University","ror":"https://ror.org/02n96ep67","country_code":"CN","type":"education","lineage":["https://openalex.org/I66867065"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Feng Liu","raw_affiliation_strings":["School of Computer Science and Technology, East China Normal University, shanghai, shanghai, China","School of Psychology, Shanghai Jiao Tong University, shanghai, shanghai, China"],"raw_orcid":"https://orcid.org/0000-0002-5289-5761","affiliations":[{"raw_affiliation_string":"School of Computer Science and Technology, East China Normal University, shanghai, shanghai, China","institution_ids":["https://openalex.org/I66867065","https://openalex.org/I21945476"]},{"raw_affiliation_string":"School of Psychology, Shanghai Jiao Tong University, shanghai, shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.94100701,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"595","last_page":"604"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.45669999718666077,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.45669999718666077,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.31139999628067017,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.0478999987244606,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/autoencoder","display_name":"Autoencoder","score":0.7419000267982483},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.6743999719619751},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.5781999826431274},{"id":"https://openalex.org/keywords/audio-signal","display_name":"Audio signal","score":0.5116000175476074},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.5072000026702881},{"id":"https://openalex.org/keywords/space","display_name":"Space (punctuation)","score":0.4975000023841858},{"id":"https://openalex.org/keywords/signal","display_name":"SIGNAL (programming language)","score":0.42730000615119934},{"id":"https://openalex.org/keywords/feature-learning","display_name":"Feature learning","score":0.39010000228881836},{"id":"https://openalex.org/keywords/audio-mining","display_name":"Audio mining","score":0.3465000092983246}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7573999762535095},{"id":"https://openalex.org/C101738243","wikidata":"https://www.wikidata.org/wiki/Q786435","display_name":"Autoencoder","level":3,"score":0.7419000267982483},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.6743999719619751},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.621999979019165},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.5781999826431274},{"id":"https://openalex.org/C64922751","wikidata":"https://www.wikidata.org/wiki/Q4650799","display_name":"Audio signal","level":3,"score":0.5116000175476074},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.5072000026702881},{"id":"https://openalex.org/C2778572836","wikidata":"https://www.wikidata.org/wiki/Q380933","display_name":"Space (punctuation)","level":2,"score":0.4975000023841858},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4332999885082245},{"id":"https://openalex.org/C2779843651","wikidata":"https://www.wikidata.org/wiki/Q7390335","display_name":"SIGNAL (programming language)","level":2,"score":0.42730000615119934},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.42329999804496765},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.39010000228881836},{"id":"https://openalex.org/C157968479","wikidata":"https://www.wikidata.org/wiki/Q3079876","display_name":"Audio mining","level":4,"score":0.3465000092983246},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.34310001134872437},{"id":"https://openalex.org/C13895895","wikidata":"https://www.wikidata.org/wiki/Q3270773","display_name":"Speech coding","level":2,"score":0.33649998903274536},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.3255000114440918},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.3093000054359436},{"id":"https://openalex.org/C2778488704","wikidata":"https://www.wikidata.org/wiki/Q15190726","display_name":"Audio equipment","level":2,"score":0.3034999966621399},{"id":"https://openalex.org/C160372630","wikidata":"https://www.wikidata.org/wiki/Q4819855","display_name":"Audio analyzer","level":5,"score":0.2971000075340271},{"id":"https://openalex.org/C134400042","wikidata":"https://www.wikidata.org/wiki/Q2372244","display_name":"Symbol (formal)","level":2,"score":0.2897000014781952},{"id":"https://openalex.org/C127220857","wikidata":"https://www.wikidata.org/wiki/Q2719318","display_name":"Audio signal processing","level":4,"score":0.28630000352859497},{"id":"https://openalex.org/C128422554","wikidata":"https://www.wikidata.org/wiki/Q20077126","display_name":"Sound recording and reproduction","level":2,"score":0.2703000009059906},{"id":"https://openalex.org/C167940747","wikidata":"https://www.wikidata.org/wiki/Q63727227","display_name":"Audio signal flow","level":5,"score":0.2689000070095062},{"id":"https://openalex.org/C2776141515","wikidata":"https://www.wikidata.org/wiki/Q1274479","display_name":"Repetition (rhetorical device)","level":2,"score":0.2623000144958496},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.26019999384880066},{"id":"https://openalex.org/C197424946","wikidata":"https://www.wikidata.org/wiki/Q1165717","display_name":"Waveform","level":3,"score":0.26019999384880066},{"id":"https://openalex.org/C190839683","wikidata":"https://www.wikidata.org/wiki/Q2448197","display_name":"Train","level":2,"score":0.25839999318122864},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.2581000030040741},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.25200000405311584}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1145/3805622.3810789","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810789","pdf_url":null,"source":null,"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2606.14791","is_oa":true,"landing_page_url":"https://arxiv.org/abs/2606.14791","pdf_url":"https://arxiv.org/pdf/2606.14791","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"}],"best_oa_location":{"id":"doi:10.1145/3805622.3810789","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3805622.3810789","pdf_url":null,"source":null,"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2026 International Conference on Multimedia Retrieval","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G4877870672","display_name":null,"funder_award_id":"202610269116G","funder_id":"https://openalex.org/F4320322999","funder_display_name":"Shanghai Jiao Tong University"},{"id":"https://openalex.org/G8762691367","display_name":null,"funder_award_id":"T541PRP49003","funder_id":"https://openalex.org/F4320322999","funder_display_name":"Shanghai Jiao Tong University"},{"id":"https://openalex.org/G8791591342","display_name":null,"funder_award_id":"25X010506040","funder_id":"https://openalex.org/F4320322999","funder_display_name":"Shanghai Jiao Tong University"}],"funders":[{"id":"https://openalex.org/F4320322999","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":36,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W2038484192","https://openalex.org/W2052666245","https://openalex.org/W2092939357","https://openalex.org/W2163922914","https://openalex.org/W2509065397","https://openalex.org/W2593116425","https://openalex.org/W2605102758","https://openalex.org/W2771361008","https://openalex.org/W2936774411","https://openalex.org/W2973042454","https://openalex.org/W2973049979","https://openalex.org/W3016970897","https://openalex.org/W3041561163","https://openalex.org/W3094550259","https://openalex.org/W3162391496","https://openalex.org/W3196974791","https://openalex.org/W3205475937","https://openalex.org/W3206996142","https://openalex.org/W3209059054","https://openalex.org/W3209984917","https://openalex.org/W4205689591","https://openalex.org/W4297841790","https://openalex.org/W4297841853","https://openalex.org/W4310873011","https://openalex.org/W4313156423","https://openalex.org/W4380434618","https://openalex.org/W4386075504","https://openalex.org/W4390738640","https://openalex.org/W4392979802","https://openalex.org/W4408355740","https://openalex.org/W4413157291","https://openalex.org/W4413979917","https://openalex.org/W4416251029","https://openalex.org/W7138238529","https://openalex.org/W7155075836"],"related_works":[],"abstract_inverted_index":{"Self-supervised":[0],"learning":[1],"advances":[2],"audio":[3,37,64],"representation":[4],"for":[5],"multimedia":[6],"analysis.":[7],"However,":[8],"prevailing":[9],"data-centric":[10],"approaches":[11],"rely":[12],"on":[13,47,69,73,76,80,91],"massive":[14],"real-world":[15],"corpora,":[16],"increasing":[17],"training":[18],"costs,":[19],"curation":[20],"burdens,":[21],"and":[22,55,78,104],"privacy":[23],"barriers.":[24],"To":[25],"address":[26],"this,":[27],"we":[28],"present":[29],"AudioPG,":[30],"a":[31,43,92],"procedural":[32,118],"synthesis":[33,119],"framework":[34],"eliminating":[35],"real":[36,63],"recordings":[38],"during":[39],"pre-training.":[40],"AudioPG":[41],"trains":[42],"Transformer-based":[44],"masked":[45],"autoencoder":[46],"waveforms":[48],"generated":[49],"on-the-fly":[50],"from":[51],"basic":[52],"acoustic":[53],"primitives":[54],"composition":[56],"rules.":[57],"The":[58],"encoder":[59],"transfers":[60],"effectively":[61],"to":[62],"benchmarks,":[65],"achieving":[66],"90.60%":[67],"accuracy":[68],"ESC-50,":[70],"0.546":[71],"mAP":[72],"FSD50K,":[74],"88.17%":[75],"UrbanSound8K,":[77],"97.03%":[79],"Speech":[81],"Commands":[82],"V2.":[83],"Notably,":[84],"pre-training":[85,124],"completes":[86],"in":[87,108],"under":[88],"20":[89],"minutes":[90],"single":[93],"GPU.":[94],"Latent":[95],"space":[96],"analysis":[97],"reveals":[98],"physical":[99],"factors,":[100],"including":[101],"fundamental":[102],"frequency":[103],"relative":[105],"intensity,":[106],"emerge":[107],"orthogonal":[109],"subspaces,":[110],"making":[111],"representations":[112],"linearly":[113],"decodable.":[114],"These":[115],"results":[116],"establish":[117],"as":[120],"an":[121],"efficient,":[122],"interpretable":[123],"signal":[125],"when":[126],"large-scale":[127],"corpora":[128],"are":[129],"unavailable.":[130],"Our":[131],"code":[132],"is":[133],"available":[134],"at:":[135],"https://github.com/Freyliu0516/audioPG.":[136]},"counts_by_year":[],"updated_date":"2026-06-18T10:00:31.954636","created_date":"2026-06-16T00:00:00"}
