{"id":"https://openalex.org/W7125129110","doi":"https://doi.org/10.48550/arxiv.2601.12594","title":"SLAP: Scalable Language-Audio Pretraining with Variable-Duration Audio and Multi-Objective Training","display_name":"SLAP: Scalable Language-Audio Pretraining with Variable-Duration Audio and Multi-Objective Training","publication_year":2026,"publication_date":"2026-01-18","ids":{"openalex":"https://openalex.org/W7125129110","doi":"https://doi.org/10.48550/arxiv.2601.12594"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2601.12594","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.12594","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2601.12594","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5123517070","display_name":"Xinhao Mei","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Mei, Xinhao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123520119","display_name":"Gael Le Lan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lan, Gael Le","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123525533","display_name":"Haohe Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Haohe","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5031088292","display_name":"Zhaoheng Ni","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ni, Zhaoheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5034208747","display_name":"Varun Nagaraja","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Nagaraja, Varun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123526145","display_name":"Yang Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Yang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123522018","display_name":"Yangyang Shi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shi, Yangyang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5123509245","display_name":"Vikas Chandra","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chandra, Vikas","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5123517070"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.7265999913215637,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.7265999913215637,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.11940000206232071,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.09780000150203705,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.6263999938964844},{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.5159000158309937},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.48730000853538513},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.42419999837875366},{"id":"https://openalex.org/keywords/scheme","display_name":"Scheme (mathematics)","score":0.41260001063346863},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.38359999656677246}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7944999933242798},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.6263999938964844},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5254999995231628},{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.5159000158309937},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.48730000853538513},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4345000088214874},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.42419999837875366},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.41260001063346863},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.38359999656677246},{"id":"https://openalex.org/C64922751","wikidata":"https://www.wikidata.org/wiki/Q4650799","display_name":"Audio signal","level":3,"score":0.34700000286102295},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.33970001339912415},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.32109999656677246},{"id":"https://openalex.org/C182365436","wikidata":"https://www.wikidata.org/wiki/Q50701","display_name":"Variable (mathematics)","level":2,"score":0.31439998745918274},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2653000056743622},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.260699987411499}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2601.12594","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.12594","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2601.12594","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.12594","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.7200863361358643}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Contrastive":[0],"language-audio":[1,99],"pretraining":[2,100],"(CLAP)":[3],"has":[4],"achieved":[5],"notable":[6],"success":[7],"in":[8,61,125],"learning":[9,81,131],"semantically":[10],"rich":[11],"audio":[12,44,85,108,135,150],"representations":[13],"and":[14,54,110,122,148],"is":[15],"widely":[16],"adopted":[17],"for":[18],"various":[19],"audio-related":[20],"tasks.":[21],"However,":[22],"current":[23],"CLAP":[24,48],"models":[25,49],"face":[26],"several":[27],"key":[28],"limitations.":[29],"First,":[30],"they":[31],"are":[32,50],"typically":[33],"trained":[34],"on":[35,74,145],"relatively":[36],"small":[37],"datasets,":[38],"often":[39],"comprising":[40],"a":[41,126],"few":[42],"million":[43,103],"samples.":[45],"Second,":[46],"existing":[47],"restricted":[51],"to":[52,101],"short":[53],"fixed":[55],"duration,":[56],"which":[57,77,97],"constrains":[58],"their":[59],"usage":[60],"real-world":[62],"scenarios":[63],"with":[64,106,119],"variable-duration":[65],"audio.":[66],"Third,":[67],"the":[68,80,130],"standard":[69],"contrastive":[70,117],"training":[71,113],"objective":[72],"operates":[73],"global":[75],"representations,":[76],"may":[78],"hinder":[79],"of":[82,132],"dense,":[83],"fine-grained":[84],"features.":[86],"To":[87],"address":[88],"these":[89],"challenges,":[90],"we":[91],"introduce":[92],"Scalable":[93],"Language-Audio":[94],"Pretraining":[95],"(SLAP),":[96],"scales":[98],"109":[102],"audio-text":[104,146],"pairs":[105],"variable":[107],"durations":[109],"incorporates":[111],"multiple":[112],"objectives.":[114],"SLAP":[115,139],"unifies":[116],"loss":[118],"additional":[120],"self-supervised":[121],"captioning":[123],"losses":[124],"single-stage":[127],"training,":[128],"facilitating":[129],"richer":[133],"dense":[134],"representations.":[136],"The":[137],"proposed":[138],"model":[140],"achieves":[141],"new":[142],"state-of-the-art":[143],"performance":[144],"retrieval":[147],"zero-shot":[149],"classification":[151],"tasks,":[152],"demonstrating":[153],"its":[154],"effectiveness":[155],"across":[156],"diverse":[157],"benchmarks.":[158]},"counts_by_year":[],"updated_date":"2026-01-22T23:33:04.759266","created_date":"2026-01-22T00:00:00"}
