{"id":"https://openalex.org/W7148343048","doi":"https://doi.org/10.1109/asru65441.2025.11434696","title":"ULTRAS - Unified Learning of Transformer Representations for Audio and Speech Signals","display_name":"ULTRAS - Unified Learning of Transformer Representations for Audio and Speech Signals","publication_year":2025,"publication_date":"2025-12-06","ids":{"openalex":"https://openalex.org/W7148343048","doi":"https://doi.org/10.1109/asru65441.2025.11434696"},"language":null,"primary_location":{"id":"doi:10.1109/asru65441.2025.11434696","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru65441.2025.11434696","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5132820377","display_name":"P E Ameenudeen","orcid":null},"institutions":[{"id":"https://openalex.org/I59270414","display_name":"Indian Institute of Science Bangalore","ror":"https://ror.org/04dese585","country_code":"IN","type":"education","lineage":["https://openalex.org/I59270414"]}],"countries":["IN"],"is_corresponding":true,"raw_author_name":"P E Ameenudeen","raw_affiliation_strings":["Indian Institute of Science,LEAP Laboratory Electrical Engineering,Bangalore,India"],"affiliations":[{"raw_affiliation_string":"Indian Institute of Science,LEAP Laboratory Electrical Engineering,Bangalore,India","institution_ids":["https://openalex.org/I59270414"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5110157094","display_name":"C. K. Narayanan","orcid":null},"institutions":[{"id":"https://openalex.org/I59270414","display_name":"Indian Institute of Science Bangalore","ror":"https://ror.org/04dese585","country_code":"IN","type":"education","lineage":["https://openalex.org/I59270414"]}],"countries":["IN"],"is_corresponding":false,"raw_author_name":"Charumathi Narayanan","raw_affiliation_strings":["Indian Institute of Science,LEAP Laboratory Electrical Engineering,Bangalore,India"],"affiliations":[{"raw_affiliation_string":"Indian Institute of Science,LEAP Laboratory Electrical Engineering,Bangalore,India","institution_ids":["https://openalex.org/I59270414"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5002536077","display_name":"Sriram Ganapathy","orcid":"https://orcid.org/0000-0002-5779-9066"},"institutions":[{"id":"https://openalex.org/I59270414","display_name":"Indian Institute of Science Bangalore","ror":"https://ror.org/04dese585","country_code":"IN","type":"education","lineage":["https://openalex.org/I59270414"]}],"countries":["IN"],"is_corresponding":false,"raw_author_name":"Sriram Ganapathy","raw_affiliation_strings":["Indian Institute of Science,LEAP Laboratory Electrical Engineering,Bangalore,India"],"affiliations":[{"raw_affiliation_string":"Indian Institute of Science,LEAP Laboratory Electrical Engineering,Bangalore,India","institution_ids":["https://openalex.org/I59270414"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5132820377"],"corresponding_institution_ids":["https://openalex.org/I59270414"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.87554489,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"7"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.4350000023841858,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.4350000023841858,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.3271999955177307,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.0640999972820282,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.5437999963760376},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.29019999504089355},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.2759999930858612},{"id":"https://openalex.org/keywords/speech-synthesis","display_name":"Speech synthesis","score":0.2524000108242035}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5504999756813049},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.5437999963760376},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.49070000648498535},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.39100000262260437},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.31470000743865967},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.29019999504089355},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.2888000011444092},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.2759999930858612},{"id":"https://openalex.org/C14999030","wikidata":"https://www.wikidata.org/wiki/Q16346","display_name":"Speech synthesis","level":2,"score":0.2524000108242035},{"id":"https://openalex.org/C104267543","wikidata":"https://www.wikidata.org/wiki/Q208163","display_name":"Signal processing","level":3,"score":0.2502000033855438}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/asru65441.2025.11434696","is_oa":false,"landing_page_url":"https://doi.org/10.1109/asru65441.2025.11434696","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":17,"referenced_works":["https://openalex.org/W1494198834","https://openalex.org/W2038484192","https://openalex.org/W2052666245","https://openalex.org/W2146334809","https://openalex.org/W2593116425","https://openalex.org/W2726515241","https://openalex.org/W3094502228","https://openalex.org/W3196974791","https://openalex.org/W3197580070","https://openalex.org/W3206996142","https://openalex.org/W3209059054","https://openalex.org/W4313156423","https://openalex.org/W4376481237","https://openalex.org/W4390738640","https://openalex.org/W4394862844","https://openalex.org/W4402716129","https://openalex.org/W4415433942"],"related_works":[],"abstract_inverted_index":{"Self-supervised":[0],"learning":[1,18],"(SSL)":[2],"has":[3],"driven":[4],"impressive":[5],"advances":[6],"in":[7],"speech":[8,114],"processing":[9],"by":[10],"adopting":[11],"time-domain":[12],"prediction":[13],"objectives,":[14],"while":[15],"audio":[16,116],"representation":[17],"frameworks":[19],"operate":[20],"on":[21,71,89,110],"time-frequency":[22],"spectrograms.":[23],"Models":[24],"optimized":[25],"for":[26,38,49],"one":[27],"paradigm":[28],"struggle":[29],"to":[30,32,101],"transfer":[31],"the":[33,36,55,66,72,99,122],"other,":[34],"highlighting":[35],"need":[37],"a":[39,95,111],"joint":[40],"framework.":[41],"We":[42],"propose":[43],"Unified":[44],"Learning":[45],"of":[46,65,77,84,113],"Transformer":[47],"Representations":[48],"Audio":[50],"and":[51,57,91,104,115],"Speech":[52],"(ULTRAS),":[53],"where":[54,118],"masking":[56],"predictive":[58,82],"modeling":[59,83],"is":[60,87],"performed":[61,88,109],"over":[62,128],"long":[63],"patches":[64],"data.":[67],"The":[68,81],"model,":[69],"based":[70],"transformer":[73],"architecture,":[74],"encodes":[75],"spectral-patches":[76],"log-mel":[78],"spectrogram":[79],"features.":[80],"masked":[85],"segments":[86],"spectral":[90],"temporal":[92],"targets":[93],"using":[94],"combined":[96],"loss-function,":[97],"forcing":[98],"representations":[100],"encode":[102],"time":[103],"frequency":[105],"traits.":[106],"Experiments":[107],"are":[108],"variety":[112],"tasks,":[117],"we":[119],"illustrate":[120],"that":[121],"ULTRAS":[123],"framework":[124],"achieves":[125],"improved":[126],"performance":[127],"other":[129],"established":[130],"baselines.":[131]},"counts_by_year":[],"updated_date":"2026-04-03T16:44:17.987007","created_date":"2026-04-03T00:00:00"}
