{"id":"https://openalex.org/W7129726196","doi":"https://doi.org/10.48550/arxiv.2602.14785","title":"SA-SSL-MOS: Self-supervised Learning MOS Prediction with Spectral Augmentation for Generalized Multi-Rate Speech Assessment","display_name":"SA-SSL-MOS: Self-supervised Learning MOS Prediction with Spectral Augmentation for Generalized Multi-Rate Speech Assessment","publication_year":2026,"publication_date":"2026-02-16","ids":{"openalex":"https://openalex.org/W7129726196","doi":"https://doi.org/10.48550/arxiv.2602.14785"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2602.14785","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.14785","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2602.14785","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5126184005","display_name":"Fengyuan Cao","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Cao, Fengyuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5051156480","display_name":"Xinyu Liang","orcid":"https://orcid.org/0000-0003-1314-9042"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liang, Xinyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5092639592","display_name":"Fredrik Cumlin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cumlin, Fredrik","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126242378","display_name":"Victor Ungureanu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ungureanu, Victor","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126210164","display_name":"Chandan K. A. Reddy","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Reddy, Chandan K. A.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5014795917","display_name":"Christian Sch\u00fcldt","orcid":"https://orcid.org/0000-0003-3439-0468"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Schuldt, Christian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5101692633","display_name":"Saikat Chatterjee","orcid":"https://orcid.org/0000-0003-2638-6047"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chatterjee, Saikat","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5126184005"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.6761999726295471,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.6761999726295471,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.23849999904632568,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10667","display_name":"Emotion and Mood Recognition","score":0.015399999916553497,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.6306999921798706},{"id":"https://openalex.org/keywords/sampling","display_name":"Sampling (signal processing)","score":0.6132000088691711},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.5666999816894531},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.4862000048160553},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.40790000557899475},{"id":"https://openalex.org/keywords/voice-activity-detection","display_name":"Voice activity detection","score":0.3871000111103058},{"id":"https://openalex.org/keywords/speech-processing","display_name":"Speech processing","score":0.3508000075817108}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7651000022888184},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.6306999921798706},{"id":"https://openalex.org/C140779682","wikidata":"https://www.wikidata.org/wiki/Q210868","display_name":"Sampling (signal processing)","level":3,"score":0.6132000088691711},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6031000018119812},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.5666999816894531},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5554999709129333},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.4862000048160553},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.412200003862381},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.40790000557899475},{"id":"https://openalex.org/C204201278","wikidata":"https://www.wikidata.org/wiki/Q1332614","display_name":"Voice activity detection","level":3,"score":0.3871000111103058},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.3508000075817108},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.33660000562667847},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.30649998784065247},{"id":"https://openalex.org/C13895895","wikidata":"https://www.wikidata.org/wiki/Q3270773","display_name":"Speech coding","level":2,"score":0.30390000343322754},{"id":"https://openalex.org/C2776182073","wikidata":"https://www.wikidata.org/wiki/Q7575395","display_name":"Speech enhancement","level":3,"score":0.2883000075817108},{"id":"https://openalex.org/C3020001037","wikidata":"https://www.wikidata.org/wiki/Q836575","display_name":"Quality assessment","level":3,"score":0.27140000462532043},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.26759999990463257}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2602.14785","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.14785","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2602.14785","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.14785","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Quality Education","score":0.4705720841884613,"id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Designing":[0],"a":[1,21,33,55,83,98,104,114,123],"speech":[2,13,39,66],"quality":[3],"assessment":[4],"(SQA)":[5],"system":[6],"for":[7,140],"estimating":[8],"mean-opinion-score":[9],"(MOS)":[10],"of":[11,32],"multi-rate":[12,38,125,142,154],"with":[14],"varying":[15],"sampling":[16,75,95],"frequency":[17],"(16-48":[18],"kHz)":[19],"is":[20,58,110,138,156],"challenging":[22],"task.":[23],"The":[24],"challenge":[25],"arises":[26],"due":[27],"to":[28,52,92],"the":[29,108,146],"limited":[30],"availability":[31],"MOS-labeled":[34],"training":[35,106,149],"dataset":[36,118],"comprising":[37],"samples.":[40],"While":[41],"self-supervised":[42],"learning":[43],"(SSL)":[44],"models":[45],"have":[46],"been":[47],"widely":[48],"adopted":[49],"in":[50,73],"SQA":[51],"boost":[53],"performance,":[54],"key":[56],"limitation":[57],"that":[59,87,130,145],"they":[60],"are":[61],"pretrained":[62],"on":[63,113,122],"16":[64],"kHz":[65,94,117],"and":[67,119,144],"therefore":[68],"discard":[69],"high-frequency":[70,89,132],"information":[71,133],"present":[72],"higher":[74],"rates.":[76],"To":[77],"address":[78],"this":[79],"issue,":[80],"we":[81],"propose":[82],"spectrogram-augmented":[84],"SSL":[85,136],"method":[86],"incorporates":[88],"features":[90,137],"(up":[91],"48":[93,116],"rate)":[96],"through":[97],"parallel-branch":[99],"architecture.":[100],"We":[101],"further":[102],"introduce":[103],"two-step":[105,148],"scheme:":[107],"model":[109],"first":[111],"pre-trained":[112],"large":[115],"then":[120],"fine-tuned":[121],"smaller":[124],"dataset.":[126],"Experimental":[127],"results":[128],"show":[129],"leveraging":[131],"overlooked":[134],"by":[135],"crucial":[139],"accurate":[141],"SQA,":[143],"proposed":[147],"substantially":[150],"improves":[151],"generalization":[152],"when":[153],"data":[155],"limited.":[157]},"counts_by_year":[],"updated_date":"2026-02-18T06:25:47.457606","created_date":"2026-02-18T00:00:00"}
