{"id":"https://openalex.org/W7138235146","doi":"https://doi.org/10.48550/arxiv.2603.13262","title":"Evaluation of Audio Language Models for Fairness, Safety, and Security","display_name":"Evaluation of Audio Language Models for Fairness, Safety, and Security","publication_year":2026,"publication_date":"2026-02-25","ids":{"openalex":"https://openalex.org/W7138235146","doi":"https://doi.org/10.48550/arxiv.2603.13262"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.13262","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.13262","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.13262","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5058078120","display_name":"Ranya Aloufi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Aloufi, Ranya","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129669151","display_name":"Srishti Gupta","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gupta, Srishti","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5066031323","display_name":"Soumya Shaw","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shaw, Soumya","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129747265","display_name":"Battista Biggio","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Biggio, Battista","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5049219646","display_name":"Lea Sch\u00f6nherr","orcid":"https://orcid.org/0000-0003-3779-7781"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sch\u00f6nherr, Lea","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.2094999998807907,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.2094999998807907,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.19470000267028809,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12262","display_name":"Hate Speech and Cyberbullying Detection","score":0.07779999822378159,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/paralanguage","display_name":"Paralanguage","score":0.6194999814033508},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.5360000133514404},{"id":"https://openalex.org/keywords/representation","display_name":"Representation (politics)","score":0.3352999985218048},{"id":"https://openalex.org/keywords/adversarial-system","display_name":"Adversarial system","score":0.3240000009536743},{"id":"https://openalex.org/keywords/spoken-language","display_name":"Spoken language","score":0.32260000705718994},{"id":"https://openalex.org/keywords/taxonomy","display_name":"Taxonomy (biology)","score":0.30000001192092896},{"id":"https://openalex.org/keywords/formalism","display_name":"Formalism (music)","score":0.2793999910354614},{"id":"https://openalex.org/keywords/conflation","display_name":"Conflation","score":0.2786000072956085}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7695000171661377},{"id":"https://openalex.org/C133378560","wikidata":"https://www.wikidata.org/wiki/Q1753225","display_name":"Paralanguage","level":2,"score":0.6194999814033508},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5900999903678894},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.5360000133514404},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4821000099182129},{"id":"https://openalex.org/C2776359362","wikidata":"https://www.wikidata.org/wiki/Q2145286","display_name":"Representation (politics)","level":3,"score":0.3352999985218048},{"id":"https://openalex.org/C37736160","wikidata":"https://www.wikidata.org/wiki/Q1801315","display_name":"Adversarial system","level":2,"score":0.3240000009536743},{"id":"https://openalex.org/C2776230583","wikidata":"https://www.wikidata.org/wiki/Q1322198","display_name":"Spoken language","level":2,"score":0.32260000705718994},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.31679999828338623},{"id":"https://openalex.org/C58642233","wikidata":"https://www.wikidata.org/wiki/Q8269924","display_name":"Taxonomy (biology)","level":2,"score":0.30000001192092896},{"id":"https://openalex.org/C73301696","wikidata":"https://www.wikidata.org/wiki/Q5469984","display_name":"Formalism (music)","level":3,"score":0.2793999910354614},{"id":"https://openalex.org/C130440534","wikidata":"https://www.wikidata.org/wiki/Q14946528","display_name":"Conflation","level":2,"score":0.2786000072956085},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.2754000127315521},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.26649999618530273},{"id":"https://openalex.org/C161301231","wikidata":"https://www.wikidata.org/wiki/Q3478658","display_name":"Knowledge representation and reasoning","level":2,"score":0.266400009393692},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.2630999982357025},{"id":"https://openalex.org/C127220857","wikidata":"https://www.wikidata.org/wiki/Q2719318","display_name":"Audio signal processing","level":4,"score":0.2615000009536743},{"id":"https://openalex.org/C129792486","wikidata":"https://www.wikidata.org/wiki/Q1050419","display_name":"Language identification","level":3,"score":0.2590999901294708},{"id":"https://openalex.org/C11693617","wikidata":"https://www.wikidata.org/wiki/Q181839","display_name":"Pragmatics","level":2,"score":0.2581999897956848},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.25450000166893005},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.25130000710487366}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.13262","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.13262","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.13262","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.13262","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions","score":0.6975803971290588}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Audio":[0],"large":[1,15],"language":[2,16,190],"models":[3],"(ALLMs)":[4],"have":[5],"recently":[6],"advanced":[7],"spoken":[8],"interaction":[9],"by":[10],"integrating":[11],"speech":[12],"processing":[13],"with":[14],"models.":[17,191],"However,":[18],"existing":[19],"evaluations":[20,54],"of":[21,81,91,102,188],"fairness,":[22],"safety,":[23],"and":[24,40,66,79,99,128,134,148,157,161],"security":[25],"(FSS)":[26],"remain":[27],"fragmented,":[28],"largely":[29],"because":[30],"ALLMs":[31,82],"differ":[32],"fundamentally":[33],"in":[34,152],"how":[35,174],"acoustic":[36,175],"information":[37,176],"is":[38,170,177],"represented":[39],"where":[41],"semantic":[42,103,122,180],"reasoning":[43,104],"occurs.":[44],"Differences":[45],"that":[46,83,120,167],"are":[47],"rarely":[48],"made":[49],"explicit.":[50],"As":[51],"a":[52,75,116],"result,":[53],"often":[55],"conflate":[56],"structurally":[57],"distinct":[58],"systems,":[59],"obscuring":[60],"the":[61,89,100,112,183],"relationship":[62],"between":[63,159],"model":[64],"design":[65],"observed":[67],"FSS":[68,168],"behavior.":[69],"In":[70],"this":[71,142],"work,":[72],"we":[73,114],"introduce":[74],"structural":[76],"taxonomy":[77],"(system-level":[78],"representational)":[80],"categorizes":[84],"systems":[85,147],"along":[86],"two":[87,145],"axes:":[88],"form":[90],"audio":[92,138,160,189],"input":[93],"representation":[94],"(e.g.,":[95,105],"discrete":[96],"vs.":[97],"continuous)":[98],"locus":[101],"cascaded,":[106],"multimodal,":[107],"or":[108],"audio-native).":[109],"Building":[110],"on":[111],"taxonomy,":[113],"propose":[115],"unified":[117],"evaluation":[118,187],"framework":[119,143],"assesses":[121],"invariance":[123],"under":[124,131],"paralinguistic":[125],"variation,":[126],"refusal":[127,153],"toxicity":[129,158],"behavior":[130,169],"unsafe":[132],"prompts,":[133],"robustness":[135],"to":[136,144,173],"adversarial":[137],"perturbations.":[139],"We":[140],"apply":[141],"representative":[146],"observe":[149],"systematic":[150],"differences":[151],"rates,":[154],"attack":[155],"success,":[156],"text":[162],"inputs.":[163],"Our":[164],"findings":[165],"demonstrate":[166],"tightly":[171],"coupled":[172],"integrated":[178],"into":[179],"reasoning,":[181],"underscoring":[182],"need":[184],"for":[185],"structure-aware":[186]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-03-18T00:00:00"}
