{"id":"https://openalex.org/W7152464020","doi":"https://doi.org/10.48550/arxiv.2604.06356","title":"In-Context Learning in Speech Language Models: Analyzing the Role of Acoustic Features, Linguistic Structure, and Induction Heads","display_name":"In-Context Learning in Speech Language Models: Analyzing the Role of Acoustic Features, Linguistic Structure, and Induction Heads","publication_year":2026,"publication_date":"2026-04-07","ids":{"openalex":"https://openalex.org/W7152464020","doi":"https://doi.org/10.48550/arxiv.2604.06356"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.06356","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.06356","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.06356","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5013578128","display_name":"Charlotte Pouw","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pouw, Charlotte","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5004874606","display_name":"Hosein Mohebbi","orcid":"https://orcid.org/0000-0001-8184-7825"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mohebbi, Hosein","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5047382001","display_name":"Afra Alishahi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Alishahi, Afra","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5007928903","display_name":"Willem Zuidema","orcid":"https://orcid.org/0000-0002-2362-5447"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zuidema, Willem","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.5235000252723694,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.5235000252723694,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.06750000268220901,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10403","display_name":"Phonetics and Phonology Research","score":0.0617000013589859,"subfield":{"id":"https://openalex.org/subfields/3205","display_name":"Experimental and Cognitive Psychology"},"field":{"id":"https://openalex.org/fields/32","display_name":"Psychology"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/mirroring","display_name":"Mirroring","score":0.6866999864578247},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.5879999995231628},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5001000165939331},{"id":"https://openalex.org/keywords/affect","display_name":"Affect (linguistics)","score":0.4124000072479248},{"id":"https://openalex.org/keywords/range","display_name":"Range (aeronautics)","score":0.37860000133514404},{"id":"https://openalex.org/keywords/spoken-language","display_name":"Spoken language","score":0.3677999973297119},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.35370001196861267}],"concepts":[{"id":"https://openalex.org/C189645446","wikidata":"https://www.wikidata.org/wiki/Q350865","display_name":"Mirroring","level":2,"score":0.6866999864578247},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.5879999995231628},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5709999799728394},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5238999724388123},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5001000165939331},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.430400013923645},{"id":"https://openalex.org/C2776035688","wikidata":"https://www.wikidata.org/wiki/Q1606558","display_name":"Affect (linguistics)","level":2,"score":0.4124000072479248},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3833000063896179},{"id":"https://openalex.org/C204323151","wikidata":"https://www.wikidata.org/wiki/Q905424","display_name":"Range (aeronautics)","level":2,"score":0.37860000133514404},{"id":"https://openalex.org/C2776230583","wikidata":"https://www.wikidata.org/wiki/Q1322198","display_name":"Spoken language","level":2,"score":0.3677999973297119},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.35370001196861267},{"id":"https://openalex.org/C74672266","wikidata":"https://www.wikidata.org/wiki/Q815859","display_name":"Language acquisition","level":2,"score":0.33390000462532043},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.32820001244544983},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3255999982357025},{"id":"https://openalex.org/C43617652","wikidata":"https://www.wikidata.org/wiki/Q7575399","display_name":"Speech production","level":2,"score":0.31949999928474426},{"id":"https://openalex.org/C2983448237","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Language understanding","level":2,"score":0.3093999922275543},{"id":"https://openalex.org/C61328038","wikidata":"https://www.wikidata.org/wiki/Q3358061","display_name":"Speech processing","level":2,"score":0.3012000024318695},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.29409998655319214},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.28139999508857727},{"id":"https://openalex.org/C2776502983","wikidata":"https://www.wikidata.org/wiki/Q690182","display_name":"Contrast (vision)","level":2,"score":0.28110000491142273},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.2752000093460083}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.06356","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.06356","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.06356","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.06356","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"In-Context":[0],"Learning":[1],"(ICL)":[2],"has":[3],"been":[4],"extensively":[5],"studied":[6],"in":[7,15,29,81,97,123],"text-only":[8],"Language":[9,31],"Models,":[10],"but":[11],"remains":[12],"largely":[13],"unexplored":[14],"the":[16,36,52,55,58,62,71,74,78,98,118,136,142],"speech":[17,80],"domain.":[18],"Here,":[19],"we":[20,116],"investigate":[21,117],"how":[22,50],"linguistic":[23],"and":[24,66,93,103,110,126],"acoustic":[25,75],"features":[26],"affect":[27],"ICL":[28,45,91,125,144],"Speech":[30],"Models.":[32],"We":[33,84],"focus":[34],"on":[35,108],"Text-to-Speech":[37],"(TTS)":[38],"task,":[39],"which":[40],"allows":[41],"us":[42],"to":[43,68],"analyze":[44],"from":[46,57,148],"two":[47],"angles:":[48],"(1)":[49],"accurately":[51],"model":[53,72],"infers":[54],"task":[56],"demonstrations":[59],"(i.e.,":[60],"generating":[61],"correct":[63],"spoken":[64],"content),":[65],"(2)":[67],"what":[69],"extent":[70],"mimics":[73],"characteristics":[76],"of":[77,120],"demonstration":[79],"its":[82],"output.":[83],"find":[85],"that":[86,128],"speaking":[87],"rate":[88],"strongly":[89],"affects":[90],"performance":[92,109],"is":[94],"also":[95],"mimicked":[96],"output,":[99],"whereas":[100],"pitch":[101],"range":[102],"intensity":[104],"have":[105],"little":[106],"impact":[107],"are":[111],"not":[112],"consistently":[113],"reproduced.":[114],"Finally,":[115],"role":[119],"induction":[121,138],"heads":[122,130,139],"speech-based":[124],"show":[127],"these":[129],"play":[131],"a":[132],"causal":[133],"role:":[134],"ablating":[135],"top-k":[137],"completely":[140],"removes":[141],"model's":[143],"ability,":[145],"mirroring":[146],"findings":[147],"text-based":[149],"ICL.":[150]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-10T00:00:00"}
