{"id":"https://openalex.org/W4415255189","doi":"https://doi.org/10.48550/arxiv.2509.17794","title":"Learning to vary: Teaching LMs to reproduce human linguistic variability in next-word prediction","display_name":"Learning to vary: Teaching LMs to reproduce human linguistic variability in next-word prediction","publication_year":2025,"publication_date":"2025-09-22","ids":{"openalex":"https://openalex.org/W4415255189","doi":"https://doi.org/10.48550/arxiv.2509.17794"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2509.17794","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2509.17794","pdf_url":"https://arxiv.org/pdf/2509.17794","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2509.17794","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5014656840","display_name":"Tobias Groot","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Groot, Tobias","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120025992","display_name":"Salo Lacunes","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lacunes, Salo","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5092574996","display_name":"Evgenia Ilia","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ilia, Evgenia","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5014656840"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.992900013923645,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.992900013923645,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9900000095367432,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9656000137329102,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.631600022315979},{"id":"https://openalex.org/keywords/subject","display_name":"Subject (documents)","score":0.49129998683929443},{"id":"https://openalex.org/keywords/divergence","display_name":"Divergence (linguistics)","score":0.4902999997138977},{"id":"https://openalex.org/keywords/variation","display_name":"Variation (astronomy)","score":0.45100000500679016},{"id":"https://openalex.org/keywords/word","display_name":"Word (group theory)","score":0.44429999589920044},{"id":"https://openalex.org/keywords/population","display_name":"Population","score":0.4334999918937683},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.4300999939441681},{"id":"https://openalex.org/keywords/natural","display_name":"Natural (archaeology)","score":0.41100001335144043}],"concepts":[{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.631600022315979},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5716000199317932},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.5241000056266785},{"id":"https://openalex.org/C2777855551","wikidata":"https://www.wikidata.org/wiki/Q12310021","display_name":"Subject (documents)","level":2,"score":0.49129998683929443},{"id":"https://openalex.org/C207390915","wikidata":"https://www.wikidata.org/wiki/Q1230525","display_name":"Divergence (linguistics)","level":2,"score":0.4902999997138977},{"id":"https://openalex.org/C2778334786","wikidata":"https://www.wikidata.org/wiki/Q1586270","display_name":"Variation (astronomy)","level":2,"score":0.45100000500679016},{"id":"https://openalex.org/C90805587","wikidata":"https://www.wikidata.org/wiki/Q10944557","display_name":"Word (group theory)","level":2,"score":0.44429999589920044},{"id":"https://openalex.org/C2908647359","wikidata":"https://www.wikidata.org/wiki/Q2625603","display_name":"Population","level":2,"score":0.4334999918937683},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.4300999939441681},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4284999966621399},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4113999903202057},{"id":"https://openalex.org/C2776608160","wikidata":"https://www.wikidata.org/wiki/Q4785462","display_name":"Natural (archaeology)","level":2,"score":0.41100001335144043},{"id":"https://openalex.org/C2781202465","wikidata":"https://www.wikidata.org/wiki/Q18346297","display_name":"Lexical diversity","level":3,"score":0.37130001187324524},{"id":"https://openalex.org/C2781316041","wikidata":"https://www.wikidata.org/wiki/Q1230584","display_name":"Diversity (politics)","level":2,"score":0.367900013923645},{"id":"https://openalex.org/C183322885","wikidata":"https://www.wikidata.org/wiki/Q17007702","display_name":"Context model","level":3,"score":0.3400000035762787},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.33000001311302185},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.3237000107765198},{"id":"https://openalex.org/C74672266","wikidata":"https://www.wikidata.org/wiki/Q815859","display_name":"Language acquisition","level":2,"score":0.3190000057220459},{"id":"https://openalex.org/C2777299769","wikidata":"https://www.wikidata.org/wiki/Q3707858","display_name":"Type (biology)","level":2,"score":0.3075000047683716},{"id":"https://openalex.org/C2993724205","wikidata":"https://www.wikidata.org/wiki/Q315","display_name":"Human language","level":2,"score":0.28859999775886536},{"id":"https://openalex.org/C172205122","wikidata":"https://www.wikidata.org/wiki/Q777864","display_name":"Linguistic description","level":2,"score":0.274399995803833},{"id":"https://openalex.org/C155092808","wikidata":"https://www.wikidata.org/wiki/Q182557","display_name":"Computational linguistics","level":2,"score":0.26829999685287476},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.2565999925136566}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2509.17794","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2509.17794","pdf_url":"https://arxiv.org/pdf/2509.17794","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2509.17794","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2509.17794","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2509.17794","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2509.17794","pdf_url":"https://arxiv.org/pdf/2509.17794","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Natural":[0],"language":[1,34],"generation":[2],"(NLG)":[3],"tasks":[4],"are":[5,38,44],"often":[6],"subject":[7],"to":[8,28,46,119,176],"inherent":[9,50,99],"variability;":[10,179],"e.g.":[11],"predicting":[12],"the":[13,30,49,86,173],"next":[14],"word":[15,111],"given":[16],"a":[17],"context":[18,114],"has":[19],"multiple":[20,26,109],"valid":[21],"responses,":[22],"evident":[23],"when":[24,140],"asking":[25],"humans":[27],"complete":[29],"task.":[31],"While":[32],"having":[33],"models":[35],"(LMs)":[36],"that":[37,42,68,168,183],"aligned":[39],"pluralistically,":[40],"so":[41],"they":[43],"able":[45],"reproduce":[47,72,120,177],"well":[48],"diversity":[51],"in":[52],"perspectives":[53],"of":[54,58,75,88,91,98],"an":[55],"entire":[56],"population":[57],"interest":[59],"is":[60],"clearly":[61],"beneficial,":[62],"Ilia":[63],"and":[64,133,136,143,157,164,186],"Aziz":[65],"(2024)":[66],"show":[67],"LMs":[69,92,107],"do":[70],"not":[71],"this":[73,81,96],"type":[74,97],"linguistic":[76,122,178],"variability":[77,123],"well.":[78],"They":[79],"speculate":[80],"inability":[82],"might":[83],"stem":[84],"from":[85],"lack":[87],"consistent":[89],"training":[90,106],"with":[93],"data":[94],"reflecting":[95],"variability.":[100,188],"As":[101],"such,":[102],"we":[103],"investigate":[104],"whether":[105],"on":[108],"plausible":[110],"continuations":[112],"per":[113],"can":[115],"improve":[116],"their":[117,138],"ability":[118,175],"human":[121,156],"for":[124,131,181],"next-word":[125,159],"prediction.":[126],"We":[127],"employ":[128],"fine-tuning":[129,141,171],"techniques":[130],"pre-trained":[132],"instruction-tuned":[134],"models;":[135],"demonstrate":[137],"potential":[139],"GPT-2":[142],"Mistral-7B-IT,":[144],"using":[145],"Provo":[146],"Corpus.":[147],"Our":[148],"evaluation,":[149],"which":[150],"measures":[151],"divergence":[152],"among":[153],"empirically":[154],"estimated":[155],"model":[158],"distributions":[160],"across":[161],"contexts":[162,182],"before":[163],"after":[165],"fine-tuning,":[166],"shows":[167],"our":[169],"multi-label":[170],"improves":[172],"LMs'":[174],"both":[180],"admit":[184],"higher":[185],"lower":[187]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-10-16T00:00:00"}
