{"id":"https://openalex.org/W7141142951","doi":"https://doi.org/10.48550/arxiv.2603.24844","title":"Reaching Beyond the Mode: RL for Distributional Reasoning in Language Models","display_name":"Reaching Beyond the Mode: RL for Distributional Reasoning in Language Models","publication_year":2026,"publication_date":"2026-03-25","ids":{"openalex":"https://openalex.org/W7141142951","doi":"https://doi.org/10.48550/arxiv.2603.24844"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.24844","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.24844","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.24844","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5102790834","display_name":"Isha Puri","orcid":"https://orcid.org/0009-0000-8069-8506"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Puri, Isha","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5040083642","display_name":"Mehul Damani","orcid":"https://orcid.org/0000-0001-5790-376X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Damani, Mehul","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130782888","display_name":"Idan Shenfeld","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shenfeld, Idan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130783310","display_name":"Marzyeh Ghassemi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ghassemi, Marzyeh","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130793947","display_name":"Jacob Andreas","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Andreas, Jacob","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5130744416","display_name":"Yoon Kim","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kim, Yoon","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5102790834"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.47440001368522644,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.47440001368522644,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13702","display_name":"Machine Learning in Healthcare","score":0.15070000290870667,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11636","display_name":"Artificial Intelligence in Healthcare and Education","score":0.14190000295639038,"subfield":{"id":"https://openalex.org/subfields/2718","display_name":"Health Informatics"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.557200014591217},{"id":"https://openalex.org/keywords/coding","display_name":"Coding (social sciences)","score":0.541700005531311},{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.4860000014305115},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.461899995803833},{"id":"https://openalex.org/keywords/question-answering","display_name":"Question answering","score":0.4165000021457672},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.3666999936103821},{"id":"https://openalex.org/keywords/generative-model","display_name":"Generative model","score":0.36309999227523804},{"id":"https://openalex.org/keywords/probability-distribution","display_name":"Probability distribution","score":0.35929998755455017}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6942999958992004},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6129000186920166},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.557200014591217},{"id":"https://openalex.org/C179518139","wikidata":"https://www.wikidata.org/wiki/Q5140297","display_name":"Coding (social sciences)","level":2,"score":0.541700005531311},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.5109000205993652},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.4860000014305115},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.461899995803833},{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.4165000021457672},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.3666999936103821},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.36309999227523804},{"id":"https://openalex.org/C149441793","wikidata":"https://www.wikidata.org/wiki/Q200726","display_name":"Probability distribution","level":2,"score":0.35929998755455017},{"id":"https://openalex.org/C177769412","wikidata":"https://www.wikidata.org/wiki/Q278090","display_name":"Prior probability","level":3,"score":0.3357999920845032},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.30820000171661377},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.28940001130104065},{"id":"https://openalex.org/C140779682","wikidata":"https://www.wikidata.org/wiki/Q210868","display_name":"Sampling (signal processing)","level":3,"score":0.2840000092983246},{"id":"https://openalex.org/C198082294","wikidata":"https://www.wikidata.org/wiki/Q3399648","display_name":"Position (finance)","level":2,"score":0.28119999170303345},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.28049999475479126},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.27950000762939453},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.2777000069618225},{"id":"https://openalex.org/C99844830","wikidata":"https://www.wikidata.org/wiki/Q102441924","display_name":"Scaling","level":2,"score":0.27489998936653137},{"id":"https://openalex.org/C113336015","wikidata":"https://www.wikidata.org/wiki/Q574010","display_name":"Complete information","level":2,"score":0.27300000190734863},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.26170000433921814},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.2574999928474426}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.24844","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.24844","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.24844","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.24844","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.6007874608039856,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Given":[0],"a":[1,3,9,25,34,99,131,197],"question,":[2],"language":[4],"model":[5],"(LM)":[6],"implicitly":[7],"encodes":[8],"distribution":[10,23],"over":[11,111],"possible":[12],"answers.":[13,95],"In":[14,67],"practice,":[15],"post-training":[16],"procedures":[17,205],"for":[18,36,83,104],"LMs":[19,73,106],"often":[20],"collapse":[21],"this":[22,30],"onto":[24],"single":[26,132,163],"dominant":[27],"mode.":[28],"While":[29],"is":[31],"generally":[32],"not":[33],"problem":[35],"benchmark-style":[37],"evaluations":[38],"that":[39],"assume":[40],"one":[41],"correct":[42],"answer,":[43],"many":[44],"real-world":[45],"tasks":[46],"inherently":[47],"involve":[48],"multiple":[49,76,112,127,177],"valid":[50],"answers":[51,113,129,178],"or":[52],"irreducible":[53],"uncertainty.":[54],"Examples":[55],"include":[56],"medical":[57,147],"diagnosis,":[58],"ambiguous":[59],"question":[60],"answering,":[61],"and":[62,86,149,157,199,210],"settings":[63],"with":[64,80,169],"incomplete":[65],"information.":[66],"these":[68],"cases,":[69],"we":[70,152],"would":[71],"like":[72],"to":[74,92,107,121,124,162,175,202],"generate":[75,93,126,176],"plausible":[77],"hypotheses,":[78],"ideally":[79],"confidence":[81],"estimates":[82],"each":[84],"one,":[85],"without":[87],"computationally":[88],"intensive":[89],"repeated":[90],"sampling":[91],"non-modal":[94],"This":[96],"paper":[97],"describes":[98],"multi-answer":[100,194],"reinforcement":[101],"learning":[102],"approach":[103,171],"training":[105],"perform":[108],"distributional":[109],"reasoning":[110],"during":[114],"inference.":[115],"We":[116],"modify":[117],"the":[118,141],"RL":[119,195],"objective":[120],"enable":[122],"models":[123],"explicitly":[125],"candidate":[128],"in":[130],"forward":[133],"pass,":[134],"internalizing":[135],"aspects":[136],"of":[137],"inference-time":[138,203],"search":[139],"into":[140],"model's":[142],"generative":[143],"process.":[144],"Across":[145],"question-answering,":[146],"diagnostic,":[148],"coding":[150,183],"benchmarks,":[151],"observe":[153],"improved":[154],"diversity,":[155],"coverage,":[156],"set-level":[158],"calibration":[159],"scores":[160],"compared":[161],"answer":[164],"trained":[165,168],"baselines.":[166],"Models":[167],"our":[170],"require":[172],"fewer":[173],"tokens":[174],"than":[179],"competing":[180],"approaches.":[181],"On":[182],"tasks,":[184],"they":[185],"are":[186],"also":[187],"substantially":[188],"more":[189,211],"accurate.":[190],"These":[191],"results":[192],"position":[193],"as":[196,207],"principled":[198],"compute-efficient":[200],"alternative":[201],"scaling":[204],"such":[206],"best-of-k.":[208],"Code":[209],"information":[212],"can":[213],"be":[214],"found":[215],"at":[216],"https://multi-answer-rl.github.io/.":[217]},"counts_by_year":[],"updated_date":"2026-03-28T06:16:51.555046","created_date":"2026-03-28T00:00:00"}
