{"id":"https://openalex.org/W7129627178","doi":"https://doi.org/10.48550/arxiv.2602.14386","title":"Beyond Token-Level Policy Gradients for Complex Reasoning with Large Language Models","display_name":"Beyond Token-Level Policy Gradients for Complex Reasoning with Large Language Models","publication_year":2026,"publication_date":"2026-02-16","ids":{"openalex":"https://openalex.org/W7129627178","doi":"https://doi.org/10.48550/arxiv.2602.14386"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.14386","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5125000119","display_name":"Mufan Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Xu, Mufan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5122645642","display_name":"Kehai Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Kehai","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101905617","display_name":"Xuefeng Bai","orcid":"https://orcid.org/0000-0001-7044-0683"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bai, Xuefeng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126210730","display_name":"Zhengyu Niu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Niu, Zhengyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124927795","display_name":"Muyun Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Muyun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123486835","display_name":"Tiejun Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Tiejun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5126270696","display_name":"Min Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Min","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5125000119"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.2451000064611435,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.2451000064611435,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.1542000025510788,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11636","display_name":"Artificial Intelligence in Healthcare and Education","score":0.06440000236034393,"subfield":{"id":"https://openalex.org/subfields/2718","display_name":"Health Informatics"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/granularity","display_name":"Granularity","score":0.5310999751091003},{"id":"https://openalex.org/keywords/perspective","display_name":"Perspective (graphical)","score":0.5252000093460083},{"id":"https://openalex.org/keywords/coding","display_name":"Coding (social sciences)","score":0.4927000105381012},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.3955000042915344},{"id":"https://openalex.org/keywords/automated-reasoning","display_name":"Automated reasoning","score":0.39169999957084656},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.38749998807907104},{"id":"https://openalex.org/keywords/qualitative-reasoning","display_name":"Qualitative reasoning","score":0.3637000024318695},{"id":"https://openalex.org/keywords/visual-reasoning","display_name":"Visual reasoning","score":0.33559998869895935}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7358999848365784},{"id":"https://openalex.org/C177774035","wikidata":"https://www.wikidata.org/wiki/Q1246948","display_name":"Granularity","level":2,"score":0.5310999751091003},{"id":"https://openalex.org/C12713177","wikidata":"https://www.wikidata.org/wiki/Q1900281","display_name":"Perspective (graphical)","level":2,"score":0.5252000093460083},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5095999836921692},{"id":"https://openalex.org/C179518139","wikidata":"https://www.wikidata.org/wiki/Q5140297","display_name":"Coding (social sciences)","level":2,"score":0.4927000105381012},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.3955000042915344},{"id":"https://openalex.org/C195344581","wikidata":"https://www.wikidata.org/wiki/Q2555318","display_name":"Automated reasoning","level":2,"score":0.39169999957084656},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.38749998807907104},{"id":"https://openalex.org/C83725634","wikidata":"https://www.wikidata.org/wiki/Q7268699","display_name":"Qualitative reasoning","level":2,"score":0.3637000024318695},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.33559998869895935},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.33180001378059387},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.3285999894142151},{"id":"https://openalex.org/C89288958","wikidata":"https://www.wikidata.org/wiki/Q7301504","display_name":"Reasoning system","level":2,"score":0.32850000262260437},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.319599986076355},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3131999969482422},{"id":"https://openalex.org/C100776233","wikidata":"https://www.wikidata.org/wiki/Q2532492","display_name":"Bridge (graph theory)","level":2,"score":0.3102000057697296},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.3070000112056732},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2962000072002411},{"id":"https://openalex.org/C155092808","wikidata":"https://www.wikidata.org/wiki/Q182557","display_name":"Computational linguistics","level":2,"score":0.2847999930381775},{"id":"https://openalex.org/C47822265","wikidata":"https://www.wikidata.org/wiki/Q854457","display_name":"Complex system","level":2,"score":0.272599995136261},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.26910001039505005},{"id":"https://openalex.org/C37335422","wikidata":"https://www.wikidata.org/wiki/Q6888134","display_name":"Model-based reasoning","level":3,"score":0.25609999895095825}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.14386","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.14386","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.14386","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.14386","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/16","display_name":"Peace, Justice and strong institutions","score":0.7220818400382996}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Existing":[0],"policy-gradient":[1],"methods":[2],"for":[3,22,143,154],"auto-regressive":[4],"language":[5,156],"models":[6],"typically":[7],"select":[8],"subsequent":[9],"tokens":[10,94],"one":[11],"at":[12],"a":[13,40,59,86],"time":[14],"as":[15,95],"actions":[16],"in":[17,72],"the":[18,33,66,107,137],"policy.":[19],"While":[20],"effective":[21],"many":[23],"generation":[24],"tasks,":[25,38],"such":[26],"an":[27],"approach":[28],"may":[29],"not":[30],"fully":[31],"capture":[32,106],"structure":[34,109],"of":[35,70,91,110,139],"complex":[36,144],"reasoning":[37,71,111,123],"where":[39],"single":[41],"semantic":[42,97],"decision":[43],"is":[44],"often":[45],"realized":[46],"across":[47],"multiple":[48],"tokens--for":[49],"example,":[50],"when":[51],"defining":[52],"variables":[53],"or":[54],"composing":[55],"equations.":[56],"This":[57,99],"introduces":[58],"potential":[60],"mismatch":[61],"between":[62],"token-level":[63,132,140,152],"optimization":[64,115],"and":[65,113,124],"inherently":[67],"block-level":[68,100],"nature":[69],"these":[73],"settings.":[74],"To":[75],"bridge":[76],"this":[77],"gap,":[78],"we":[79],"propose":[80],"Multi-token":[81],"Policy":[82],"Gradient":[83],"Optimization":[84],"(MPO),":[85],"framework":[87],"that":[88,128],"treats":[89],"sequences":[90],"K":[92],"consecutive":[93],"unified":[96],"actions.":[98],"perspective":[101],"enables":[102],"our":[103],"method":[104],"to":[105,149],"compositional":[108],"trajectories":[112],"supports":[114],"over":[116],"coherent,":[117],"higher-level":[118],"objectives.":[119],"Experiments":[120],"on":[121],"mathematical":[122],"coding":[125],"benchmarks":[126],"show":[127],"MPO":[129],"outperforms":[130],"standard":[131],"policy":[133,141],"gradient":[134],"baselines,":[135],"highlight":[136],"limitations":[138],"gradients":[142],"reasoning,":[145],"motivating":[146],"future":[147],"research":[148],"look":[150],"beyond":[151],"granularity":[153],"reasoning-intensive":[155],"tasks.":[157]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-02-18T00:00:00"}
