{"id":"https://openalex.org/W7140208265","doi":"https://doi.org/10.48550/arxiv.2603.21563","title":"Counterfactual Credit Policy Optimization for Multi-Agent Collaboration","display_name":"Counterfactual Credit Policy Optimization for Multi-Agent Collaboration","publication_year":2026,"publication_date":"2026-03-23","ids":{"openalex":"https://openalex.org/W7140208265","doi":"https://doi.org/10.48550/arxiv.2603.21563"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.21563","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.21563","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.21563","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Li, Zhongyi","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Li, Zhongyi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Tian, Wan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tian, Wan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Ban, Yikun","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ban, Yikun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Chen, Jinju","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Jinju","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Zhang, Huiming","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Huiming","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Liu, Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Yang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Zhuang, Fuzhen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhuang, Fuzhen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.21119999885559082,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.21119999885559082,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.133200004696846,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.09260000288486481,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/counterfactual-thinking","display_name":"Counterfactual thinking","score":0.9532999992370605},{"id":"https://openalex.org/keywords/earned-income-tax-credit","display_name":"Earned income tax credit","score":0.6202999949455261},{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.6158000230789185},{"id":"https://openalex.org/keywords/variance","display_name":"Variance (accounting)","score":0.4131999909877777},{"id":"https://openalex.org/keywords/stability","display_name":"Stability (learning theory)","score":0.4083000123500824},{"id":"https://openalex.org/keywords/bridging","display_name":"Bridging (networking)","score":0.3806000053882599},{"id":"https://openalex.org/keywords/scheme","display_name":"Scheme (mathematics)","score":0.3357999920845032}],"concepts":[{"id":"https://openalex.org/C108650721","wikidata":"https://www.wikidata.org/wiki/Q1783253","display_name":"Counterfactual thinking","level":2,"score":0.9532999992370605},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7233999967575073},{"id":"https://openalex.org/C2777281377","wikidata":"https://www.wikidata.org/wiki/Q1036463","display_name":"Earned income tax credit","level":3,"score":0.6202999949455261},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.6158000230789185},{"id":"https://openalex.org/C196083921","wikidata":"https://www.wikidata.org/wiki/Q7915758","display_name":"Variance (accounting)","level":2,"score":0.4131999909877777},{"id":"https://openalex.org/C149782125","wikidata":"https://www.wikidata.org/wiki/Q160039","display_name":"Econometrics","level":1,"score":0.4106999933719635},{"id":"https://openalex.org/C112972136","wikidata":"https://www.wikidata.org/wiki/Q7595718","display_name":"Stability (learning theory)","level":2,"score":0.4083000123500824},{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.3806000053882599},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.35839998722076416},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3483000099658966},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.3357999920845032},{"id":"https://openalex.org/C2777716012","wikidata":"https://www.wikidata.org/wiki/Q5318389","display_name":"Dyad","level":2,"score":0.32409998774528503},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.3172000050544739},{"id":"https://openalex.org/C136886441","wikidata":"https://www.wikidata.org/wiki/Q926129","display_name":"Normalization (sociology)","level":2,"score":0.3158999979496002},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.27970001101493835},{"id":"https://openalex.org/C71889745","wikidata":"https://www.wikidata.org/wiki/Q1783264","display_name":"Counterfactual conditional","level":3,"score":0.2721000015735626},{"id":"https://openalex.org/C162118730","wikidata":"https://www.wikidata.org/wiki/Q1128453","display_name":"Actuarial science","level":1,"score":0.27090001106262207},{"id":"https://openalex.org/C2779436431","wikidata":"https://www.wikidata.org/wiki/Q30672407","display_name":"Policy learning","level":2,"score":0.262800008058548},{"id":"https://openalex.org/C138268822","wikidata":"https://www.wikidata.org/wiki/Q1051925","display_name":"Resolution (logic)","level":2,"score":0.26260000467300415},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.25060001015663147}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.21563","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.21563","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.21563","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.21563","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/17","score":0.4207945466041565,"display_name":"Partnerships for the goals"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Collaborative":[0],"multi-agent":[1,121,135],"large":[2],"language":[3],"models":[4],"(LLMs)":[5],"can":[6],"solve":[7],"complex":[8],"reasoning":[9,127],"tasks":[10,92],"by":[11,28,58],"decomposing":[12],"roles":[13],"and":[14,41,93,120,125,132,140],"aggregating":[15],"diverse":[16],"hypotheses.":[17],"Yet,":[18],"reinforcement":[19],"learning":[20,56],"(RL)":[21],"for":[22,83,145],"such":[23],"systems":[24],"is":[25,151],"often":[26],"undermined":[27],"credit":[29,143],"assignment:":[30],"a":[31,51,98,116],"shared":[32],"global":[33,106],"reward":[34],"obscures":[35],"individual":[36],"contributions,":[37],"inflating":[38],"update":[39],"variance":[40],"encouraging":[42],"free-riding.":[43],"We":[44,109],"introduce":[45],"Counterfactual":[46],"Credit":[47],"Policy":[48],"Optimization":[49],"(CCPO),":[50],"framework":[52],"that":[53,72,102],"assigns":[54],"agent-specific":[55],"signals":[57],"estimating":[59],"each":[60],"agent's":[61,77],"marginal":[62],"contribution":[63,78],"through":[64],"counterfactual":[65,70],"trajectories.":[66],"CCPO":[67,111,129],"builds":[68],"dynamic":[69],"baselines":[71],"simulate":[73],"outcomes":[74],"with":[75],"an":[76],"removed,":[79],"yielding":[80,138],"role-sensitive":[81],"advantages":[82,104],"policy":[84],"optimization.":[85],"To":[86],"further":[87],"improve":[88],"stability":[89],"under":[90],"heterogeneous":[91],"data":[94],"distributions,":[95],"we":[96],"propose":[97],"global-history-aware":[99],"normalization":[100],"scheme":[101],"calibrates":[103],"using":[105],"rollout":[107],"statistics.":[108],"evaluate":[110],"on":[112],"two":[113],"collaboration":[114],"topologies:":[115],"sequential":[117],"Think--Reason":[118],"dyad":[119],"voting.":[122],"Across":[123],"mathematical":[124],"logical":[126],"benchmarks,":[128],"mitigates":[130],"free-riding":[131],"outperforms":[133],"strong":[134],"RL":[136],"baselines,":[137],"finer-grained":[139],"more":[141],"effective":[142],"assignment":[144],"collaborative":[146],"LLM":[147],"training.":[148],"Our":[149],"code":[150],"available":[152],"at":[153],"https://github.com/bhai114/ccpo.":[154]},"counts_by_year":[],"updated_date":"2026-04-25T08:17:42.794288","created_date":"2026-03-25T00:00:00"}
