{"id":"https://openalex.org/W7140303640","doi":"https://doi.org/10.48550/arxiv.2603.23232","title":"GEM: Guided Expectation-Maximization for Behavior-Normalized Candidate Action Selection in Offline RL","display_name":"GEM: Guided Expectation-Maximization for Behavior-Normalized Candidate Action Selection in Offline RL","publication_year":2026,"publication_date":"2026-03-24","ids":{"openalex":"https://openalex.org/W7140303640","doi":"https://doi.org/10.48550/arxiv.2603.23232"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.23232","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.23232","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.23232","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5130575137","display_name":"Haoyu Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Wang, Haoyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130565517","display_name":"Jingcheng Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Jingcheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5087893108","display_name":"Shunyu Wu","orcid":"https://orcid.org/0000-0001-9856-2148"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Shunyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5130554354","display_name":"Xinwei Xiao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiao, Xinwei","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5130575137"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.671500027179718,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.671500027179718,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.06880000233650208,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.06780000030994415,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.5724999904632568},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.5652999877929688},{"id":"https://openalex.org/keywords/action-selection","display_name":"Action selection","score":0.5533000230789185},{"id":"https://openalex.org/keywords/selection","display_name":"Selection (genetic algorithm)","score":0.5069000124931335},{"id":"https://openalex.org/keywords/action","display_name":"Action (physics)","score":0.46470001339912415},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.4325000047683716},{"id":"https://openalex.org/keywords/mixture-model","display_name":"Mixture model","score":0.35100001096725464},{"id":"https://openalex.org/keywords/gaussian-process","display_name":"Gaussian process","score":0.34540000557899475}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7390999794006348},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6008999943733215},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.5724999904632568},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.5652999877929688},{"id":"https://openalex.org/C166109690","wikidata":"https://www.wikidata.org/wiki/Q4677422","display_name":"Action selection","level":3,"score":0.5533000230789185},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.5069000124931335},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4699000120162964},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.46470001339912415},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.4325000047683716},{"id":"https://openalex.org/C61224824","wikidata":"https://www.wikidata.org/wiki/Q2260434","display_name":"Mixture model","level":2,"score":0.35100001096725464},{"id":"https://openalex.org/C61326573","wikidata":"https://www.wikidata.org/wiki/Q1496376","display_name":"Gaussian process","level":3,"score":0.34540000557899475},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.32010000944137573},{"id":"https://openalex.org/C2992734406","wikidata":"https://www.wikidata.org/wiki/Q413267","display_name":"One shot","level":2,"score":0.3181999921798706},{"id":"https://openalex.org/C163716315","wikidata":"https://www.wikidata.org/wiki/Q901177","display_name":"Gaussian","level":2,"score":0.30709999799728394},{"id":"https://openalex.org/C196083921","wikidata":"https://www.wikidata.org/wiki/Q7915758","display_name":"Variance (accounting)","level":2,"score":0.30329999327659607},{"id":"https://openalex.org/C93959086","wikidata":"https://www.wikidata.org/wiki/Q6888345","display_name":"Model selection","level":2,"score":0.2922999858856201},{"id":"https://openalex.org/C134121241","wikidata":"https://www.wikidata.org/wiki/Q899301","display_name":"Yield (engineering)","level":2,"score":0.28870001435279846},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.28299999237060547},{"id":"https://openalex.org/C77553402","wikidata":"https://www.wikidata.org/wiki/Q13222579","display_name":"Upper and lower bounds","level":2,"score":0.2824000120162964},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.2793000042438507},{"id":"https://openalex.org/C151637689","wikidata":"https://www.wikidata.org/wiki/Q5318064","display_name":"Dwell time","level":2,"score":0.27570000290870667},{"id":"https://openalex.org/C189430467","wikidata":"https://www.wikidata.org/wiki/Q7293293","display_name":"Ranking (information retrieval)","level":2,"score":0.2702000141143799},{"id":"https://openalex.org/C44000306","wikidata":"https://www.wikidata.org/wiki/Q244330","display_name":"Hinge","level":2,"score":0.25870001316070557}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.23232","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.23232","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.23232","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.23232","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Peace, Justice and strong institutions","id":"https://metadata.un.org/sdg/16","score":0.6859531998634338}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Offline":[0],"reinforcement":[1],"learning":[2],"(RL)":[3],"can":[4,39],"fit":[5],"strong":[6,59],"value":[7],"functions":[8],"from":[9],"fixed":[10],"datasets,":[11],"yet":[12],"reliable":[13],"deployment":[14],"still":[15],"hinges":[16],"on":[17],"the":[18,27,138],"action":[19,34,71],"selection":[20,72],"interface":[21],"used":[22],"to":[23,109,148],"query":[24],"them.":[25],"When":[26],"dataset":[28],"induces":[29],"a":[30,58,80,104,120,128,167],"branched":[31],"or":[32],"multimodal":[33,74],"landscape,":[35],"unimodal":[36],"policy":[37],"extraction":[38],"blur":[40],"competing":[41],"hypotheses":[42],"and":[43,75,102,124,155,165],"yield":[44,149],"\"in-between\"":[45],"actions":[46,126],"that":[47,69,91,174],"are":[48],"weakly":[49],"supported":[50],"by":[51],"data,":[52],"making":[53],"decisions":[54],"brittle":[55],"even":[56],"with":[57,134],"critic.":[60],"We":[61],"introduce":[62],"GEM":[63,78,114,159],"(Guided":[64],"Expectation-Maximization),":[65],"an":[66],"analytical":[67],"framework":[68],"makes":[70],"both":[73],"explicitly":[76],"controllable.":[77],"trains":[79],"Gaussian":[81],"Mixture":[82],"Model":[83],"(GMM)":[84],"actor":[85],"via":[86],"critic-guided,":[87],"advantage-weighted":[88],"EM-style":[89],"updates":[90],"preserve":[92],"distinct":[93],"components":[94],"while":[95],"shifting":[96],"probability":[97],"mass":[98],"toward":[99],"high-value":[100],"regions,":[101],"learns":[103],"tractable":[105],"GMM":[106],"behavior":[107,139],"model":[108],"quantify":[110],"support.":[111],"During":[112],"inference,":[113],"performs":[115],"candidate-based":[116],"selection:":[117],"it":[118],"generates":[119],"parallel":[121],"candidate":[122,146,156],"set":[123,147],"reranks":[125],"using":[127],"conservative":[129],"ensemble":[130],"lower-confidence":[131],"bound":[132],"together":[133],"behavior-normalized":[135],"support,":[136],"where":[137],"log-likelihood":[140],"is":[141,160],"standardized":[142],"within":[143],"each":[144],"state's":[145],"stable,":[150],"comparable":[151],"control":[152],"across":[153,162],"states":[154],"budgets.":[157],"Empirically,":[158],"competitive":[161],"D4RL":[163],"benchmarks,":[164],"offers":[166],"simple":[168],"inference-time":[169],"budget":[170],"knob":[171],"(candidate":[172],"count)":[173],"trades":[175],"compute":[176],"for":[177],"decision":[178],"quality":[179],"without":[180],"retraining.":[181]},"counts_by_year":[],"updated_date":"2026-03-26T06:10:45.909354","created_date":"2026-03-26T00:00:00"}
