{"id":"https://openalex.org/W4406110575","doi":"https://doi.org/10.1093/comjnl/bxae138","title":"Unbiased training framework on deep reinforcement learning","display_name":"Unbiased training framework on deep reinforcement learning","publication_year":2025,"publication_date":"2025-01-05","ids":{"openalex":"https://openalex.org/W4406110575","doi":"https://doi.org/10.1093/comjnl/bxae138"},"language":"en","primary_location":{"id":"doi:10.1093/comjnl/bxae138","is_oa":false,"landing_page_url":"https://doi.org/10.1093/comjnl/bxae138","pdf_url":null,"source":{"id":"https://openalex.org/S44643521","display_name":"The Computer Journal","issn_l":"0010-4620","issn":["0010-4620","1460-2067"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310311648","host_organization_name":"Oxford University Press","host_organization_lineage":["https://openalex.org/P4310311648","https://openalex.org/P4310311647"],"host_organization_lineage_names":["Oxford University Press","University of Oxford"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"The Computer Journal","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100651507","display_name":"Huihui Zhang","orcid":"https://orcid.org/0000-0002-1012-8089"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Huihui Zhang","raw_affiliation_strings":["Department of Electrical and Electronic Engineering, Tsinghua University , Beijing 100084 ,","Department of Electrical and Electronic Engineering, Tsinghua University, Beijing 100084, China"],"affiliations":[{"raw_affiliation_string":"Department of Electrical and Electronic Engineering, Tsinghua University , Beijing 100084 ,","institution_ids":["https://openalex.org/I99065089"]},{"raw_affiliation_string":"Department of Electrical and Electronic Engineering, Tsinghua University, Beijing 100084, China","institution_ids":["https://openalex.org/I99065089"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":1,"corresponding_author_ids":["https://openalex.org/A5100651507"],"corresponding_institution_ids":["https://openalex.org/I99065089"],"apc_list":{"value":2635,"currency":"GBP","value_usd":3232},"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.00162401,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"68","issue":"6","first_page":"649","last_page":"662"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12794","display_name":"Adaptive Dynamic Programming Control","score":0.9915000200271606,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11975","display_name":"Evolutionary Algorithms and Applications","score":0.9850999712944031,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.8554472327232361},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8054589033126831},{"id":"https://openalex.org/keywords/asynchronous-communication","display_name":"Asynchronous communication","score":0.5846672058105469},{"id":"https://openalex.org/keywords/backup","display_name":"Backup","score":0.569113552570343},{"id":"https://openalex.org/keywords/tuple","display_name":"Tuple","score":0.467719703912735},{"id":"https://openalex.org/keywords/hyperparameter","display_name":"Hyperparameter","score":0.4607267379760742},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4582715332508087},{"id":"https://openalex.org/keywords/margin","display_name":"Margin (machine learning)","score":0.4170069098472595},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.39393067359924316},{"id":"https://openalex.org/keywords/computer-network","display_name":"Computer network","score":0.08983975648880005}],"concepts":[{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.8554472327232361},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8054589033126831},{"id":"https://openalex.org/C151319957","wikidata":"https://www.wikidata.org/wiki/Q752739","display_name":"Asynchronous communication","level":2,"score":0.5846672058105469},{"id":"https://openalex.org/C2780945871","wikidata":"https://www.wikidata.org/wiki/Q194274","display_name":"Backup","level":2,"score":0.569113552570343},{"id":"https://openalex.org/C118930307","wikidata":"https://www.wikidata.org/wiki/Q600590","display_name":"Tuple","level":2,"score":0.467719703912735},{"id":"https://openalex.org/C8642999","wikidata":"https://www.wikidata.org/wiki/Q4171168","display_name":"Hyperparameter","level":2,"score":0.4607267379760742},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4582715332508087},{"id":"https://openalex.org/C774472","wikidata":"https://www.wikidata.org/wiki/Q6760393","display_name":"Margin (machine learning)","level":2,"score":0.4170069098472595},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.39393067359924316},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.08983975648880005},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.0},{"id":"https://openalex.org/C118615104","wikidata":"https://www.wikidata.org/wiki/Q121416","display_name":"Discrete mathematics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1093/comjnl/bxae138","is_oa":false,"landing_page_url":"https://doi.org/10.1093/comjnl/bxae138","pdf_url":null,"source":{"id":"https://openalex.org/S44643521","display_name":"The Computer Journal","issn_l":"0010-4620","issn":["0010-4620","1460-2067"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310311648","host_organization_name":"Oxford University Press","host_organization_lineage":["https://openalex.org/P4310311648","https://openalex.org/P4310311647"],"host_organization_lineage_names":["Oxford University Press","University of Oxford"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"The Computer Journal","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":43,"referenced_works":["https://openalex.org/W2100495367","https://openalex.org/W2101355568","https://openalex.org/W2101786389","https://openalex.org/W2119380668","https://openalex.org/W2127412976","https://openalex.org/W2131600418","https://openalex.org/W2145339207","https://openalex.org/W2150339816","https://openalex.org/W2152083440","https://openalex.org/W2175259684","https://openalex.org/W2592798481","https://openalex.org/W2612690371","https://openalex.org/W2736601468","https://openalex.org/W2749928749","https://openalex.org/W2766447205","https://openalex.org/W2787938642","https://openalex.org/W2806832624","https://openalex.org/W2913022628","https://openalex.org/W2962802563","https://openalex.org/W2963095800","https://openalex.org/W2963864421","https://openalex.org/W3081391704","https://openalex.org/W4231109964","https://openalex.org/W4289543315","https://openalex.org/W4298101950","https://openalex.org/W4299401133","https://openalex.org/W6638018090","https://openalex.org/W6683204974","https://openalex.org/W6684191040","https://openalex.org/W6684921986","https://openalex.org/W6689664320","https://openalex.org/W6692846177","https://openalex.org/W6704084210","https://openalex.org/W6730111887","https://openalex.org/W6734678876","https://openalex.org/W6735579001","https://openalex.org/W6741002519","https://openalex.org/W6747473740","https://openalex.org/W6748638692","https://openalex.org/W6748839928","https://openalex.org/W6754610480","https://openalex.org/W6758797660","https://openalex.org/W6773029903"],"related_works":["https://openalex.org/W2955195711","https://openalex.org/W2354454611","https://openalex.org/W2993266126","https://openalex.org/W2392283887","https://openalex.org/W2351388597","https://openalex.org/W2939925694","https://openalex.org/W2829881200","https://openalex.org/W4241986464","https://openalex.org/W2184647741","https://openalex.org/W1985525502"],"abstract_inverted_index":{"Abstract":[0],"In":[1],"deep":[2],"reinforcement":[3],"learning":[4],"(DRL),":[5],"bias":[6],"is":[7,150],"systematic":[8],"in":[9,25,89],"asynchronous":[10],"training":[11],"due":[12],"to":[13,63,74,82,96,114,137,193],"different":[14,17,139],"state":[15,88],"distributions,":[16],"policies":[18],"and":[19,38,43,55,66,119,131,174,198],"lacking":[20],"knowledge":[21],"of":[22,33,78,148,161,172,181],"transition":[23,80,91],"probability":[24,170],"model-free":[26],"learning.":[27],"Therefore,":[28],"we":[29,59],"bring":[30],"the":[31,83,86,90,99,116,151,168,175,179],"notions":[32],"parallel":[34,50,61],"executors,":[35,117],"shared":[36,69],"actor":[37,118],"central":[39,72,106],"critic":[40,120],"into":[41],"DRL,":[42],"propose":[44,185],"a":[45,68,76,105],"general":[46],"framework":[47,192],"that":[48],"enables":[49],"collecting,":[51],"unbiased":[52,159],"data":[53],"processing":[54],"centralized":[56],"training.":[57],"Specifically,":[58],"employ":[60],"executors":[62],"obtain":[64],"observations,":[65],"follow":[67],"policy":[70],"from":[71],"thread":[73],"pass":[75],"batch":[77],"four-tuple":[79],"slots":[81,92],"critic.":[84],"Simultaneously,":[85],"next":[87],"are":[93,102],"fed":[94],"back":[95],"executors.":[97],"Then,":[98],"network":[100],"parameters":[101],"updated":[103],"by":[104,141],"learner.":[107],"A":[108],"backup":[109],"storage":[110],"can":[111,134,157],"be":[112,135],"adopted":[113],"make":[115],"work":[121],"concurrently.":[122],"There":[123],"exists":[124],"two":[125],"working":[126],"modes":[127],"for":[128],"our":[129,190],"framework,":[130],"several":[132,186],"variants":[133,149],"achieved":[136],"suit":[138],"environments":[140],"tuning":[142],"some":[143],"hyperparameters.":[144],"One":[145],"special":[146],"case":[147,156],"existing":[152],"DRL.":[153],"Another":[154],"extreme":[155],"produce":[158],"estimation":[160,165],"loss":[162],"function":[163],"whose":[164],"exactly":[166],"matches":[167],"joint":[169],"distribution":[171],"observations":[173],"policy,":[176],"thus":[177],"avoiding":[178],"instability":[180],"importance":[182],"sampling.":[183],"We":[184],"efficient":[187],"algorithms":[188],"under":[189],"new":[191],"deal":[194],"with":[195],"typical":[196],"discrete":[197],"continuous":[199],"scenarios.":[200]},"counts_by_year":[],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
