{"id":"https://openalex.org/W4413393703","doi":"https://doi.org/10.23919/acc63710.2025.11107724","title":"Enhancing Autonomous Driving Policy Stability through Auxiliary Network in Reinforcement Learning from Human Feedback","display_name":"Enhancing Autonomous Driving Policy Stability through Auxiliary Network in Reinforcement Learning from Human Feedback","publication_year":2025,"publication_date":"2025-07-08","ids":{"openalex":"https://openalex.org/W4413393703","doi":"https://doi.org/10.23919/acc63710.2025.11107724"},"language":"en","primary_location":{"id":"doi:10.23919/acc63710.2025.11107724","is_oa":false,"landing_page_url":"https://doi.org/10.23919/acc63710.2025.11107724","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 American Control Conference (ACC)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5102525994","display_name":"Hengcong Guo","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Hengcong Guo","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5100304260","display_name":"Junfeng Zhao","orcid":"https://orcid.org/0009-0001-2846-9981"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Junfeng Zhao","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5102525994"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":1.3185,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.83475989,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":97,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"2998","last_page":"3003"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10524","display_name":"Traffic control and management","score":0.8094000220298767,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10524","display_name":"Traffic control and management","score":0.8094000220298767,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11942","display_name":"Transportation and Mobility Innovations","score":0.8037999868392944,"subfield":{"id":"https://openalex.org/subfields/2203","display_name":"Automotive Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.7905252575874329},{"id":"https://openalex.org/keywords/stability","display_name":"Stability (learning theory)","score":0.6186015605926514},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6005702018737793},{"id":"https://openalex.org/keywords/policy-learning","display_name":"Policy learning","score":0.5052497982978821},{"id":"https://openalex.org/keywords/reinforcement","display_name":"Reinforcement","score":0.4358293116092682},{"id":"https://openalex.org/keywords/human\u2013computer-interaction","display_name":"Human\u2013computer interaction","score":0.3960551619529724},{"id":"https://openalex.org/keywords/control-theory","display_name":"Control theory (sociology)","score":0.3321874737739563},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.27638545632362366},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.1684403121471405},{"id":"https://openalex.org/keywords/control","display_name":"Control (management)","score":0.12948274612426758},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.12411916255950928},{"id":"https://openalex.org/keywords/social-psychology","display_name":"Social psychology","score":0.11820900440216064}],"concepts":[{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.7905252575874329},{"id":"https://openalex.org/C112972136","wikidata":"https://www.wikidata.org/wiki/Q7595718","display_name":"Stability (learning theory)","level":2,"score":0.6186015605926514},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6005702018737793},{"id":"https://openalex.org/C2779436431","wikidata":"https://www.wikidata.org/wiki/Q30672407","display_name":"Policy learning","level":2,"score":0.5052497982978821},{"id":"https://openalex.org/C67203356","wikidata":"https://www.wikidata.org/wiki/Q1321905","display_name":"Reinforcement","level":2,"score":0.4358293116092682},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3960551619529724},{"id":"https://openalex.org/C47446073","wikidata":"https://www.wikidata.org/wiki/Q5165890","display_name":"Control theory (sociology)","level":3,"score":0.3321874737739563},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.27638545632362366},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.1684403121471405},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.12948274612426758},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.12411916255950928},{"id":"https://openalex.org/C77805123","wikidata":"https://www.wikidata.org/wiki/Q161272","display_name":"Social psychology","level":1,"score":0.11820900440216064}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.23919/acc63710.2025.11107724","is_oa":false,"landing_page_url":"https://doi.org/10.23919/acc63710.2025.11107724","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 American Control Conference (ACC)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Reduced inequalities","score":0.4000000059604645,"id":"https://metadata.un.org/sdg/10"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":21,"referenced_works":["https://openalex.org/W2131600418","https://openalex.org/W2788862220","https://openalex.org/W2793939502","https://openalex.org/W2897470356","https://openalex.org/W2973229164","https://openalex.org/W2997985982","https://openalex.org/W3130288902","https://openalex.org/W3200694950","https://openalex.org/W4221118182","https://openalex.org/W4226257065","https://openalex.org/W4286253093","https://openalex.org/W4288068940","https://openalex.org/W4386702654","https://openalex.org/W4388265616","https://openalex.org/W4389249986","https://openalex.org/W4394616143","https://openalex.org/W4396758988","https://openalex.org/W4398775595","https://openalex.org/W4400057307","https://openalex.org/W4406736174","https://openalex.org/W6884318772"],"related_works":["https://openalex.org/W4310083477","https://openalex.org/W2328553770","https://openalex.org/W2920061524","https://openalex.org/W1977959518","https://openalex.org/W2038908348","https://openalex.org/W2107890255","https://openalex.org/W2106552856","https://openalex.org/W2145821588","https://openalex.org/W2086122291","https://openalex.org/W1987513656"],"abstract_inverted_index":{"Reinforcement":[0],"learning":[1,34,122],"from":[2],"human":[3,30,90],"feedback":[4,31],"(RLHF)":[5],"has":[6],"gained":[7],"increasing":[8],"attention":[9],"in":[10,47,160,176],"automated":[11,70],"vehicle":[12],"planning":[13],"and":[14,24,43,49,174],"control":[15],"due":[16],"to":[17,20,39,65,111,148],"its":[18],"potential":[19,157],"enhance":[21],"decision-making":[22,163],"processes":[23],"accelerate":[25],"policy":[26,118,145],"optimization.":[27],"By":[28],"incorporating":[29],"into":[32],"reinforcement":[33],"models,":[35],"RLHF":[36],"enables":[37],"agents":[38],"develop":[40],"more":[41],"reliable":[42],"context-aware":[44],"behaviors,":[45],"particularly":[46],"complex":[48],"dynamic":[50],"traffic":[51],"environments.":[52],"This":[53,93],"paper":[54],"presents":[55],"PVP":[56,151],"with":[57],"Auxiliary":[58],"Network":[59],"(aPVP),":[60],"an":[61,84],"RLHF-based":[62,162],"framework":[63,81],"designed":[64],"improve":[66],"the":[67,76,108,126,140,149,156,166],"stability":[68,119,146],"of":[69,128,158],"driving":[71,91,179],"policies.":[72],"Specifically,":[73],"we":[74,130],"extend":[75],"Proxy":[77],"Value":[78],"Propagation":[79],"(PVP)":[80],"by":[82],"introducing":[83],"auxiliary":[85,94],"neural":[86],"network":[87,110],"trained":[88],"on":[89,171],"data.":[92],"model":[95],"serves":[96],"as":[97],"a":[98,102,114,132],"virtual":[99],"driver,":[100],"providing":[101],"similarity-based":[103],"loss":[104],"function":[105],"that":[106,139],"guides":[107],"actor":[109],"explore":[112],"within":[113],"reasonable":[115],"range,":[116],"ensuring":[117],"while":[120],"preserving":[121],"flexibility.":[123],"To":[124],"validate":[125],"effectiveness":[127],"aPVP,":[129],"design":[131],"comprehensive":[133],"experimental":[134],"setup.":[135],"Empirical":[136],"results":[137],"demonstrate":[138],"proposed":[141],"approach":[142],"significantly":[143],"enhances":[144],"compared":[147],"original":[150],"framework.":[152],"These":[153],"findings":[154],"highlight":[155],"aPVP":[159],"improving":[161],"systems,":[164],"paving":[165],"way":[167],"for":[168],"future":[169],"research":[170],"enhancing":[172],"scalability":[173],"adaptability":[175],"real-world":[177],"autonomous":[178],"scenarios.":[180]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-03-13T16:22:10.518609","created_date":"2025-10-10T00:00:00"}