{"id":"https://openalex.org/W4403853601","doi":"https://doi.org/10.48550/arxiv.2409.03650","title":"On the Limited Generalization Capability of the Implicit Reward Model Induced by Direct Preference Optimization","display_name":"On the Limited Generalization Capability of the Implicit Reward Model Induced by Direct Preference Optimization","publication_year":2024,"publication_date":"2024-09-05","ids":{"openalex":"https://openalex.org/W4403853601","doi":"https://doi.org/10.48550/arxiv.2409.03650"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2409.03650","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2409.03650","pdf_url":"https://arxiv.org/pdf/2409.03650","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2409.03650","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101784720","display_name":"Lin Yong","orcid":"https://orcid.org/0000-0002-0574-3837"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Lin, Yong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059839283","display_name":"Skyler Seto","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Seto, Skyler","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5017591085","display_name":"Maartje ter Hoeve","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"ter Hoeve, Maartje","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5045884826","display_name":"Katherine Metcalf","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Metcalf, Katherine","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112911728","display_name":"Barry-John Theobald","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Theobald, Barry-John","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100328977","display_name":"Xuan Wang","orcid":"https://orcid.org/0000-0001-9183-3080"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Xuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101958998","display_name":"Yizhe Zhang","orcid":"https://orcid.org/0000-0001-7300-2400"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Yizhe","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5080093325","display_name":"Chen Huang","orcid":"https://orcid.org/0000-0002-5765-334X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Chen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5100378750","display_name":"Tong Zhang","orcid":"https://orcid.org/0000-0001-5818-4285"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Tong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5101784720"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10050","display_name":"Multi-Criteria Decision Making","score":0.47279998660087585,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T10050","display_name":"Multi-Criteria Decision Making","score":0.47279998660087585,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.8291563987731934},{"id":"https://openalex.org/keywords/preference","display_name":"Preference","score":0.8069304823875427},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.4167136549949646},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.3692700266838074},{"id":"https://openalex.org/keywords/mathematical-optimization","display_name":"Mathematical optimization","score":0.33418190479278564},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.3121543526649475},{"id":"https://openalex.org/keywords/statistics","display_name":"Statistics","score":0.1529584527015686}],"concepts":[{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.8291563987731934},{"id":"https://openalex.org/C2781249084","wikidata":"https://www.wikidata.org/wiki/Q908656","display_name":"Preference","level":2,"score":0.8069304823875427},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.4167136549949646},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.3692700266838074},{"id":"https://openalex.org/C126255220","wikidata":"https://www.wikidata.org/wiki/Q141495","display_name":"Mathematical optimization","level":1,"score":0.33418190479278564},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.3121543526649475},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.1529584527015686},{"id":"https://openalex.org/C134306372","wikidata":"https://www.wikidata.org/wiki/Q7754","display_name":"Mathematical analysis","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2409.03650","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2409.03650","pdf_url":"https://arxiv.org/pdf/2409.03650","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2409.03650","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2409.03650","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2409.03650","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2409.03650","pdf_url":"https://arxiv.org/pdf/2409.03650","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4403853601.pdf","grobid_xml":"https://content.openalex.org/works/W4403853601.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2748952813","https://openalex.org/W4391375266","https://openalex.org/W1979597421","https://openalex.org/W2007980826","https://openalex.org/W2061531152","https://openalex.org/W3002753104","https://openalex.org/W2077600819","https://openalex.org/W2142036596","https://openalex.org/W2072657027","https://openalex.org/W2600246793"],"abstract_inverted_index":{"Reinforcement":[0],"Learning":[1],"from":[2,55],"Human":[3],"Feedback":[4],"(RLHF)":[5],"is":[6,20,111],"an":[7,40,51,82,197],"effective":[8],"approach":[9],"for":[10,25,32,102,133],"aligning":[11],"language":[12],"models":[13],"to":[14,18],"human":[15,27],"preferences.":[16,28],"Central":[17],"RLHF":[19],"learning":[21,33],"a":[22,34,170,178],"reward":[23,35,53,73,199],"function":[24],"scoring":[26],"Two":[29],"main":[30],"approaches":[31],"model":[36,74,200],"are":[37],"1)":[38],"training":[39,147],"EXplicit":[41],"Reward":[42],"Model":[43],"(EXRM)":[44],"as":[45,61,78],"in":[46,84,173,201],"RLHF,":[47],"and":[48,97,130,136,177,192],"2)":[49],"using":[50],"implicit":[52,72],"learned":[54,95],"preference":[56],"data":[57],"through":[58],"methods":[59,105],"such":[60],"Direct":[62],"Preference":[63],"Optimization":[64],"(DPO).":[65],"Prior":[66],"work":[67,123],"has":[68,99,169,188],"shown":[69],"that":[70,141,186],"the":[71,85,91,94,118,125,146,158,194],"of":[75,93,120,175,181,196],"DPO":[76,203],"(denoted":[77],"DPORM)":[79],"can":[80],"approximate":[81],"EXRM":[83],"limit.":[86],"DPORM's":[87],"effectiveness":[88],"directly":[89],"implies":[90],"optimality":[92],"policy,":[96],"also":[98],"practical":[100],"implication":[101],"LLM":[103],"alignment":[104],"including":[106],"iterative":[107,202],"DPO.":[108],"However,":[109],"it":[110,150],"unclear":[112],"how":[113],"well":[114],"DPORM":[115,135,144,168,187],"empirically":[116],"matches":[117],"performance":[119],"EXRM.":[121,137],"This":[122],"studies":[124],"accuracy":[126,174],"at":[127],"distinguishing":[128],"preferred":[129],"rejected":[131],"answers":[132],"both":[134],"Our":[138],"findings":[139,184],"indicate":[140],"even":[142],"though":[143],"fits":[145],"dataset":[148],"comparably,":[149],"generalizes":[151],"less":[152],"effectively":[153],"than":[154],"EXRM,":[155],"especially":[156],"when":[157],"validation":[159],"datasets":[160],"contain":[161],"distribution":[162],"shifts.":[163],"Across":[164],"five":[165],"out-of-distribution":[166],"settings,":[167],"mean":[171],"drop":[172,180],"3%":[176],"maximum":[179],"7%.":[182],"These":[183],"highlight":[185],"limited":[189],"generalization":[190],"ability":[191],"substantiates":[193],"integration":[195],"explicit":[198],"approaches.":[204]},"counts_by_year":[],"updated_date":"2026-05-05T08:41:31.759640","created_date":"2024-10-29T00:00:00"}
