{"id":"https://openalex.org/W7125776274","doi":"https://doi.org/10.48550/arxiv.2601.17828","title":"Aligning Medical Conversational AI through Online Reinforcement Learning with Information-Theoretic Rewards","display_name":"Aligning Medical Conversational AI through Online Reinforcement Learning with Information-Theoretic Rewards","publication_year":2026,"publication_date":"2026-01-25","ids":{"openalex":"https://openalex.org/W7125776274","doi":"https://doi.org/10.48550/arxiv.2601.17828"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2601.17828","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.17828","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2601.17828","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5124029980","display_name":"Tanvi Verma","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Verma, Tanvi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123899624","display_name":"Yang Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Yang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123921791","display_name":"Rick Siow Mong Goh","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Goh, Rick Siow Mong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5123922998","display_name":"Yong Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Yong","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5124029980"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T13702","display_name":"Machine Learning in Healthcare","score":0.5162000060081482,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T13702","display_name":"Machine Learning in Healthcare","score":0.5162000060081482,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11636","display_name":"Artificial Intelligence in Healthcare and Education","score":0.14409999549388885,"subfield":{"id":"https://openalex.org/subfields/2718","display_name":"Health Informatics"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.13420000672340393,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.6775000095367432},{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.6050999760627747},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.4869999885559082},{"id":"https://openalex.org/keywords/information-gain","display_name":"Information gain","score":0.4555000066757202},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.43779999017715454},{"id":"https://openalex.org/keywords/function","display_name":"Function (biology)","score":0.4196999967098236}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7117000222206116},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.6775000095367432},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.6050999760627747},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5992000102996826},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.48890000581741333},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.4869999885559082},{"id":"https://openalex.org/C2983203078","wikidata":"https://www.wikidata.org/wiki/Q255166","display_name":"Information gain","level":2,"score":0.4555000066757202},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.43779999017715454},{"id":"https://openalex.org/C14036430","wikidata":"https://www.wikidata.org/wiki/Q3736076","display_name":"Function (biology)","level":2,"score":0.4196999967098236},{"id":"https://openalex.org/C90329073","wikidata":"https://www.wikidata.org/wiki/Q914232","display_name":"Ask price","level":2,"score":0.34850001335144043},{"id":"https://openalex.org/C3019150057","wikidata":"https://www.wikidata.org/wiki/Q92779279","display_name":"Medical information","level":2,"score":0.3125999867916107},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.3109000027179718},{"id":"https://openalex.org/C136197465","wikidata":"https://www.wikidata.org/wiki/Q1729295","display_name":"Variety (cybernetics)","level":2,"score":0.3005000054836273},{"id":"https://openalex.org/C2777200299","wikidata":"https://www.wikidata.org/wiki/Q52943","display_name":"Conversation","level":2,"score":0.28999999165534973},{"id":"https://openalex.org/C2989236134","wikidata":"https://www.wikidata.org/wiki/Q31207","display_name":"Patient care","level":2,"score":0.27140000462532043}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2601.17828","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.17828","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2601.17828","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2601.17828","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Industry, innovation and infrastructure","score":0.4820536971092224,"id":"https://metadata.un.org/sdg/9"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"We":[0,146],"present":[1],"Information":[2],"Gain":[3],"Fine-Tuning":[4],"(IGFT),":[5],"a":[6],"novel":[7],"approach":[8,131],"for":[9,223],"training":[10],"medical":[11,98,213,225],"conversational":[12],"AI":[13],"to":[14,45,71,135,169],"conduct":[15],"effective":[16,73],"patient":[17,125],"interviews":[18],"and":[19,97,127,153,190,201,211,218],"generate":[20],"comprehensive":[21],"History":[22],"of":[23,182],"Present":[24],"Illness":[25],"(HPI)":[26],"without":[27],"requiring":[28],"pre-collected":[29],"human":[30],"conversations.":[31,230],"IGFT":[32],"combines":[33],"online":[34,66],"Group":[35],"Relative":[36],"Policy":[37],"Optimization":[38],"(GRPO)":[39],"with":[40,50,116,172],"information-theoretic":[41],"rewards,":[42],"enabling":[43],"models":[44,70,133,149,205],"learn":[46,134],"from":[47],"self-generated":[48],"conversations":[49,61],"simulated":[51],"patients.":[52],"Unlike":[53],"existing":[54],"approaches":[55],"that":[56,87,141],"rely":[57],"on":[58,110,160,184,192,209],"expensive":[59],"expert-annotated":[60],"or":[62],"static":[63],"datasets,":[64],"our":[65],"RL":[67],"framework":[68],"allows":[69],"discover":[72],"questioning":[74],"strategies":[75],"through":[76],"exploration.":[77],"Our":[78],"key":[79],"innovation":[80],"is":[81,107],"an":[82],"information":[83,113],"gain":[84,114],"reward":[85,106],"function":[86],"tracks":[88],"which":[89,220],"clinical":[90,123],"entities":[91],"such":[92],"as":[93],"symptoms,":[94],"temporal":[95],"patterns,":[96],"history,":[99],"are":[100],"revealed":[101],"during":[102],"conversation.":[103],"Each":[104],"question's":[105],"computed":[108],"based":[109],"its":[111],"expected":[112],"combined":[115],"GPT-4o-mini":[117],"quality":[118],"assessments":[119],"across":[120],"dimensions":[121],"including":[122],"relevance,":[124],"engagement,":[126],"specificity.":[128],"This":[129],"hybrid":[130],"ensures":[132],"ask":[136],"targeted,":[137],"clinically":[138],"appropriate":[139],"questions":[140],"efficiently":[142],"gather":[143],"diagnostic":[144],"information.":[145],"fine-tune":[147],"two":[148],"using":[150],"LoRA:":[151],"Llama-3.1-8B-Instruct":[152,197],"DeepSeek-R1-Distill-Qwen-7B":[154,177],"(a":[155],"reasoning-optimized":[156],"model).":[157],"Training":[158],"exclusively":[159],"Avey":[161,185],"data":[162,171],"containing":[163],"concise":[164],"HPIs,":[165],"we":[166],"evaluate":[167],"generalization":[168],"MIMIC":[170,193,210],"longer,":[173],"more":[174],"elaborate":[175],"HPIs.":[176],"(IGFT)":[178,198],"achieves":[179],"F1":[180],"scores":[181],"0.408":[183],"(10.9%":[186],"improvement":[187],"over":[188],"base)":[189],"0.289":[191],"(12.9%":[194],"improvement),":[195],"while":[196],"reaches":[199],"0.384":[200],"0.336":[202],"respectively.":[203],"Both":[204],"outperform":[206],"OpenAI's":[207],"model":[208],"surpass":[212],"domain-specific":[214],"baselines":[215],"like":[216],"HuatuoGPT":[217],"UltraMedical,":[219],"were":[221],"optimized":[222],"single-turn":[224],"QA":[226],"rather":[227],"than":[228],"multi-turn":[229]},"counts_by_year":[],"updated_date":"2026-01-28T23:18:48.515280","created_date":"2026-01-28T00:00:00"}
