{"id":"https://openalex.org/W7134077109","doi":"https://doi.org/10.48550/arxiv.2603.04743","title":"DARE: Aligning LLM Agents with the R Statistical Ecosystem via Distribution-Aware Retrieval","display_name":"DARE: Aligning LLM Agents with the R Statistical Ecosystem via Distribution-Aware Retrieval","publication_year":2026,"publication_date":"2026-03-05","ids":{"openalex":"https://openalex.org/W7134077109","doi":"https://doi.org/10.48550/arxiv.2603.04743"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2603.04743","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5017506044","display_name":"Maojun Sun","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Sun, Maojun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128268625","display_name":"Yue Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Yue","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5036964889","display_name":"Yifei Xie","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xie, Yifei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5028504100","display_name":"Ruijian Han","orcid":"https://orcid.org/0000-0002-9225-2218"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Han, Ruijian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123463780","display_name":"Binyan Jiang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiang, Binyan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123474974","display_name":"Defeng Sun","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sun, Defeng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128274140","display_name":"Yancheng Yuan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yuan, Yancheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5128221889","display_name":"Jian Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Jian","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5017506044"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11986","display_name":"Scientific Computing and Data Management","score":0.2076999992132187,"subfield":{"id":"https://openalex.org/subfields/1802","display_name":"Information Systems and Management"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11986","display_name":"Scientific Computing and Data Management","score":0.2076999992132187,"subfield":{"id":"https://openalex.org/subfields/1802","display_name":"Information Systems and Management"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T13910","display_name":"Computational and Text Analysis Methods","score":0.18770000338554382,"subfield":{"id":"https://openalex.org/subfields/3300","display_name":"General Social Sciences"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11948","display_name":"Machine Learning in Materials Science","score":0.1867000013589859,"subfield":{"id":"https://openalex.org/subfields/2505","display_name":"Materials Chemistry"},"field":{"id":"https://openalex.org/fields/25","display_name":"Materials Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/suite","display_name":"Suite","score":0.5947999954223633},{"id":"https://openalex.org/keywords/statistical-model","display_name":"Statistical model","score":0.5493999719619751},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.4997999966144562},{"id":"https://openalex.org/keywords/metadata","display_name":"Metadata","score":0.4927000105381012},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.43560001254081726},{"id":"https://openalex.org/keywords/function","display_name":"Function (biology)","score":0.43389999866485596},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.4278999865055084},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.4242999851703644},{"id":"https://openalex.org/keywords/knowledge-base","display_name":"Knowledge base","score":0.37689998745918274}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7731000185012817},{"id":"https://openalex.org/C79581498","wikidata":"https://www.wikidata.org/wiki/Q1367530","display_name":"Suite","level":2,"score":0.5947999954223633},{"id":"https://openalex.org/C114289077","wikidata":"https://www.wikidata.org/wiki/Q3284399","display_name":"Statistical model","level":2,"score":0.5493999719619751},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.4997999966144562},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.49810001254081726},{"id":"https://openalex.org/C93518851","wikidata":"https://www.wikidata.org/wiki/Q180160","display_name":"Metadata","level":2,"score":0.4927000105381012},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.43560001254081726},{"id":"https://openalex.org/C14036430","wikidata":"https://www.wikidata.org/wiki/Q3736076","display_name":"Function (biology)","level":2,"score":0.43389999866485596},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.4278999865055084},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.4242999851703644},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.41200000047683716},{"id":"https://openalex.org/C4554734","wikidata":"https://www.wikidata.org/wiki/Q593744","display_name":"Knowledge base","level":2,"score":0.37689998745918274},{"id":"https://openalex.org/C115901376","wikidata":"https://www.wikidata.org/wiki/Q184199","display_name":"Automation","level":2,"score":0.35929998755455017},{"id":"https://openalex.org/C86037889","wikidata":"https://www.wikidata.org/wiki/Q4330127","display_name":"Learning to rank","level":3,"score":0.3310999870300293},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3276999890804291},{"id":"https://openalex.org/C189430467","wikidata":"https://www.wikidata.org/wiki/Q7293293","display_name":"Ranking (information retrieval)","level":2,"score":0.3244999945163727},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.319599986076355},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.29910001158714294},{"id":"https://openalex.org/C89686163","wikidata":"https://www.wikidata.org/wiki/Q1187982","display_name":"Vector space model","level":2,"score":0.29660001397132874},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.2867000102996826},{"id":"https://openalex.org/C164226766","wikidata":"https://www.wikidata.org/wiki/Q7293202","display_name":"Rank (graph theory)","level":2,"score":0.2822999954223633},{"id":"https://openalex.org/C2776207758","wikidata":"https://www.wikidata.org/wiki/Q5303302","display_name":"Downstream (manufacturing)","level":2,"score":0.2752000093460083},{"id":"https://openalex.org/C161156560","wikidata":"https://www.wikidata.org/wiki/Q1638872","display_name":"Document retrieval","level":2,"score":0.27090001106262207},{"id":"https://openalex.org/C9357733","wikidata":"https://www.wikidata.org/wiki/Q6878417","display_name":"Missing data","level":2,"score":0.2655999958515167},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.26339998841285706},{"id":"https://openalex.org/C2986587452","wikidata":"https://www.wikidata.org/wiki/Q938438","display_name":"Statistical analysis","level":2,"score":0.2572000026702881},{"id":"https://openalex.org/C42058472","wikidata":"https://www.wikidata.org/wiki/Q810214","display_name":"Base (topology)","level":2,"score":0.2524999976158142}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2603.04743","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2603.04743","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.04743","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2603.04743","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/15","score":0.42992573976516724,"display_name":"Life in Land"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"Language":[1],"Model":[2],"(LLM)":[3],"agents":[4,122],"can":[5],"automate":[6],"data-science":[7],"workflows,":[8],"but":[9],"many":[10],"rigorous":[11],"statistical":[12,23,115,177],"methods":[13],"implemented":[14],"in":[15,123],"R":[16,62,73,108,176],"remain":[17],"underused":[18],"because":[19],"LLMs":[20],"struggle":[21],"with":[22,92],"knowledge":[24],"and":[25,35,99,111,173],"tool":[26],"retrieval.":[27,64],"Existing":[28],"retrieval-augmented":[29],"approaches":[30],"focus":[31],"on":[32,145,160],"function-level":[33],"semantics":[34],"ignore":[36],"data":[37,55],"distribution,":[38],"producing":[39],"suboptimal":[40],"matches.":[41],"We":[42],"propose":[43],"DARE":[44,128,154],"(Distribution-Aware":[45],"Retrieval":[46],"Embedding),":[47],"a":[48,71,112],"lightweight,":[49],"plug-and-play":[50],"retrieval":[51,97,147],"model":[52,87],"that":[53,88],"incorporates":[54],"distribution":[56],"information":[57],"into":[58,155],"function":[59,93],"representations":[60],"for":[61,106,118],"package":[63,146],"Our":[65],"main":[66],"contributions":[67],"are:":[68],"(i)":[69],"RPKB,":[70],"curated":[72],"Package":[74],"Knowledge":[75],"Base":[76],"derived":[77],"from":[78],"8,191":[79],"high-quality":[80],"CRAN":[81],"packages;":[82],"(ii)":[83],"DARE,":[84],"an":[85,102,130],"embedding":[86,139],"fuses":[89],"distributional":[90],"features":[91],"metadata":[94],"to":[95,143],"improve":[96],"relevance;":[98],"(iii)":[100],"RCodingAgent,":[101],"R-oriented":[103],"LLM":[104,121,171],"agent":[105],"reliable":[107],"code":[109],"generation":[110],"suite":[113],"of":[114,134],"analysis":[116,162],"tasks":[117],"systematically":[119],"evaluating":[120],"realistic":[124],"analytical":[125],"scenarios.":[126],"Empirically,":[127],"achieves":[129],"NDCG":[131],"at":[132],"10":[133],"93.47%,":[135],"outperforming":[136],"state-of-the-art":[137],"open-source":[138],"models":[140],"by":[141],"up":[142],"17%":[144],"while":[148],"using":[149],"substantially":[150],"fewer":[151],"parameters.":[152],"Integrating":[153],"RCodingAgent":[156],"yields":[157],"significant":[158],"gains":[159],"downstream":[161],"tasks.":[163],"This":[164],"work":[165],"helps":[166],"narrow":[167],"the":[168,174],"gap":[169],"between":[170],"automation":[172],"mature":[175],"ecosystem.":[178]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-03-07T00:00:00"}
