{"id":"https://openalex.org/W7130713371","doi":"https://doi.org/10.48550/arxiv.2602.17215","title":"NotebookRAG: Retrieving Multiple Notebooks to Augment the Generation of EDA Notebooks for Crowd-Wisdom","display_name":"NotebookRAG: Retrieving Multiple Notebooks to Augment the Generation of EDA Notebooks for Crowd-Wisdom","publication_year":2026,"publication_date":"2026-02-19","ids":{"openalex":"https://openalex.org/W7130713371","doi":"https://doi.org/10.48550/arxiv.2602.17215"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2602.17215","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.17215","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2602.17215","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5126460059","display_name":"Yi Shan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shan, Yi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124976168","display_name":"Yixuan He","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"He, Yixuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126475150","display_name":"Zekai Shao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shao, Zekai","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126476888","display_name":"Kai Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Kai","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5126483060","display_name":"Siming Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Siming","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10799","display_name":"Data Visualization and Analytics","score":0.8102999925613403,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10799","display_name":"Data Visualization and Analytics","score":0.8102999925613403,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11986","display_name":"Scientific Computing and Data Management","score":0.030400000512599945,"subfield":{"id":"https://openalex.org/subfields/1802","display_name":"Information Systems and Management"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T13910","display_name":"Computational and Text Analysis Methods","score":0.0210999995470047,"subfield":{"id":"https://openalex.org/subfields/3300","display_name":"General Social Sciences"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/executable","display_name":"Executable","score":0.7889999747276306},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.7329000234603882},{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.6845999956130981},{"id":"https://openalex.org/keywords/reuse","display_name":"Reuse","score":0.5723999738693237},{"id":"https://openalex.org/keywords/exploratory-analysis","display_name":"Exploratory analysis","score":0.40709999203681946},{"id":"https://openalex.org/keywords/source-code","display_name":"Source code","score":0.4041000008583069},{"id":"https://openalex.org/keywords/exploratory-data-analysis","display_name":"Exploratory data analysis","score":0.35830000042915344},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.3434999883174896},{"id":"https://openalex.org/keywords/construct","display_name":"Construct (python library)","score":0.329800009727478}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8215000033378601},{"id":"https://openalex.org/C160145156","wikidata":"https://www.wikidata.org/wiki/Q778586","display_name":"Executable","level":2,"score":0.7889999747276306},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.7329000234603882},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.6845999956130981},{"id":"https://openalex.org/C206588197","wikidata":"https://www.wikidata.org/wiki/Q846574","display_name":"Reuse","level":2,"score":0.5723999738693237},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.42410001158714294},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.41830000281333923},{"id":"https://openalex.org/C3018260909","wikidata":"https://www.wikidata.org/wiki/Q1322871","display_name":"Exploratory analysis","level":2,"score":0.40709999203681946},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.4041000008583069},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.36079999804496765},{"id":"https://openalex.org/C120894424","wikidata":"https://www.wikidata.org/wiki/Q1322871","display_name":"Exploratory data analysis","level":2,"score":0.35830000042915344},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.3434999883174896},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.33329999446868896},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.329800009727478},{"id":"https://openalex.org/C75553542","wikidata":"https://www.wikidata.org/wiki/Q178161","display_name":"A priori and a posteriori","level":2,"score":0.3287000060081482},{"id":"https://openalex.org/C51929080","wikidata":"https://www.wikidata.org/wiki/Q2425187","display_name":"Codebase","level":3,"score":0.3125999867916107},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.3125},{"id":"https://openalex.org/C2780977526","wikidata":"https://www.wikidata.org/wiki/Q42417149","display_name":"Data exploration","level":3,"score":0.2944999933242798},{"id":"https://openalex.org/C2780613888","wikidata":"https://www.wikidata.org/wiki/Q6423394","display_name":"Knowledge retrieval","level":3,"score":0.2919999957084656},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.2833000123500824},{"id":"https://openalex.org/C162319229","wikidata":"https://www.wikidata.org/wiki/Q175263","display_name":"Data structure","level":2,"score":0.2800000011920929},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.2750999927520752},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.2720000147819519},{"id":"https://openalex.org/C551230270","wikidata":"https://www.wikidata.org/wiki/Q4368942","display_name":"Data retrieval","level":2,"score":0.26989999413490295},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.2694000005722046},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.2678000032901764},{"id":"https://openalex.org/C175801342","wikidata":"https://www.wikidata.org/wiki/Q1988917","display_name":"Data analysis","level":2,"score":0.2635999917984009},{"id":"https://openalex.org/C120567893","wikidata":"https://www.wikidata.org/wiki/Q1582085","display_name":"Knowledge extraction","level":2,"score":0.26350000500679016},{"id":"https://openalex.org/C164614171","wikidata":"https://www.wikidata.org/wiki/Q5204775","display_name":"DECIPHER","level":2,"score":0.258899986743927},{"id":"https://openalex.org/C64260653","wikidata":"https://www.wikidata.org/wiki/Q1194864","display_name":"Electronic design automation","level":2,"score":0.2574999928474426},{"id":"https://openalex.org/C89505385","wikidata":"https://www.wikidata.org/wiki/Q47146","display_name":"User interface","level":2,"score":0.25220000743865967}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2602.17215","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.17215","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2602.17215","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.17215","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"High-quality":[0],"exploratory":[1],"data":[2,9,149],"analysis":[3,34,51],"(EDA)":[4],"is":[5,44],"essential":[6],"in":[7,190],"the":[8,185],"science":[10],"pipeline,":[11],"but":[12,79],"remains":[13],"highly":[14],"dependent":[15],"on":[16],"analysts'":[17],"expertise":[18],"and":[19,36,39,56,88,113,121,144,154,172,193],"effort.":[20],"While":[21],"recent":[22],"LLM-based":[23],"approaches":[24],"partially":[25],"reduce":[26],"this":[27],"burden,":[28],"they":[29],"struggle":[30],"to":[31,75,90,118,150,165],"generate":[32,151],"effective":[33,167],"plans":[35],"appropriate":[37,174],"insights":[38],"visualizations":[40,153],"when":[41],"user":[42,110,179],"intent":[43],"abstract.":[45],"Meanwhile,":[46],"a":[47,72,106,178],"vast":[48],"collection":[49],"of":[50,187],"notebooks":[52,84,115],"produced":[53],"across":[54],"platforms":[55],"organizations":[57],"contains":[58],"rich":[59],"analytical":[60],"knowledge":[61,95],"that":[62,108],"can":[63],"potentially":[64],"guide":[65],"automated":[66,127],"EDA.":[67,98],"Retrieval-augmented":[68],"generation":[69],"(RAG)":[70],"provides":[71],"natural":[73],"way":[74],"leverage":[76],"such":[77],"corpora,":[78],"general":[80],"methods":[81],"often":[82],"treat":[83],"as":[85,116],"static":[86],"documents":[87],"fail":[89],"fully":[91],"exploit":[92],"their":[93],"potential":[94],"for":[96,126],"automating":[97],"To":[99],"address":[100],"these":[101],"limitations,":[102],"we":[103,132],"propose":[104],"NotebookRAG,":[105],"method":[107,189],"takes":[109],"intent,":[111],"datasets,":[112],"existing":[114],"input":[117],"retrieve,":[119],"enhance,":[120],"reuse":[122],"relevant":[123],"notebook":[124],"content":[125,164],"EDA":[128,168,195],"generation.":[129],"For":[130,157],"retrieval,":[131],"transform":[133],"code":[134],"cells":[135],"into":[136],"context-enriched":[137],"executable":[138],"components,":[139],"which":[140],"improve":[141],"retrieval":[142,163],"quality":[143],"enable":[145],"rerun":[146],"with":[147,181],"new":[148],"updated":[152],"reliable":[155],"insights.":[156],"generation,":[158],"an":[159],"agent":[160],"leverages":[161],"enhanced":[162],"construct":[166],"plans,":[169],"derive":[170],"insights,":[171],"produce":[173],"visualizations.":[175],"Evidence":[176],"from":[177],"study":[180],"24":[182],"participants":[183],"confirms":[184],"superiority":[186],"our":[188],"producing":[191],"high-quality":[192],"intent-aligned":[194],"notebooks.":[196]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-02-21T00:00:00"}
