{"id":"https://openalex.org/W7140186931","doi":"https://doi.org/10.48550/arxiv.2603.20311","title":"kRAIG: A Natural Language-Driven Agent for Automated DataOps Pipeline Generation","display_name":"kRAIG: A Natural Language-Driven Agent for Automated DataOps Pipeline Generation","publication_year":2026,"publication_date":"2026-03-19","ids":{"openalex":"https://openalex.org/W7140186931","doi":"https://doi.org/10.48550/arxiv.2603.20311"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.20311","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.20311","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.20311","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Siva, Rohan","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Siva, Rohan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Cheung, Kai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cheung, Kai","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Li, Lichi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Lichi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Sundaram, Ganesh","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sundaram, Ganesh","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11986","display_name":"Scientific Computing and Data Management","score":0.19619999825954437,"subfield":{"id":"https://openalex.org/subfields/1802","display_name":"Information Systems and Management"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11986","display_name":"Scientific Computing and Data Management","score":0.19619999825954437,"subfield":{"id":"https://openalex.org/subfields/1802","display_name":"Information Systems and Management"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T12535","display_name":"Machine Learning and Data Classification","score":0.1664000004529953,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.121799997985363,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/workflow","display_name":"Workflow","score":0.7146999835968018},{"id":"https://openalex.org/keywords/executable","display_name":"Executable","score":0.6682999730110168},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.6668000221252441},{"id":"https://openalex.org/keywords/orchestration","display_name":"Orchestration","score":0.580299973487854},{"id":"https://openalex.org/keywords/abstraction","display_name":"Abstraction","score":0.4401000142097473},{"id":"https://openalex.org/keywords/reliability","display_name":"Reliability (semiconductor)","score":0.415800005197525},{"id":"https://openalex.org/keywords/ambiguity","display_name":"Ambiguity","score":0.4027000069618225},{"id":"https://openalex.org/keywords/pipeline-transport","display_name":"Pipeline transport","score":0.400299996137619},{"id":"https://openalex.org/keywords/data-transformation","display_name":"Data transformation","score":0.3903999924659729}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7443000078201294},{"id":"https://openalex.org/C177212765","wikidata":"https://www.wikidata.org/wiki/Q627335","display_name":"Workflow","level":2,"score":0.7146999835968018},{"id":"https://openalex.org/C160145156","wikidata":"https://www.wikidata.org/wiki/Q778586","display_name":"Executable","level":2,"score":0.6682999730110168},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.6668000221252441},{"id":"https://openalex.org/C199168358","wikidata":"https://www.wikidata.org/wiki/Q3367000","display_name":"Orchestration","level":3,"score":0.580299973487854},{"id":"https://openalex.org/C124304363","wikidata":"https://www.wikidata.org/wiki/Q673661","display_name":"Abstraction","level":2,"score":0.4401000142097473},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.4165000021457672},{"id":"https://openalex.org/C43214815","wikidata":"https://www.wikidata.org/wiki/Q7310987","display_name":"Reliability (semiconductor)","level":3,"score":0.415800005197525},{"id":"https://openalex.org/C2780522230","wikidata":"https://www.wikidata.org/wiki/Q1140419","display_name":"Ambiguity","level":2,"score":0.4027000069618225},{"id":"https://openalex.org/C175309249","wikidata":"https://www.wikidata.org/wiki/Q725864","display_name":"Pipeline transport","level":2,"score":0.400299996137619},{"id":"https://openalex.org/C150670458","wikidata":"https://www.wikidata.org/wiki/Q4272815","display_name":"Data transformation","level":3,"score":0.3903999924659729},{"id":"https://openalex.org/C204241405","wikidata":"https://www.wikidata.org/wiki/Q461499","display_name":"Transformation (genetics)","level":3,"score":0.38280001282691956},{"id":"https://openalex.org/C115901376","wikidata":"https://www.wikidata.org/wiki/Q184199","display_name":"Automation","level":2,"score":0.3806999921798706},{"id":"https://openalex.org/C24756922","wikidata":"https://www.wikidata.org/wiki/Q1757694","display_name":"Data quality","level":3,"score":0.34360000491142273},{"id":"https://openalex.org/C2780440489","wikidata":"https://www.wikidata.org/wiki/Q5227278","display_name":"Data-driven","level":2,"score":0.323199987411499},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.32100000977516174},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.31470000743865967},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.30809998512268066},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.29989999532699585},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.29649999737739563},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.2955000102519989},{"id":"https://openalex.org/C2779791154","wikidata":"https://www.wikidata.org/wiki/Q258040","display_name":"Model transformation","level":3,"score":0.2897000014781952},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.2849000096321106},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.2761000096797943},{"id":"https://openalex.org/C92446256","wikidata":"https://www.wikidata.org/wiki/Q3306762","display_name":"Data validation","level":2,"score":0.2705000042915344},{"id":"https://openalex.org/C89505385","wikidata":"https://www.wikidata.org/wiki/Q47146","display_name":"User interface","level":2,"score":0.2556000053882599}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.20311","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.20311","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.20311","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.20311","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Industry, innovation and infrastructure","id":"https://metadata.un.org/sdg/9","score":0.6801789999008179}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Modern":[0],"machine":[1],"learning":[2],"systems":[3],"rely":[4],"on":[5],"complex":[6],"data":[7,16,31,112,130,191],"engineering":[8,192],"workflows":[9,176],"to":[10,105,144,166],"extract,":[11],"transform,":[12],"and":[13,26,33,63,117,132,154,157,181,187],"load":[14],"(ELT)":[15],"into":[17,80],"production":[18],"pipelines.":[19,193],"However,":[20],"constructing":[21],"these":[22,50],"pipelines":[23],"remains":[24],"time-consuming":[25],"requires":[27],"substantial":[28],"expertise":[29],"in":[30,38,88,152,162],"infrastructure":[32],"orchestration":[34],"frameworks.":[35],"Recent":[36],"advances":[37],"large":[39],"language":[40,78],"model":[41],"(LLM)":[42],"agents":[43],"offer":[44],"a":[45,123,149,158],"potential":[46],"path":[47],"toward":[48],"automating":[49],"workflows,":[51],"but":[52],"existing":[53],"approaches":[54],"struggle":[55],"with":[56,177],"under-specified":[57],"user":[58,89],"intent,":[59,90],"unreliable":[60],"tool":[61,125],"generation,":[62],"limited":[64],"guarantees":[65],"of":[66,189],"executable":[67],"outputs.":[68],"We":[69],"introduce":[70],"kRAIG,":[71],"an":[72,97],"AI":[73],"agent":[74,175],"that":[75,100,139,173],"translates":[76],"natural":[77],"specifications":[79],"production-ready":[81],"Kubeflow":[82],"Pipelines":[83],"(KFP).":[84],"To":[85,128],"resolve":[86],"ambiguity":[87],"we":[91],"propose":[92],"ReQuesAct":[93],"(Reason,":[94],"Question,":[95],"Act),":[96],"interaction":[98],"framework":[99,147],"explicitly":[101],"clarifies":[102],"intent":[103,179],"prior":[104,143],"pipeline":[106,141],"synthesis.":[107],"The":[108],"system":[109],"orchestrates":[110],"end-to-end":[111],"movement":[113],"from":[114],"diverse":[115],"sources":[116],"generates":[118],"task-specific":[119],"transformation":[120,163],"components":[121],"through":[122],"retrieval-augmented":[124],"synthesis":[126],"process.":[127],"ensure":[129],"quality":[131],"safety,":[133],"kRAIG":[134],"incorporates":[135],"LLM-based":[136],"validation":[137,182],"stages":[138],"verify":[140],"integrity":[142],"execution.":[145],"Our":[146],"achieves":[148],"3x":[150],"improvement":[151],"extraction":[153],"loading":[155],"success":[156],"25":[159],"percent":[160],"increase":[161],"accuracy":[164],"compared":[165],"state-of-the-art":[167],"agentic":[168],"baselines.":[169],"These":[170],"improvements":[171],"demonstrate":[172],"structured":[174],"explicit":[178],"clarification":[180],"significantly":[183],"enhance":[184],"the":[185],"reliability":[186],"executability":[188],"automated":[190]},"counts_by_year":[],"updated_date":"2026-04-25T08:17:42.794288","created_date":"2026-03-25T00:00:00"}
