{"id":"https://openalex.org/W4407123412","doi":"https://doi.org/10.48550/arxiv.2502.00329","title":"CoddLLM: Empowering Large Language Models for Data Analytics","display_name":"CoddLLM: Empowering Large Language Models for Data Analytics","publication_year":2025,"publication_date":"2025-02-01","ids":{"openalex":"https://openalex.org/W4407123412","doi":"https://doi.org/10.48550/arxiv.2502.00329"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2502.00329","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2502.00329","pdf_url":"https://arxiv.org/pdf/2502.00329","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2502.00329","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5040806743","display_name":"Jiani Zhang","orcid":"https://orcid.org/0000-0003-0074-6761"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zhang, Jiani","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5002532818","display_name":"Hengrui Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Hengrui","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5030014440","display_name":"Rishav Chakravarti","orcid":"https://orcid.org/0000-0002-1612-8231"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chakravarti, Rishav","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102837904","display_name":"Yiqun Hu","orcid":"https://orcid.org/0000-0001-9157-7865"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hu, Yiqun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102761306","display_name":"Patrick Ng","orcid":"https://orcid.org/0000-0001-8208-652X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ng, Patrick","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5002932353","display_name":"Asterios Katsifodimos","orcid":"https://orcid.org/0000-0002-6717-2945"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Katsifodimos, Asterios","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5006581225","display_name":"Huzefa Rangwala","orcid":"https://orcid.org/0000-0003-0435-0035"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rangwala, Huzefa","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5082384108","display_name":"George Karypis","orcid":"https://orcid.org/0000-0003-2753-1437"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Karypis, George","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5067621853","display_name":"Alon Halevy","orcid":"https://orcid.org/0000-0002-8717-7356"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Halevy, Alon","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5040806743"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.8410999774932861,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.8410999774932861,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T14280","display_name":"Big Data Technologies and Applications","score":0.7836999893188477,"subfield":{"id":"https://openalex.org/subfields/1802","display_name":"Information Systems and Management"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10538","display_name":"Data Mining Algorithms and Applications","score":0.6962000131607056,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/analytics","display_name":"Analytics","score":0.5746637582778931},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5614174604415894},{"id":"https://openalex.org/keywords/data-science","display_name":"Data science","score":0.472749799489975},{"id":"https://openalex.org/keywords/data-analysis","display_name":"Data analysis","score":0.45443007349967957},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.1455335021018982}],"concepts":[{"id":"https://openalex.org/C79158427","wikidata":"https://www.wikidata.org/wiki/Q485396","display_name":"Analytics","level":2,"score":0.5746637582778931},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5614174604415894},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.472749799489975},{"id":"https://openalex.org/C175801342","wikidata":"https://www.wikidata.org/wiki/Q1988917","display_name":"Data analysis","level":2,"score":0.45443007349967957},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.1455335021018982}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2502.00329","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2502.00329","pdf_url":"https://arxiv.org/pdf/2502.00329","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2502.00329","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2502.00329","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2502.00329","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2502.00329","pdf_url":"https://arxiv.org/pdf/2502.00329","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":false},"content_urls":{"pdf":"https://content.openalex.org/works/W4407123412.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4226266853","https://openalex.org/W4210252074","https://openalex.org/W3092201768","https://openalex.org/W2796632413","https://openalex.org/W2740083192","https://openalex.org/W2508885301","https://openalex.org/W2794907032","https://openalex.org/W4255802207","https://openalex.org/W4298181270","https://openalex.org/W4385267788"],"abstract_inverted_index":{"Large":[0],"Language":[1],"Models":[2],"(LLMs)":[3],"have":[4],"the":[5,35,88,127,153,162,192,219,249],"potential":[6],"to":[7,69,248],"revolutionize":[8],"data":[9,16,43,55,64,83,98,138,165,179,187,203],"analytics":[10,44,73],"by":[11,233],"simplifying":[12],"tasks":[13,107,118],"such":[14,117],"as":[15,29],"discovery":[17],"and":[18,66,100,112,126,133,156,181,202,238],"SQL":[19],"query":[20],"synthesis":[21],"through":[22],"natural":[23,131],"language":[24,132,154],"interactions.":[25],"This":[26],"work":[27],"serves":[28],"a":[30,53,80,91,142,170,215],"pivotal":[31],"first":[32],"step":[33],"toward":[34],"development":[36],"of":[37,63,90,94,123,159,164,174,194,243],"foundation":[38,144],"models":[39],"explicitly":[40],"designed":[41],"for":[42,57],"applications.":[45],"To":[46,151],"propel":[47],"this":[48,137],"vision":[49],"forward,":[50],"we":[51,103,140,167],"unveil":[52],"new":[54,106,143,216],"recipe":[56],"post-training":[58],"LLMs,":[59],"enhancing":[60],"their":[61],"comprehension":[62],"management":[65],"empowering":[67],"them":[68],"tackle":[70],"complex":[71],"real-world":[72],"tasks.":[74],"Specifically,":[75],"our":[76],"innovative":[77],"approach":[78],"includes":[79],"scalable":[81],"synthetic":[82],"generation":[84],"method":[85],"that":[86,108,116,198],"enables":[87],"creation":[89,125],"broad":[92],"spectrum":[93],"topics":[95],"centered":[96],"on":[97,149,177,186,229],"representation":[99],"manipulation.":[101],"Furthermore,":[102],"introduce":[104],"two":[105],"seamlessly":[109],"bridge":[110],"tables":[111],"text.":[113],"We":[114],"show":[115],"can":[119],"enhance":[120],"models'":[121],"understanding":[122,155],"schema":[124],"nuanced":[128],"translation":[129],"between":[130],"tabular":[134],"data.":[135],"Leveraging":[136],"recipe,":[139],"post-train":[141],"model,":[145],"named":[146],"CoddLLM,":[147],"based":[148],"Mistral-NeMo-12B.":[150],"assess":[152],"reasoning":[157],"capabilities":[158],"LLMs":[160],"in":[161,191,210,235,245],"realm":[163],"analytics,":[166],"contribute":[168],"AnalyticsMMLU,":[169,230],"benchmark":[171],"containing":[172],"thousands":[173],"multiple-choice":[175],"questions":[176],"databases,":[178],"analysis,":[180],"machine":[182],"learning.":[183],"Our":[184],"focus":[185],"discovery,":[188],"has":[189],"resulted":[190],"contribution":[193],"three":[195],"comprehensive":[196],"benchmarks":[197],"address":[199],"both":[200],"database":[201],"lake":[204],"scenarios.":[205],"CoddLLM":[206],"not":[207],"only":[208],"excels":[209],"performance":[211],"but":[212],"also":[213],"sets":[214],"standard,":[217],"achieving":[218],"highest":[220],"average":[221,241],"accuracy":[222],"across":[223],"eight":[224],"datasets.":[225],"It":[226],"outperforms":[227],"GPT-3.5-Turbo":[228],"exceeding":[231],"GPT-4o":[232],"12.1%":[234],"table":[236],"selection":[237],"showing":[239],"an":[240],"improvement":[242],"24.9%":[244],"Text-to-SQL":[246],"compared":[247],"base":[250],"model.":[251]},"counts_by_year":[],"updated_date":"2026-03-11T14:59:36.786465","created_date":"2025-10-10T00:00:00"}
