{"id":"https://openalex.org/W4413845002","doi":"https://doi.org/10.14778/3734839.3734856","title":"Auto-Prep: Holistic Prediction of Data Preparation Steps for Self-Service Business Intelligence","display_name":"Auto-Prep: Holistic Prediction of Data Preparation Steps for Self-Service Business Intelligence","publication_year":2025,"publication_date":"2025-03-01","ids":{"openalex":"https://openalex.org/W4413845002","doi":"https://doi.org/10.14778/3734839.3734856"},"language":"en","primary_location":{"id":"doi:10.14778/3734839.3734856","is_oa":false,"landing_page_url":"https://doi.org/10.14778/3734839.3734856","pdf_url":null,"source":{"id":"https://openalex.org/S4210226185","display_name":"Proceedings of the VLDB Endowment","issn_l":"2150-8097","issn":["2150-8097"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the VLDB Endowment","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5114567474","display_name":"Eugenie Y. Lai","orcid":null},"institutions":[{"id":"https://openalex.org/I4210109586","display_name":"Moscow Institute of Thermal Technology","ror":"https://ror.org/021es5e59","country_code":"RU","type":"facility","lineage":["https://openalex.org/I4210109586"]}],"countries":["RU"],"is_corresponding":true,"raw_author_name":"Eugenie Y. Lai","raw_affiliation_strings":["MIT"],"affiliations":[{"raw_affiliation_string":"MIT","institution_ids":["https://openalex.org/I4210109586"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112931012","display_name":"Yeye He","orcid":null},"institutions":[{"id":"https://openalex.org/I4210164937","display_name":"Microsoft Research (United Kingdom)","ror":"https://ror.org/05k87vq12","country_code":"GB","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210164937"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Yeye He","raw_affiliation_strings":["Microsoft Research"],"affiliations":[{"raw_affiliation_string":"Microsoft Research","institution_ids":["https://openalex.org/I4210164937"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5038037154","display_name":"Surajit Chaudhuri","orcid":"https://orcid.org/0000-0001-8252-5270"},"institutions":[{"id":"https://openalex.org/I4210164937","display_name":"Microsoft Research (United Kingdom)","ror":"https://ror.org/05k87vq12","country_code":"GB","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210164937"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Surajit Chaudhuri","raw_affiliation_strings":["Microsoft Research"],"affiliations":[{"raw_affiliation_string":"Microsoft Research","institution_ids":["https://openalex.org/I4210164937"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5114567474"],"corresponding_institution_ids":["https://openalex.org/I4210109586"],"apc_list":null,"apc_paid":null,"fwci":2.1935,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.90290637,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":"18","issue":"7","first_page":"2212","last_page":"2225"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11891","display_name":"Big Data and Business Intelligence","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1404","display_name":"Management Information Systems"},"field":{"id":"https://openalex.org/fields/14","display_name":"Business, Management and Accounting"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10317","display_name":"Advanced Database Systems and Queries","score":0.9973000288009644,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/business-intelligence","display_name":"Business intelligence","score":0.5689077377319336},{"id":"https://openalex.org/keywords/service","display_name":"Service (business)","score":0.5276973247528076},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.4318792223930359},{"id":"https://openalex.org/keywords/self-service","display_name":"Self-service","score":0.42799362540245056},{"id":"https://openalex.org/keywords/data-science","display_name":"Data science","score":0.3564499616622925},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.2520480751991272},{"id":"https://openalex.org/keywords/business","display_name":"Business","score":0.1809530258178711},{"id":"https://openalex.org/keywords/marketing","display_name":"Marketing","score":0.10669955611228943},{"id":"https://openalex.org/keywords/computer-security","display_name":"Computer security","score":0.10066592693328857}],"concepts":[{"id":"https://openalex.org/C2767350","wikidata":"https://www.wikidata.org/wiki/Q6662173","display_name":"Business intelligence","level":2,"score":0.5689077377319336},{"id":"https://openalex.org/C2780378061","wikidata":"https://www.wikidata.org/wiki/Q25351891","display_name":"Service (business)","level":2,"score":0.5276973247528076},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.4318792223930359},{"id":"https://openalex.org/C2775906791","wikidata":"https://www.wikidata.org/wiki/Q1369310","display_name":"Self-service","level":2,"score":0.42799362540245056},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.3564499616622925},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.2520480751991272},{"id":"https://openalex.org/C144133560","wikidata":"https://www.wikidata.org/wiki/Q4830453","display_name":"Business","level":0,"score":0.1809530258178711},{"id":"https://openalex.org/C162853370","wikidata":"https://www.wikidata.org/wiki/Q39809","display_name":"Marketing","level":1,"score":0.10669955611228943},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.10066592693328857}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.14778/3734839.3734856","is_oa":false,"landing_page_url":"https://doi.org/10.14778/3734839.3734856","pdf_url":null,"source":{"id":"https://openalex.org/S4210226185","display_name":"Proceedings of the VLDB Endowment","issn_l":"2150-8097","issn":["2150-8097"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the VLDB Endowment","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":37,"referenced_works":["https://openalex.org/W1495859460","https://openalex.org/W1995999642","https://openalex.org/W2003584736","https://openalex.org/W2013167115","https://openalex.org/W2014267653","https://openalex.org/W2018223116","https://openalex.org/W2083755493","https://openalex.org/W2112129552","https://openalex.org/W2128793454","https://openalex.org/W2146105230","https://openalex.org/W2152922709","https://openalex.org/W2260484439","https://openalex.org/W2295598076","https://openalex.org/W2394595640","https://openalex.org/W2401610261","https://openalex.org/W2496170334","https://openalex.org/W2548695521","https://openalex.org/W2612824201","https://openalex.org/W2616147950","https://openalex.org/W2626990892","https://openalex.org/W2884499287","https://openalex.org/W2913173548","https://openalex.org/W2948163032","https://openalex.org/W2952611840","https://openalex.org/W2980180246","https://openalex.org/W2980728347","https://openalex.org/W3027879771","https://openalex.org/W3030026364","https://openalex.org/W3082197983","https://openalex.org/W3082849379","https://openalex.org/W3170190513","https://openalex.org/W4238813738","https://openalex.org/W4252980914","https://openalex.org/W4254907083","https://openalex.org/W4286447321","https://openalex.org/W4301391075","https://openalex.org/W4393183687"],"related_works":["https://openalex.org/W4327648025","https://openalex.org/W4405308738","https://openalex.org/W4381664321","https://openalex.org/W2889392607","https://openalex.org/W4241504035","https://openalex.org/W2146681649","https://openalex.org/W2563093951","https://openalex.org/W2366000099","https://openalex.org/W3135525263","https://openalex.org/W2307354236"],"abstract_inverted_index":{"Business":[0],"Intelligence":[1],"(BI)":[2],"plays":[3],"a":[4,20,66,175],"critical":[5],"role":[6],"in":[7,143],"empowering":[8],"modern":[9],"enterprises":[10],"to":[11,46,64,104,155,168],"make":[12],"informed":[13],"data-driven":[14],"decisions,":[15],"and":[16,29,111,125,137,172,201],"has":[17],"grown":[18],"into":[19],"billion-dollar":[21],"business.":[22],"Self-service":[23],"BI":[24,28,62,71,83,96,132,146,190],"tools":[25],"like":[26],"Power":[27],"Tableau":[30],"have":[31,103],"democratized":[32],"the":[33,57,91,95,131,144],"\"dashboarding\"":[34],"phase":[35,60,93],"of":[36,61,94,130],"BI,":[37],"by":[38,180],"offering":[39],"user-friendly,":[40],"drag-and-drop":[41],"interfaces":[42],"that":[43,56,100,135,149,193],"are":[44,140],"tailored":[45],"non-technical":[47],"enterprise":[48],"users.":[49],"However,":[50],"despite":[51],"these":[52,158],"advances,":[53],"we":[54,77,163],"observe":[55,99],"\"data":[58],"preparation\"":[59],"continues":[63],"be":[65,121],"key":[67],"pain":[68],"point":[69],"for":[70,123],"users":[72,101],"today.":[73],"In":[74],"this":[75,161],"work,":[76],"systematically":[78],"study":[79,129],"around":[80],"2K":[81],"real":[82,189],"projects":[84,191],"harvested":[85],"from":[86],"public":[87],"sources,":[88],"focusing":[89],"on":[90],"data-preparation":[92],"workflows.":[97],"We":[98],"often":[102,141],"program":[105],"both":[106,151],"(1)":[107],"data":[108,119],"transformation":[109,136,200],"steps":[110,139],"(2)":[112],"table":[113],"joins":[114],"steps,":[115,203],"before":[116],"their":[117],"raw":[118],"can":[120,195],"ready":[122],"dashboarding":[124],"analysis.":[126],"A":[127],"careful":[128],"workflows":[133],"reveals":[134],"join":[138,202],"intertwined":[142],"same":[145],"project,":[147],"such":[148,214],"considering":[150],"holistically":[152,169],"is":[153],"crucial":[154],"accurately":[156],"predict":[157,170,197],"steps.":[159],"Leveraging":[160],"observation,":[162],"develop":[164],"an":[165],"Auto-Prep":[166,194],"system":[167],"transformations":[171],"joins,":[173],"using":[174,188],"principled":[176],"graph-based":[177],"algorithm":[178],"inspired":[179],"Steiner-tree,":[181],"with":[182],"provable":[183],"quality":[184],"guarantees.":[185],"Extensive":[186],"evaluations":[187],"suggest":[192],"correctly":[196],"over":[198],"70%":[199],"significantly":[204],"more":[205],"accurate":[206],"than":[207],"existing":[208],"algorithms":[209],"as":[210,212,215],"well":[211],"language-models":[213],"GPT-4.":[216]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
