{"id":"https://openalex.org/W4407355407","doi":"https://doi.org/10.1145/3709677","title":"<scp>DataVinci:</scp> Learning Syntactic and Semantic String Repairs","display_name":"<scp>DataVinci:</scp> Learning Syntactic and Semantic String Repairs","publication_year":2025,"publication_date":"2025-02-10","ids":{"openalex":"https://openalex.org/W4407355407","doi":"https://doi.org/10.1145/3709677"},"language":"en","primary_location":{"id":"doi:10.1145/3709677","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3709677","pdf_url":null,"source":{"id":"https://openalex.org/S4387289859","display_name":"Proceedings of the ACM on Management of Data","issn_l":"2836-6573","issn":["2836-6573"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM on Management of Data","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5058098675","display_name":"Mukul Singh","orcid":"https://orcid.org/0000-0001-9510-4512"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Mukul Singh","raw_affiliation_strings":["Microsoft, Redmond, Washington, USA"],"raw_orcid":"https://orcid.org/0000-0001-9510-4512","affiliations":[{"raw_affiliation_string":"Microsoft, Redmond, Washington, USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5046282128","display_name":"Jos\u00e9 Cambronero","orcid":"https://orcid.org/0000-0002-0713-6141"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jos\u00e9 Cambronero","raw_affiliation_strings":["Microsoft, Redmond, Washington, USA"],"raw_orcid":"https://orcid.org/0000-0002-0713-6141","affiliations":[{"raw_affiliation_string":"Microsoft, Redmond, Washington, USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5011543162","display_name":"Sumit Gulwani","orcid":"https://orcid.org/0000-0002-9226-9634"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Sumit Gulwani","raw_affiliation_strings":["Microsoft Research, Redmond, Washington, USA"],"raw_orcid":"https://orcid.org/0000-0002-9226-9634","affiliations":[{"raw_affiliation_string":"Microsoft Research, Redmond, Washington, USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5051355395","display_name":"Vu Le","orcid":"https://orcid.org/0000-0003-3727-3291"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Vu Le","raw_affiliation_strings":["Microsoft, Redmond, Washington, USA"],"raw_orcid":"https://orcid.org/0000-0003-3727-3291","affiliations":[{"raw_affiliation_string":"Microsoft, Redmond, Washington, USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5082623545","display_name":"Carina Negreanu","orcid":"https://orcid.org/0000-0003-2130-7223"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Carina Negreanu","raw_affiliation_strings":["Robin AI, Cambridge, United Kingdom"],"raw_orcid":"https://orcid.org/0000-0003-2130-7223","affiliations":[{"raw_affiliation_string":"Robin AI, Cambridge, United Kingdom","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5000188805","display_name":"Arjun Radhakrishna","orcid":"https://orcid.org/0000-0002-5559-5932"},"institutions":[{"id":"https://openalex.org/I1290206253","display_name":"Microsoft (United States)","ror":"https://ror.org/00d0nc645","country_code":"US","type":"company","lineage":["https://openalex.org/I1290206253"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Arjun Radhakrishna","raw_affiliation_strings":["Microsoft, Redmond, Washington, USA"],"raw_orcid":"https://orcid.org/0000-0002-5559-5932","affiliations":[{"raw_affiliation_string":"Microsoft, Redmond, Washington, USA","institution_ids":["https://openalex.org/I1290206253"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5055735931","display_name":"Gust Verbruggen","orcid":"https://orcid.org/0000-0001-9182-597X"},"institutions":[{"id":"https://openalex.org/I4210151458","display_name":"Microsoft (Belgium)","ror":"https://ror.org/05168yk81","country_code":"BE","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210151458"]}],"countries":["BE"],"is_corresponding":false,"raw_author_name":"Gust Verbruggen","raw_affiliation_strings":["Microsoft, Keerbergen, Belgium"],"raw_orcid":"https://orcid.org/0000-0001-9182-597X","affiliations":[{"raw_affiliation_string":"Microsoft, Keerbergen, Belgium","institution_ids":["https://openalex.org/I4210151458"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":5.2763,"has_fulltext":false,"cited_by_count":3,"citation_normalized_percentile":{"value":0.94589271,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":98,"max":99},"biblio":{"volume":"3","issue":"1","first_page":"1","last_page":"26"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9988999962806702,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9988999962806702,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9983000159263611,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11269","display_name":"Algorithms and Data Compression","score":0.9979000091552734,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/string","display_name":"String (physics)","score":0.6658686399459839},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5849520564079285},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.5243328213691711},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.38860195875167847},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.33574891090393066},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.09329482913017273},{"id":"https://openalex.org/keywords/philosophy","display_name":"Philosophy","score":0.06918874382972717}],"concepts":[{"id":"https://openalex.org/C157486923","wikidata":"https://www.wikidata.org/wiki/Q1376436","display_name":"String (physics)","level":2,"score":0.6658686399459839},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5849520564079285},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5243328213691711},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.38860195875167847},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.33574891090393066},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.09329482913017273},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.06918874382972717},{"id":"https://openalex.org/C37914503","wikidata":"https://www.wikidata.org/wiki/Q156495","display_name":"Mathematical physics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3709677","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3709677","pdf_url":null,"source":{"id":"https://openalex.org/S4387289859","display_name":"Proceedings of the ACM on Management of Data","issn_l":"2836-6573","issn":["2836-6573"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM on Management of Data","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":23,"referenced_works":["https://openalex.org/W2106895292","https://openalex.org/W2202043092","https://openalex.org/W2437617937","https://openalex.org/W2548122763","https://openalex.org/W2591700809","https://openalex.org/W2798323405","https://openalex.org/W2898335306","https://openalex.org/W2929941791","https://openalex.org/W2943955885","https://openalex.org/W2948145720","https://openalex.org/W2951621897","https://openalex.org/W2981852735","https://openalex.org/W3004034804","https://openalex.org/W3034942164","https://openalex.org/W3082197983","https://openalex.org/W3105977086","https://openalex.org/W3205927779","https://openalex.org/W3206370442","https://openalex.org/W4210643485","https://openalex.org/W4288089799","https://openalex.org/W4372046852","https://openalex.org/W4387667125","https://openalex.org/W6676014748"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W3204019825"],"abstract_inverted_index":{"String":[0],"data":[1,30,87,117,126,169,186,192,199,214],"is":[2],"common":[3],"in":[4,10,67,103,173],"real-world":[5],"datasets:":[6],"67.6%":[7],"of":[8,13,101,161],"values":[9,102,108,140],"a":[11,33,83,99,104],"sample":[12],"1.8":[14],"million":[15],"real":[16],"Excel":[17],"spreadsheets":[18],"from":[19,183],"the":[20,47,56,125,130,190],"web":[21],"were":[22],"represented":[23],"as":[24,116,141,193,219],"text.":[25],"Automatically":[26],"cleaning":[27],"such":[28,113],"string":[29,86],"can":[31,120,171,179],"have":[32],"significant":[34],"impact":[35],"on":[36,61,129,212,221],"users.":[37],"Previous":[38],"approaches":[39],"are":[40,164],"limited":[41],"to":[42,54,124,156,195],"error":[43,88,127,215],"detection,":[44],"require":[45],"that":[46,71,97,109,163,201],"user":[48],"provides":[49],"annotations,":[50],"examples,":[51],"or":[52,64],"constraints":[53],"fix":[55],"errors,":[57],"and":[58,77,90,106,133,149,197,217,224],"focus":[59],"independently":[60],"syntactic":[62,76,148],"errors":[63,66],"semantic":[65,78,150],"strings,":[68],"but":[69],"ignore":[70],"strings":[72,145,162],"often":[73],"contain":[74],"both":[75,147,213],"substrings.":[79],"We":[80],"introduce":[81],"DataVinci,":[82],"fully":[84],"unsupervised":[85],"detection":[89,216],"repair":[91,218],"system.":[92],"DataVinci":[93,119,152,178,207],"learns":[94],"regular-expression-based":[95],"patterns":[96,115,132],"cover":[98],"majority":[100,114,131,139,174],"column":[105],"reports":[107],"do":[110],"not":[111,167,203],"satisfy":[112],"errors.":[118],"automatically":[121],"derive":[122],"edits":[123],"based":[128],"using":[134],"row":[135],"tuples":[136],"associated":[137],"with":[138,146],"examples.":[142],"To":[143],"handle":[144],"substrings,":[151],"uses":[153,189],"an":[154,184],"LLM":[155],"abstract":[157],"(and":[158],"re-concretize)":[159],"portions":[160],"semantic.":[165],"Because":[166],"all":[168],"columns":[170],"result":[172],"patterns,":[175],"when":[176],"available,":[177],"leverage":[180],"execution":[181],"information":[182],"existing":[185,223],"program":[187],"(which":[188],"target":[191],"input)":[194],"identify":[196],"correct":[198],"repairs":[200],"would":[202],"otherwise":[204],"be":[205],"identified.":[206],"outperforms":[208],"eleven":[209],"baseline":[210],"systems":[211],"demonstrated":[220],"four":[222],"new":[225],"benchmarks.":[226]},"counts_by_year":[{"year":2026,"cited_by_count":3}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
