{"id":"https://openalex.org/W4367047159","doi":"https://doi.org/10.1145/3543507.3583515","title":"Human-in-the-loop Regular Expression Extraction for Single Column Format Inconsistency","display_name":"Human-in-the-loop Regular Expression Extraction for Single Column Format Inconsistency","publication_year":2023,"publication_date":"2023-04-26","ids":{"openalex":"https://openalex.org/W4367047159","doi":"https://doi.org/10.1145/3543507.3583515"},"language":"en","primary_location":{"id":"doi:10.1145/3543507.3583515","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3543507.3583515","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM Web Conference 2023","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5033215704","display_name":"Shaochen Yu","orcid":"https://orcid.org/0000-0002-4526-1525"},"institutions":[{"id":"https://openalex.org/I165143802","display_name":"University of Queensland","ror":"https://ror.org/00rqy9422","country_code":"AU","type":"education","lineage":["https://openalex.org/I165143802"]}],"countries":["AU"],"is_corresponding":true,"raw_author_name":"Shaochen Yu","raw_affiliation_strings":["The University of Queensland, Australia"],"affiliations":[{"raw_affiliation_string":"The University of Queensland, Australia","institution_ids":["https://openalex.org/I165143802"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101540918","display_name":"Lei Han","orcid":"https://orcid.org/0000-0002-7777-3592"},"institutions":[{"id":"https://openalex.org/I165143802","display_name":"University of Queensland","ror":"https://ror.org/00rqy9422","country_code":"AU","type":"education","lineage":["https://openalex.org/I165143802"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Lei Han","raw_affiliation_strings":["The University of Queensland, Australia"],"affiliations":[{"raw_affiliation_string":"The University of Queensland, Australia","institution_ids":["https://openalex.org/I165143802"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5019857678","display_name":"Marta Indulska","orcid":"https://orcid.org/0000-0002-2156-4097"},"institutions":[{"id":"https://openalex.org/I165143802","display_name":"University of Queensland","ror":"https://ror.org/00rqy9422","country_code":"AU","type":"education","lineage":["https://openalex.org/I165143802"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Marta Indulska","raw_affiliation_strings":["The University of Queensland, Australia"],"affiliations":[{"raw_affiliation_string":"The University of Queensland, Australia","institution_ids":["https://openalex.org/I165143802"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5070591850","display_name":"Shazia Sadiq","orcid":"https://orcid.org/0000-0001-6739-4145"},"institutions":[{"id":"https://openalex.org/I165143802","display_name":"University of Queensland","ror":"https://ror.org/00rqy9422","country_code":"AU","type":"education","lineage":["https://openalex.org/I165143802"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Shazia Sadiq","raw_affiliation_strings":["The University of Queensland, Australia"],"affiliations":[{"raw_affiliation_string":"The University of Queensland, Australia","institution_ids":["https://openalex.org/I165143802"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5052565959","display_name":"Gianluca Demartini","orcid":"https://orcid.org/0000-0002-7311-3693"},"institutions":[{"id":"https://openalex.org/I165143802","display_name":"University of Queensland","ror":"https://ror.org/00rqy9422","country_code":"AU","type":"education","lineage":["https://openalex.org/I165143802"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Gianluca Demartini","raw_affiliation_strings":["The University of Queensland, Australia"],"affiliations":[{"raw_affiliation_string":"The University of Queensland, Australia","institution_ids":["https://openalex.org/I165143802"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5033215704"],"corresponding_institution_ids":["https://openalex.org/I165143802"],"apc_list":null,"apc_paid":null,"fwci":0.9178,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.77590646,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"3859","last_page":"3867"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.9983999729156494,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.9983999729156494,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9968000054359436,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9962999820709229,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/column","display_name":"Column (typography)","score":0.7011204957962036},{"id":"https://openalex.org/keywords/expression","display_name":"Expression (computer science)","score":0.588917076587677},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5565045475959778},{"id":"https://openalex.org/keywords/loop","display_name":"Loop (graph theory)","score":0.5455155372619629},{"id":"https://openalex.org/keywords/extraction","display_name":"Extraction (chemistry)","score":0.5246924757957458},{"id":"https://openalex.org/keywords/regular-expression","display_name":"Regular expression","score":0.4771137535572052},{"id":"https://openalex.org/keywords/chromatography","display_name":"Chromatography","score":0.29649052023887634},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.1999759078025818},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.19711703062057495},{"id":"https://openalex.org/keywords/chemistry","display_name":"Chemistry","score":0.13992872834205627},{"id":"https://openalex.org/keywords/computer-network","display_name":"Computer network","score":0.08774280548095703},{"id":"https://openalex.org/keywords/combinatorics","display_name":"Combinatorics","score":0.07434055209159851}],"concepts":[{"id":"https://openalex.org/C2780551164","wikidata":"https://www.wikidata.org/wiki/Q2306599","display_name":"Column (typography)","level":3,"score":0.7011204957962036},{"id":"https://openalex.org/C90559484","wikidata":"https://www.wikidata.org/wiki/Q778379","display_name":"Expression (computer science)","level":2,"score":0.588917076587677},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5565045475959778},{"id":"https://openalex.org/C184670325","wikidata":"https://www.wikidata.org/wiki/Q512604","display_name":"Loop (graph theory)","level":2,"score":0.5455155372619629},{"id":"https://openalex.org/C4725764","wikidata":"https://www.wikidata.org/wiki/Q844704","display_name":"Extraction (chemistry)","level":2,"score":0.5246924757957458},{"id":"https://openalex.org/C121329065","wikidata":"https://www.wikidata.org/wiki/Q185612","display_name":"Regular expression","level":2,"score":0.4771137535572052},{"id":"https://openalex.org/C43617362","wikidata":"https://www.wikidata.org/wiki/Q170050","display_name":"Chromatography","level":1,"score":0.29649052023887634},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.1999759078025818},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.19711703062057495},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.13992872834205627},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.08774280548095703},{"id":"https://openalex.org/C114614502","wikidata":"https://www.wikidata.org/wiki/Q76592","display_name":"Combinatorics","level":1,"score":0.07434055209159851},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3543507.3583515","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3543507.3583515","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM Web Conference 2023","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.4399999976158142}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":26,"referenced_works":["https://openalex.org/W72959484","https://openalex.org/W1689950805","https://openalex.org/W1992479406","https://openalex.org/W1993685897","https://openalex.org/W2038941723","https://openalex.org/W2039532210","https://openalex.org/W2064766209","https://openalex.org/W2077482713","https://openalex.org/W2080666934","https://openalex.org/W2106950427","https://openalex.org/W2126848435","https://openalex.org/W2132525863","https://openalex.org/W2275294428","https://openalex.org/W2398018337","https://openalex.org/W2406050948","https://openalex.org/W2421097601","https://openalex.org/W2531982678","https://openalex.org/W2612824201","https://openalex.org/W2787466570","https://openalex.org/W2978929516","https://openalex.org/W2979577724","https://openalex.org/W3035376455","https://openalex.org/W3206978869","https://openalex.org/W4212966400","https://openalex.org/W4213342056","https://openalex.org/W4240301789"],"related_works":["https://openalex.org/W2563334590","https://openalex.org/W24774503","https://openalex.org/W2807616588","https://openalex.org/W3094387502","https://openalex.org/W2014821076","https://openalex.org/W2922478741","https://openalex.org/W2181722423","https://openalex.org/W2119827098","https://openalex.org/W2052853109","https://openalex.org/W2101092231"],"abstract_inverted_index":{"Format":[0],"inconsistency":[1],"is":[2,123,156],"one":[3],"of":[4],"the":[5,91,101,130,135,153],"most":[6],"frequently":[7],"appearing":[8],"data":[9,14,71,161],"quality":[10],"issues":[11],"encountered":[12],"during":[13],"cleaning.":[15],"Existing":[16],"automated":[17],"approaches":[18,25],"commonly":[19],"lack":[20],"applicability":[21],"and":[22,75,82,108,145,148,158,163],"generalisability,":[23],"while":[24],"with":[26],"human":[27,118],"inputs":[28],"typically":[29],"require":[30],"specialized":[31],"skills":[32],"such":[33],"as":[34,132,134],"writing":[35],"regular":[36,92,121],"expressions.":[37],"This":[38],"paper":[39],"proposes":[40],"a":[41,57,84,114],"novel":[42,85],"hybrid":[43],"human-machine":[44],"system,":[45],"namely":[46],"\u201cData-Scanner-4C\u201d,":[47],"which":[48],"leverages":[49],"crowdsourcing":[50,107],"to":[51,66,89,100],"address":[52],"syntactic":[53],"format":[54,110],"inconsistencies":[55],"in":[56,113],"single":[58,115],"column":[59],"effectively.":[60],"We":[61,139],"first":[62],"ask":[63],"crowd":[64],"workers":[65],"create":[67],"examples":[68,99],"from":[69,97],"single-column":[70],"through":[72,142],"\u201cdata":[73],"selection\u201d":[74],"\u201cresult":[76],"validation\u201d":[77],"tasks.":[78],"Then,":[79],"we":[80],"propose":[81],"use":[83],"rule-based":[86],"learning":[87],"algorithm":[88],"infer":[90],"expressions":[93,122],"that":[94],"propagate":[95],"formats":[96],"created":[98],"entire":[102],"column.":[103],"Our":[104],"system":[105],"integrates":[106],"algorithmic":[109],"extraction":[111],"techniques":[112],"workflow.":[116],"Having":[117],"experts":[119],"write":[120],"no":[124],"longer":[125],"required,":[126],"thereby":[127],"reducing":[128],"both":[129,143],"time":[131],"well":[133],"opportunity":[136],"for":[137],"error.":[138],"conducted":[140],"experiments":[141],"synthetic":[144],"real-world":[146],"datasets,":[147],"our":[149],"results":[150],"show":[151],"how":[152],"proposed":[154],"approach":[155],"applicable":[157],"effective":[159],"across":[160],"types":[162],"formats.":[164]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
