{"id":"https://openalex.org/W4416798144","doi":"https://doi.org/10.1109/access.2025.3638723","title":"A Hybrid Framework for Scalable Data Quality: Comparing PySpark and AI-Powered Validation in Microsoft Fabric","display_name":"A Hybrid Framework for Scalable Data Quality: Comparing PySpark and AI-Powered Validation in Microsoft Fabric","publication_year":2025,"publication_date":"2025-01-01","ids":{"openalex":"https://openalex.org/W4416798144","doi":"https://doi.org/10.1109/access.2025.3638723"},"language":null,"primary_location":{"id":"doi:10.1109/access.2025.3638723","is_oa":true,"landing_page_url":"https://doi.org/10.1109/access.2025.3638723","pdf_url":null,"source":{"id":"https://openalex.org/S2485537415","display_name":"IEEE Access","issn_l":"2169-3536","issn":["2169-3536"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Access","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","doaj"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1109/access.2025.3638723","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5092472890","display_name":"Dinesh Eswararaj","orcid":"https://orcid.org/0009-0008-4501-6248"},"institutions":[{"id":"https://openalex.org/I4210088288","display_name":"Compact Imaging (United States)","ror":"https://ror.org/0072bx170","country_code":"US","type":"company","lineage":["https://openalex.org/I4210088288"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Dinesh Eswararaj","raw_affiliation_strings":["Compunnel Software Inc., Irvine, CA, USA","Lead Data Engineer, Compunnel Software Inc, Irvine, CA, USA"],"raw_orcid":"https://orcid.org/0009-0008-4501-6248","affiliations":[{"raw_affiliation_string":"Compunnel Software Inc., Irvine, CA, USA","institution_ids":["https://openalex.org/I4210088288"]},{"raw_affiliation_string":"Lead Data Engineer, Compunnel Software Inc, Irvine, CA, USA","institution_ids":["https://openalex.org/I4210088288"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5119911177","display_name":"Vandana Kollati","orcid":"https://orcid.org/0009-0008-2829-3198"},"institutions":[{"id":"https://openalex.org/I1320354487","display_name":"Target (United States)","ror":"https://ror.org/05mkyac79","country_code":"US","type":"company","lineage":["https://openalex.org/I1320354487"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Vandana Kollati","raw_affiliation_strings":["Sogeti, Concord, NC, USA","Technical Data Architect, Sogeti, Concord, NC, USA"],"raw_orcid":"https://orcid.org/0009-0008-2829-3198","affiliations":[{"raw_affiliation_string":"Sogeti, Concord, NC, USA","institution_ids":["https://openalex.org/I1320354487"]},{"raw_affiliation_string":"Technical Data Architect, Sogeti, Concord, NC, USA","institution_ids":["https://openalex.org/I1320354487"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5095772298","display_name":"Lakshmana Rao Koppada","orcid":"https://orcid.org/0009-0005-6747-191X"},"institutions":[{"id":"https://openalex.org/I2801809184","display_name":"University of South Carolina Union","ror":"https://ror.org/04p549618","country_code":"US","type":"education","lineage":["https://openalex.org/I2801809184"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Lakshmana Rao Koppada","raw_affiliation_strings":["PwC, Union City, CA, USA","Technical Architect, PWC, Union City, CA, USA"],"raw_orcid":"https://orcid.org/0009-0005-6747-191X","affiliations":[{"raw_affiliation_string":"PwC, Union City, CA, USA","institution_ids":["https://openalex.org/I2801809184"]},{"raw_affiliation_string":"Technical Architect, PWC, Union City, CA, USA","institution_ids":["https://openalex.org/I2801809184"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120670582","display_name":"Ram Sekhar Bodala","orcid":"https://orcid.org/0009-0005-4646-6679"},"institutions":[{"id":"https://openalex.org/I2801576346","display_name":"Amtrak (United States)","ror":"https://ror.org/00zwndh37","country_code":"US","type":"company","lineage":["https://openalex.org/I2801576346"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"RAM Sekhar Bodala","raw_affiliation_strings":["Amtrak, Wilmington, DE, USA","Principal software Engineer, Amtrak, Wilmington, DE, USA"],"raw_orcid":"https://orcid.org/0009-0005-4646-6679","affiliations":[{"raw_affiliation_string":"Amtrak, Wilmington, DE, USA","institution_ids":["https://openalex.org/I2801576346"]},{"raw_affiliation_string":"Principal software Engineer, Amtrak, Wilmington, DE, USA","institution_ids":["https://openalex.org/I2801576346"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5120570630","display_name":"Ajay Babu Nellipudi","orcid":"https://orcid.org/0009-0005-5320-2065"},"institutions":[{"id":"https://openalex.org/I1292842697","display_name":"Environmental Systems Research Institute (United States)","ror":"https://ror.org/0428exr50","country_code":"US","type":"company","lineage":["https://openalex.org/I1292842697"]},{"id":"https://openalex.org/I155673605","display_name":"Union University","ror":"https://ror.org/02c561939","country_code":"US","type":"education","lineage":["https://openalex.org/I155673605"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ajay Babu Nellipudi","raw_affiliation_strings":["Sr. Applications Developer/Architect, ESRI, Redlands, CA, USA","Senior Application Architect, Union City, CA, USA"],"raw_orcid":"https://orcid.org/0009-0005-5320-2065","affiliations":[{"raw_affiliation_string":"Sr. Applications Developer/Architect, ESRI, Redlands, CA, USA","institution_ids":["https://openalex.org/I1292842697"]},{"raw_affiliation_string":"Senior Application Architect, Union City, CA, USA","institution_ids":["https://openalex.org/I155673605"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5092472890"],"corresponding_institution_ids":["https://openalex.org/I4210088288"],"apc_list":{"value":1850,"currency":"USD","value_usd":1850},"apc_paid":{"value":1850,"currency":"USD","value_usd":1850},"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.40930942,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"13","issue":null,"first_page":"205854","last_page":"205864"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.8776000142097473,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.8776000142097473,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11652","display_name":"Imbalanced Data Classification Techniques","score":0.013000000268220901,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11704","display_name":"Mobile Crowdsensing and Crowdsourcing","score":0.009100000374019146,"subfield":{"id":"https://openalex.org/subfields/1706","display_name":"Computer Science Applications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.7172999978065491},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.44769999384880066},{"id":"https://openalex.org/keywords/big-data","display_name":"Big data","score":0.43860000371932983},{"id":"https://openalex.org/keywords/data-quality","display_name":"Data quality","score":0.4318000078201294},{"id":"https://openalex.org/keywords/analytics","display_name":"Analytics","score":0.4163999855518341},{"id":"https://openalex.org/keywords/sentiment-analysis","display_name":"Sentiment analysis","score":0.4108999967575073},{"id":"https://openalex.org/keywords/data-modeling","display_name":"Data modeling","score":0.3797000050544739}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8709999918937683},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.7172999978065491},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.4893999993801117},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.44769999384880066},{"id":"https://openalex.org/C75684735","wikidata":"https://www.wikidata.org/wiki/Q858810","display_name":"Big data","level":2,"score":0.43860000371932983},{"id":"https://openalex.org/C24756922","wikidata":"https://www.wikidata.org/wiki/Q1757694","display_name":"Data quality","level":3,"score":0.4318000078201294},{"id":"https://openalex.org/C79158427","wikidata":"https://www.wikidata.org/wiki/Q485396","display_name":"Analytics","level":2,"score":0.4163999855518341},{"id":"https://openalex.org/C66402592","wikidata":"https://www.wikidata.org/wiki/Q2271421","display_name":"Sentiment analysis","level":2,"score":0.4108999967575073},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.3797000050544739},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3725999891757965},{"id":"https://openalex.org/C81669768","wikidata":"https://www.wikidata.org/wiki/Q2359161","display_name":"Precision and recall","level":2,"score":0.3391999900341034},{"id":"https://openalex.org/C92446256","wikidata":"https://www.wikidata.org/wiki/Q3306762","display_name":"Data validation","level":2,"score":0.335099995136261},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.3287000060081482},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3192000091075897},{"id":"https://openalex.org/C79974875","wikidata":"https://www.wikidata.org/wiki/Q483639","display_name":"Cloud computing","level":2,"score":0.31779998540878296},{"id":"https://openalex.org/C2779304628","wikidata":"https://www.wikidata.org/wiki/Q3503480","display_name":"Face (sociological concept)","level":2,"score":0.28839999437332153},{"id":"https://openalex.org/C72634772","wikidata":"https://www.wikidata.org/wiki/Q386824","display_name":"Data integration","level":2,"score":0.28760001063346863},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.27959999442100525},{"id":"https://openalex.org/C518677369","wikidata":"https://www.wikidata.org/wiki/Q202833","display_name":"Social media","level":2,"score":0.25029999017715454}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/access.2025.3638723","is_oa":true,"landing_page_url":"https://doi.org/10.1109/access.2025.3638723","pdf_url":null,"source":{"id":"https://openalex.org/S2485537415","display_name":"IEEE Access","issn_l":"2169-3536","issn":["2169-3536"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Access","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1109/access.2025.3638723","is_oa":true,"landing_page_url":"https://doi.org/10.1109/access.2025.3638723","pdf_url":null,"source":{"id":"https://openalex.org/S2485537415","display_name":"IEEE Access","issn_l":"2169-3536","issn":["2169-3536"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Access","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":9,"referenced_works":["https://openalex.org/W2083755493","https://openalex.org/W2103018059","https://openalex.org/W2202199124","https://openalex.org/W2575989903","https://openalex.org/W2911299907","https://openalex.org/W2991245758","https://openalex.org/W4213251304","https://openalex.org/W4408295312","https://openalex.org/W4409840763"],"related_works":[],"abstract_inverted_index":{"The":[0,90,126],"rapid":[1],"growth":[2],"of":[3],"big":[4],"data":[5,11,18,87,138,213],"demands":[6],"robust":[7],"frameworks":[8],"to":[9,84,104],"ensure":[10],"quality":[12,19,88,139,214],"in":[13,22,34,79,107,172],"distributed":[14],"platforms,":[15],"as":[16],"poor":[17],"undermines":[20],"insights":[21],"domains":[23],"like":[24],"e-commerce":[25],"and":[26,60,114,120,133,147,155,175,193],"social":[27],"media":[28],"analytics.":[29],"Traditional":[30],"PySpark-based":[31],"validation":[32,56,78,122,184],"excels":[33],"syntactic":[35,173,183],"checks":[36],"(e.g.,":[37,45],"pattern":[38,143,189],"violations)":[39],"but":[40,57],"struggles":[41],"with":[42,75],"semantic":[43,145,198],"anomalies":[44],"sentiment":[46,148,154],"drift,":[47],"null":[48,141],"detection).":[49],"Large":[50],"Language":[51],"Models":[52],"(LLMs)":[53],"offer":[54],"context-aware":[55],"face":[58],"scalability":[59,110,166],"cost":[61],"challenges.":[62],"This":[63],"study":[64],"proposes":[65],"an":[66],"Adaptive":[67],"Hybrid":[68,194],"framework":[69,91,161],"that":[70],"integrates":[71],"PySpark\u2019s":[72],"rule-based":[73],"efficiency":[74],"AI-powered":[76],"contextual":[77],"Microsoft":[80],"Fabric\u2019s":[81],"Lakehouse":[82],"environment":[83],"deliver":[85],"high-performance":[86],"assessment.":[89],"uses":[92],"a":[93,217],"composite":[94],"performance":[95],"score":[96],"(F":[97],"=":[98,112,186],"0.4A":[99],"+":[100],"0.4S":[101],"\u2212":[102],"0.2E)":[103],"quantify":[105],"trade-offs":[106],"accuracy":[108,164],"(A),":[109],"(S":[111],"1/T),":[113],"development":[115],"effort":[116,177],"(E),":[117],"ensuring":[118],"reliable":[119],"efficient":[121],"without":[123],"runtime":[124],"optimization.":[125],"weights":[127],"reflect":[128],"enterprise":[129],"priorities":[130],"for":[131,169,188,201,221],"precision":[132],"throughput.":[134],"Evaluated":[135],"across":[136],"four":[137],"scenarios":[140],"detection,":[142],"violations,":[144],"mismatches,":[146],"drift":[149],"using":[150],"real-world":[151],"Twitter":[152],"airline":[153],"Amazon":[156],"product":[157],"review":[158],"datasets,":[159],"the":[160],"achieves":[162],"high":[163],"(~0.9),":[165],"(1.898":[167],"s":[168],"5,000":[170],"rows":[171],"tasks),":[174],"low":[176],"(E":[178],"\u2248":[179],"0.3).":[180],"PySpark":[181],"dominates":[182],"(F1-score":[185],"0.791":[187],"violations),":[190],"while":[191],"AI":[192],"methods":[195],"significantly":[196],"improve":[197],"recall":[199],"(0.674":[200],"mismatches).":[202],"Unlike":[203],"prior":[204],"syntactic-only":[205],"or":[206],"text-classification":[207],"frameworks,":[208],"this":[209],"approach":[210],"bridges":[211],"structured":[212],"gaps,":[215],"offering":[216],"scalable,":[218],"intelligent":[219],"solution":[220],"modern":[222],"analytics":[223],"platforms.":[224]},"counts_by_year":[],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-11-28T00:00:00"}
