{"id":"https://openalex.org/W4386044703","doi":"https://doi.org/10.48550/arxiv.2308.09004","title":"Towards Lightweight Data Integration using Multi-workflow Provenance and Data Observability","display_name":"Towards Lightweight Data Integration using Multi-workflow Provenance and Data Observability","publication_year":2023,"publication_date":"2023-08-17","ids":{"openalex":"https://openalex.org/W4386044703","doi":"https://doi.org/10.48550/arxiv.2308.09004"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2308.09004","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2308.09004","pdf_url":"https://arxiv.org/pdf/2308.09004","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2308.09004","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5042209170","display_name":"Renan P. Souza","orcid":"https://orcid.org/0000-0002-9479-4432"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Souza, Renan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5079278437","display_name":"Tyler J. Skluzacek","orcid":"https://orcid.org/0000-0003-2242-4931"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Skluzacek, Tyler J.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5034261249","display_name":"Sean R. Wilkinson","orcid":"https://orcid.org/0000-0002-1443-7479"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wilkinson, Sean R.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5013879711","display_name":"Maxim Ziatdinov","orcid":"https://orcid.org/0000-0003-2570-4592"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ziatdinov, Maxim","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5072339196","display_name":"Rafael Ferreira da Silva","orcid":"https://orcid.org/0000-0002-1720-0928"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"da Silva, Rafael Ferreira","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":1,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11986","display_name":"Scientific Computing and Data Management","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1802","display_name":"Information Systems and Management"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11986","display_name":"Scientific Computing and Data Management","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1802","display_name":"Information Systems and Management"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9715999960899353,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.968999981880188,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/workflow","display_name":"Workflow","score":0.7960948944091797},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7625384330749512},{"id":"https://openalex.org/keywords/observability","display_name":"Observability","score":0.752508282661438},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.5591882467269897},{"id":"https://openalex.org/keywords/cloud-computing","display_name":"Cloud computing","score":0.5564247369766235},{"id":"https://openalex.org/keywords/supercomputer","display_name":"Supercomputer","score":0.5501575469970703},{"id":"https://openalex.org/keywords/software-engineering","display_name":"Software engineering","score":0.375308632850647},{"id":"https://openalex.org/keywords/data-science","display_name":"Data science","score":0.36171236634254456},{"id":"https://openalex.org/keywords/database","display_name":"Database","score":0.3517967462539673},{"id":"https://openalex.org/keywords/systems-engineering","display_name":"Systems engineering","score":0.32459908723831177},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.21314790844917297},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.10187867283821106}],"concepts":[{"id":"https://openalex.org/C177212765","wikidata":"https://www.wikidata.org/wiki/Q627335","display_name":"Workflow","level":2,"score":0.7960948944091797},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7625384330749512},{"id":"https://openalex.org/C36299963","wikidata":"https://www.wikidata.org/wiki/Q1369844","display_name":"Observability","level":2,"score":0.752508282661438},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.5591882467269897},{"id":"https://openalex.org/C79974875","wikidata":"https://www.wikidata.org/wiki/Q483639","display_name":"Cloud computing","level":2,"score":0.5564247369766235},{"id":"https://openalex.org/C83283714","wikidata":"https://www.wikidata.org/wiki/Q121117","display_name":"Supercomputer","level":2,"score":0.5501575469970703},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.375308632850647},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.36171236634254456},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.3517967462539673},{"id":"https://openalex.org/C201995342","wikidata":"https://www.wikidata.org/wiki/Q682496","display_name":"Systems engineering","level":1,"score":0.32459908723831177},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.21314790844917297},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.10187867283821106},{"id":"https://openalex.org/C28826006","wikidata":"https://www.wikidata.org/wiki/Q33521","display_name":"Applied mathematics","level":1,"score":0.0},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2308.09004","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2308.09004","pdf_url":"https://arxiv.org/pdf/2308.09004","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2308.09004","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2308.09004","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article-journal"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2308.09004","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2308.09004","pdf_url":"https://arxiv.org/pdf/2308.09004","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[{"display_name":"Industry, innovation and infrastructure","id":"https://metadata.un.org/sdg/9","score":0.5199999809265137}],"awards":[{"id":"https://openalex.org/G1489425746","display_name":null,"funder_award_id":"DE-AC05-00OR22725","funder_id":"https://openalex.org/F4320332359","funder_display_name":"Office of Science"},{"id":"https://openalex.org/G1645119126","display_name":null,"funder_award_id":"AC05-00OR22725","funder_id":"https://openalex.org/F4320306084","funder_display_name":"U.S. Department of Energy"},{"id":"https://openalex.org/G1719536385","display_name":null,"funder_award_id":"DE-AC05-00OR22725","funder_id":"https://openalex.org/F4320306084","funder_display_name":"U.S. Department of Energy"},{"id":"https://openalex.org/G4423657506","display_name":null,"funder_award_id":"AC05-00OR22725","funder_id":"https://openalex.org/F4320338287","funder_display_name":"Oak Ridge National Laboratory"},{"id":"https://openalex.org/G675320460","display_name":null,"funder_award_id":"DE-AC05-00OR22725","funder_id":"https://openalex.org/F4320338287","funder_display_name":"Oak Ridge National Laboratory"},{"id":"https://openalex.org/G7995982022","display_name":null,"funder_award_id":"DE-AC05","funder_id":"https://openalex.org/F4320306084","funder_display_name":"U.S. Department of Energy"},{"id":"https://openalex.org/G8269158468","display_name":null,"funder_award_id":"AC05-00OR22725","funder_id":"https://openalex.org/F4320332359","funder_display_name":"Office of Science"},{"id":"https://openalex.org/G8906985441","display_name":null,"funder_award_id":"00OR22725","funder_id":"https://openalex.org/F4320306084","funder_display_name":"U.S. Department of Energy"}],"funders":[{"id":"https://openalex.org/F4320306084","display_name":"U.S. Department of Energy","ror":"https://ror.org/01bj3aw27"},{"id":"https://openalex.org/F4320332359","display_name":"Office of Science","ror":"https://ror.org/00mmn6b08"},{"id":"https://openalex.org/F4320338287","display_name":"Oak Ridge National Laboratory","ror":"https://ror.org/01qz5mb56"}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4386044703.pdf","grobid_xml":"https://content.openalex.org/works/W4386044703.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2046459260","https://openalex.org/W2765830098","https://openalex.org/W2967463586","https://openalex.org/W1971989957","https://openalex.org/W2517338020","https://openalex.org/W3157641275","https://openalex.org/W4312300846","https://openalex.org/W4206221578","https://openalex.org/W3029572990","https://openalex.org/W2615757685"],"abstract_inverted_index":{"Modern":[0],"large-scale":[1],"scientific":[2,29],"discovery":[3],"requires":[4],"multidisciplinary":[5],"collaboration":[6],"across":[7],"diverse":[8],"computing":[9],"facilities,":[10],"including":[11],"High":[12],"Performance":[13],"Computing":[14],"(HPC)":[15],"machines":[16],"and":[17,44,64,75,94,101,121,146],"the":[18,33,48,109,112,185],"Edge-to-Cloud":[19],"continuum.":[20],"Integrated":[21,86],"data":[22,70,91,123,143],"analysis":[23,141],"plays":[24],"a":[25,127,149],"crucial":[26],"role":[27],"in":[28,32,111,148,169],"discovery,":[30],"especially":[31],"current":[34],"AI":[35,40],"era,":[36],"by":[37],"enabling":[38],"Responsible":[39],"development,":[41],"FAIR,":[42],"Reproducibility,":[43],"User":[45],"Steering.":[46],"However,":[47],"heterogeneous":[49],"nature":[50],"of":[51],"science":[52,158],"poses":[53],"challenges":[54],"such":[55],"as":[56],"dealing":[57],"with":[58,164],"multiple":[59,162],"supporting":[60],"tools,":[61],"cross-facility":[62],"environments,":[63],"efficient":[65],"HPC":[66],"execution.":[67],"Building":[68],"on":[69,161,180,184],"observability,":[71,106],"adapter":[72],"system":[73],"design,":[74],"provenance,":[76,120],"we":[77],"propose":[78],"MIDA:":[79],"an":[80],"approach":[81],"for":[82,97,131,156],"lightweight":[83],"runtime":[84,125],"Multi-workflow":[85],"Data":[87],"Analysis.":[88],"MIDA":[89],"defines":[90],"observability":[92],"strategies":[93],"adaptability":[95],"methods":[96],"various":[98],"parallel":[99],"systems":[100],"machine":[102],"learning":[103,153],"tools.":[104],"With":[105],"it":[107],"intercepts":[108],"dataflows":[110],"background":[113],"without":[114],"requiring":[115],"instrumentation":[116],"while":[117],"integrating":[118,142],"domain,":[119],"telemetry":[122],"at":[124],"into":[126],"unified":[128],"database":[129],"ready":[130],"user":[132],"steering":[133],"queries.":[134],"We":[135,171],"conduct":[136],"experiments":[137],"showing":[138],"end-to-end":[139],"multi-workflow":[140],"from":[144],"Dask":[145],"MLFlow":[147],"real":[150],"distributed":[151],"deep":[152],"use":[154],"case":[155],"materials":[157],"that":[159],"runs":[160],"environments":[163],"up":[165,176],"to":[166,177],"276":[167],"GPUs":[168],"parallel.":[170],"show":[172],"near-zero":[173],"overhead":[174],"running":[175],"100,000":[178],"tasks":[179],"1,680":[181],"CPU":[182],"cores":[183],"Summit":[186],"supercomputer.":[187]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-06-16T09:24:06.705377","created_date":"2023-08-22T00:00:00"}
