{"id":"https://openalex.org/W3014560542","doi":"https://doi.org/10.1145/3383583.3398542","title":"The Case For Alternative Web Archival Formats To Expedite The Data-To-Insight Cycle","display_name":"The Case For Alternative Web Archival Formats To Expedite The Data-To-Insight Cycle","publication_year":2020,"publication_date":"2020-08-01","ids":{"openalex":"https://openalex.org/W3014560542","doi":"https://doi.org/10.1145/3383583.3398542","mag":"3014560542"},"language":"en","primary_location":{"id":"doi:10.1145/3383583.3398542","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3383583.3398542","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3383583.3398542","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM/IEEE Joint Conference on Digital Libraries in 2020","raw_type":"proceedings-article"},"type":"article","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3383583.3398542","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Xinyue Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I859038795","display_name":"Virginia Tech","ror":"https://ror.org/02smfhw86","country_code":"US","type":"education","lineage":["https://openalex.org/I859038795"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Xinyue Wang","raw_affiliation_strings":["Virginia Polytechnic Institute and State University, Blacksburg, VA, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Virginia Polytechnic Institute and State University, Blacksburg, VA, USA","institution_ids":["https://openalex.org/I859038795"]}]},{"author_position":"last","author":{"id":null,"display_name":"Zhiwu Xie","orcid":null},"institutions":[{"id":"https://openalex.org/I859038795","display_name":"Virginia Tech","ror":"https://ror.org/02smfhw86","country_code":"US","type":"education","lineage":["https://openalex.org/I859038795"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Zhiwu Xie","raw_affiliation_strings":["Virginia Polytechnic Institute and State University, Blacksburg, VA, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Virginia Polytechnic Institute and State University, Blacksburg, VA, USA","institution_ids":["https://openalex.org/I859038795"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.6483,"has_fulltext":true,"cited_by_count":9,"citation_normalized_percentile":{"value":0.71313267,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"177","last_page":"186"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9937000274658203,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9937000274658203,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11986","display_name":"Scientific Computing and Data Management","score":0.9923999905586243,"subfield":{"id":"https://openalex.org/subfields/1802","display_name":"Information Systems and Management"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T12761","display_name":"Data Stream Mining Techniques","score":0.9916999936103821,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/reuse","display_name":"Reuse","score":0.5986999869346619},{"id":"https://openalex.org/keywords/web-service","display_name":"Web service","score":0.36640000343322754},{"id":"https://openalex.org/keywords/file-format","display_name":"File format","score":0.3644999861717224},{"id":"https://openalex.org/keywords/web-application","display_name":"Web application","score":0.32919999957084656},{"id":"https://openalex.org/keywords/backward-compatibility","display_name":"Backward compatibility","score":0.2727999985218048}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6481000185012817},{"id":"https://openalex.org/C206588197","wikidata":"https://www.wikidata.org/wiki/Q846574","display_name":"Reuse","level":2,"score":0.5986999869346619},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.5684000253677368},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.3790000081062317},{"id":"https://openalex.org/C35578498","wikidata":"https://www.wikidata.org/wiki/Q193424","display_name":"Web service","level":2,"score":0.36640000343322754},{"id":"https://openalex.org/C97250363","wikidata":"https://www.wikidata.org/wiki/Q235557","display_name":"File format","level":2,"score":0.3644999861717224},{"id":"https://openalex.org/C118643609","wikidata":"https://www.wikidata.org/wiki/Q189210","display_name":"Web application","level":2,"score":0.32919999957084656},{"id":"https://openalex.org/C20574231","wikidata":"https://www.wikidata.org/wiki/Q844605","display_name":"Backward compatibility","level":2,"score":0.2727999985218048},{"id":"https://openalex.org/C21959979","wikidata":"https://www.wikidata.org/wiki/Q36774","display_name":"Web page","level":2,"score":0.2581000030040741},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.24899999797344208}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1145/3383583.3398542","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3383583.3398542","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3383583.3398542","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM/IEEE Joint Conference on Digital Libraries in 2020","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2003.14046","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2003.14046","pdf_url":"https://arxiv.org/pdf/2003.14046","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"pmh:oai:vtechworks.lib.vt.edu:10919/98565","is_oa":true,"landing_page_url":"http://hdl.handle.net/10919/98565","pdf_url":null,"source":{"id":"https://openalex.org/S4306400248","display_name":"VTechWorks (Virginia Tech)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I859038795","host_organization_name":"Virginia Tech","host_organization_lineage":["https://openalex.org/I859038795"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-nc-nd","license_id":"https://openalex.org/licenses/cc-by-nc-nd","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Text"}],"best_oa_location":{"id":"doi:10.1145/3383583.3398542","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3383583.3398542","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3383583.3398542","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM/IEEE Joint Conference on Digital Libraries in 2020","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G2037717774","display_name":"III: Small: Collaborative Research: Global Event and Trend Archive Research (GETAR)","funder_award_id":"1619028","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G3223590597","display_name":null,"funder_award_id":"IIS-1619028","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G5856991230","display_name":null,"funder_award_id":"1619028 and 1619371","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G609424727","display_name":null,"funder_award_id":"LG-71-16-0037-16","funder_id":"https://openalex.org/F4320306122","funder_display_name":"Institute of Museum and Library Services"},{"id":"https://openalex.org/G7886008252","display_name":"III: Small: Collaborative Research: Global Event and Trend Archive Research (GETAR)","funder_award_id":"1619371","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"},{"id":"https://openalex.org/F4320306122","display_name":"Institute of Museum and Library Services","ror":"https://ror.org/030prv062"}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W3014560542.pdf","grobid_xml":"https://content.openalex.org/works/W3014560542.grobid-xml"},"referenced_works_count":31,"referenced_works":["https://openalex.org/W90672828","https://openalex.org/W1519919627","https://openalex.org/W1630502859","https://openalex.org/W1989017925","https://openalex.org/W2013455564","https://openalex.org/W2019183416","https://openalex.org/W2074935284","https://openalex.org/W2097411811","https://openalex.org/W2114303224","https://openalex.org/W2145549626","https://openalex.org/W2155970976","https://openalex.org/W2157004265","https://openalex.org/W2167378365","https://openalex.org/W2195858147","https://openalex.org/W2293827470","https://openalex.org/W2346198741","https://openalex.org/W2464674082","https://openalex.org/W2577232253","https://openalex.org/W2740290295","https://openalex.org/W2765587785","https://openalex.org/W2789998316","https://openalex.org/W2896555078","https://openalex.org/W2945842971","https://openalex.org/W4232719369","https://openalex.org/W4236875673","https://openalex.org/W4251909731","https://openalex.org/W4382682204","https://openalex.org/W6675589627","https://openalex.org/W6738164040","https://openalex.org/W6754502429","https://openalex.org/W6906355099"],"related_works":[],"abstract_inverted_index":{"The":[0],"WARC":[1,59,114,146],"file":[2],"format":[3,60],"is":[4],"widely":[5],"used":[6],"by":[7,112],"web":[8,13,23,140,150],"archives":[9,24,32],"to":[10,29,44,77,90,103,143],"preserve":[11],"collected":[12],"content":[14],"for":[15,37,65,130,138],"future":[16],"use.":[17],"With":[18],"the":[19,26,42,58,71,134,139],"rapid":[20],"growth":[21],"of":[22,74,101,106],"and":[25,39,82],"increasing":[27],"interest":[28],"reuse":[30],"these":[31,46,75,94,122],"as":[33],"big":[34],"data":[35,47,79],"sources":[36],"statistical":[38],"analytical":[40],"research,":[41],"speed":[43],"turn":[45],"into":[48,116],"insights":[49],"becomes":[50],"critical.":[51],"In":[52],"this":[53],"paper":[54],"we":[55],"show":[56],"that":[57],"carries":[61],"significant":[62],"performance":[63,99],"penalties":[64,76],"batch":[66],"processing":[67],"workload.":[68],"We":[69,85],"trace":[70],"root":[72],"cause":[73],"its":[78],"structure,":[80],"encoding,":[81],"addressing":[83],"method.":[84],"then":[86],"run":[87],"controlled":[88],"experiments":[89],"illustrate":[91],"how":[92],"severe":[93],"problems":[95],"can":[96,108],"be.":[97],"Indeed,":[98],"gain":[100],"one":[102],"two":[104],"orders":[105],"magnitude":[107],"be":[109],"achieved":[110],"simply":[111],"reformatting":[113],"files":[115],"Parquet":[117],"or":[118,132],"Avro":[119,131],"formats.":[120,152],"While":[121],"results":[123],"do":[124],"not":[125],"necessarily":[126],"constitute":[127],"an":[128],"endorsement":[129],"Parquet,":[133],"time":[135],"has":[136],"come":[137],"archiving":[141],"community":[142],"consider":[144],"replacing":[145],"with":[147],"more":[148],"efficient":[149],"archival":[151]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":4},{"year":2023,"cited_by_count":1},{"year":2022,"cited_by_count":1},{"year":2021,"cited_by_count":2}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2020-04-10T00:00:00"}
