{"id":"https://openalex.org/W2783357846","doi":"https://doi.org/10.1109/bigdata.2017.8257958","title":"Universal distant reading through metadata proxies with archivespark","display_name":"Universal distant reading through metadata proxies with archivespark","publication_year":2017,"publication_date":"2017-12-01","ids":{"openalex":"https://openalex.org/W2783357846","doi":"https://doi.org/10.1109/bigdata.2017.8257958","mag":"2783357846"},"language":"en","primary_location":{"id":"doi:10.1109/bigdata.2017.8257958","is_oa":false,"landing_page_url":"https://doi.org/10.1109/bigdata.2017.8257958","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2017 IEEE International Conference on Big Data (Big Data)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5080662484","display_name":"Helge Holzmann","orcid":"https://orcid.org/0000-0003-4811-6902"},"institutions":[{"id":"https://openalex.org/I4210136150","display_name":"L3S Research Center","ror":"https://ror.org/039t4wk02","country_code":"DE","type":"facility","lineage":["https://openalex.org/I114112103","https://openalex.org/I4210136150","https://openalex.org/I94509681"]}],"countries":["DE"],"is_corresponding":true,"raw_author_name":"Helge Holzmann","raw_affiliation_strings":["L3S Research Center, Hannover, Germany"],"affiliations":[{"raw_affiliation_string":"L3S Research Center, Hannover, Germany","institution_ids":["https://openalex.org/I4210136150"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5091006336","display_name":"Vinay Goel","orcid":null},"institutions":[{"id":"https://openalex.org/I4210124753","display_name":"Internet Archive","ror":"https://ror.org/02z468g17","country_code":"US","type":"archive","lineage":["https://openalex.org/I4210124753"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Vinay Goel","raw_affiliation_strings":["Internet Archive 300 Funston Avenue San, Francisco, CA, USA"],"affiliations":[{"raw_affiliation_string":"Internet Archive 300 Funston Avenue San, Francisco, CA, USA","institution_ids":["https://openalex.org/I4210124753"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5043184100","display_name":"Emily Novak Gustainis","orcid":null},"institutions":[{"id":"https://openalex.org/I4210136150","display_name":"L3S Research Center","ror":"https://ror.org/039t4wk02","country_code":"DE","type":"facility","lineage":["https://openalex.org/I114112103","https://openalex.org/I4210136150","https://openalex.org/I94509681"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Emily Novak Gustainis","raw_affiliation_strings":["L3S Research Center, Hannover, Germany"],"affiliations":[{"raw_affiliation_string":"L3S Research Center, Hannover, Germany","institution_ids":["https://openalex.org/I4210136150"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5080662484"],"corresponding_institution_ids":["https://openalex.org/I4210136150"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":3,"citation_normalized_percentile":{"value":0.24104062,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":95},"biblio":{"volume":"24","issue":null,"first_page":"459","last_page":"464"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9991999864578247,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9991999864578247,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11986","display_name":"Scientific Computing and Data Management","score":0.998199999332428,"subfield":{"id":"https://openalex.org/subfields/1802","display_name":"Information Systems and Management"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T12761","display_name":"Data Stream Mining Techniques","score":0.9945999979972839,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8650131225585938},{"id":"https://openalex.org/keywords/metadata","display_name":"Metadata","score":0.775723934173584},{"id":"https://openalex.org/keywords/reading","display_name":"Reading (process)","score":0.613871693611145},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.5759474039077759},{"id":"https://openalex.org/keywords/workflow","display_name":"Workflow","score":0.5472748279571533},{"id":"https://openalex.org/keywords/digitization","display_name":"Digitization","score":0.5274113416671753},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.5254927277565002},{"id":"https://openalex.org/keywords/reuse","display_name":"Reuse","score":0.450031042098999},{"id":"https://openalex.org/keywords/cultural-heritage","display_name":"Cultural heritage","score":0.448299378156662},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.4367797076702118},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.435134619474411},{"id":"https://openalex.org/keywords/data-science","display_name":"Data science","score":0.38917332887649536},{"id":"https://openalex.org/keywords/database","display_name":"Database","score":0.2510298490524292},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.105568528175354}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8650131225585938},{"id":"https://openalex.org/C93518851","wikidata":"https://www.wikidata.org/wiki/Q180160","display_name":"Metadata","level":2,"score":0.775723934173584},{"id":"https://openalex.org/C554936623","wikidata":"https://www.wikidata.org/wiki/Q199657","display_name":"Reading (process)","level":2,"score":0.613871693611145},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.5759474039077759},{"id":"https://openalex.org/C177212765","wikidata":"https://www.wikidata.org/wiki/Q627335","display_name":"Workflow","level":2,"score":0.5472748279571533},{"id":"https://openalex.org/C2779308522","wikidata":"https://www.wikidata.org/wiki/Q843958","display_name":"Digitization","level":2,"score":0.5274113416671753},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.5254927277565002},{"id":"https://openalex.org/C206588197","wikidata":"https://www.wikidata.org/wiki/Q846574","display_name":"Reuse","level":2,"score":0.450031042098999},{"id":"https://openalex.org/C60671577","wikidata":"https://www.wikidata.org/wiki/Q210272","display_name":"Cultural heritage","level":2,"score":0.448299378156662},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.4367797076702118},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.435134619474411},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.38917332887649536},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.2510298490524292},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.105568528175354},{"id":"https://openalex.org/C166957645","wikidata":"https://www.wikidata.org/wiki/Q23498","display_name":"Archaeology","level":1,"score":0.0},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.0},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0},{"id":"https://openalex.org/C18903297","wikidata":"https://www.wikidata.org/wiki/Q7150","display_name":"Ecology","level":1,"score":0.0},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C95457728","wikidata":"https://www.wikidata.org/wiki/Q309","display_name":"History","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/bigdata.2017.8257958","is_oa":false,"landing_page_url":"https://doi.org/10.1109/bigdata.2017.8257958","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2017 IEEE International Conference on Big Data (Big Data)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.8700000047683716,"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":14,"referenced_works":["https://openalex.org/W1975599245","https://openalex.org/W2074144235","https://openalex.org/W2207131393","https://openalex.org/W2339562889","https://openalex.org/W2340680040","https://openalex.org/W2357381334","https://openalex.org/W2417849518","https://openalex.org/W2462694512","https://openalex.org/W2560241432","https://openalex.org/W2735155365","https://openalex.org/W3099990142","https://openalex.org/W3103624702","https://openalex.org/W4300801043","https://openalex.org/W6668892125"],"related_works":["https://openalex.org/W2344247273","https://openalex.org/W2780279436","https://openalex.org/W2786752243","https://openalex.org/W1512334992","https://openalex.org/W4290785358","https://openalex.org/W2387970992","https://openalex.org/W2724898345","https://openalex.org/W907309559","https://openalex.org/W2566805851","https://openalex.org/W4310051385"],"abstract_inverted_index":{"Digitization":[0],"and":[1,14,22,74,119,188,213,221],"the":[2,29,162,216],"large-scale":[3],"preservation":[4],"of":[5,12,31,35,60,147,156,175,205,224],"digitized":[6],"content":[7],"have":[8,75,83],"engendered":[9],"new":[10],"ways":[11],"accessing":[13],"analyzing":[15],"collections":[16,33,62,101],"concurrent":[17],"with":[18,125],"other":[19],"data":[20,61,100,109,186],"mining":[21],"extraction":[23],"efforts.":[24],"Distant":[25],"reading":[26,37,53,134],"refers":[27],"to":[28,90,97,117,153,179,194,218],"analysis":[30],"entire":[32],"instead":[34],"close":[36],"individual":[38],"items":[39],"like":[40,63],"a":[41,131,181,196],"single":[42],"physical":[43],"book":[44],"or":[45,66,107],"electronic":[46],"document.":[47],"The":[48,115],"steps":[49],"performed":[50],"in":[51,95,159],"distant":[52,133],"are":[54,71],"often":[55,76],"common":[56],"across":[57,123,184],"various":[58],"types":[59,187],"books,":[64],"journals,":[65],"web":[67],"archives,":[68,94],"sources":[69,110],"that":[70],"very":[72,126],"valuable":[73],"been":[77],"neglected":[78],"as":[79],"Big":[80],"Data.":[81],"We":[82],"extended":[84],"our":[85],"tool":[86],"ArchiveSpark,":[87],"originally":[88],"designed":[89],"efficiently":[91],"process":[92,195],"Web":[93],"order":[96],"support":[98],"arbitrary":[99],"being":[102],"served":[103],"from":[104,161],"either":[105],"local":[106],"remote":[108],"by":[111],"using":[112],"metadata":[113],"proxies.":[114],"ability":[116],"share":[118],"reuse":[120,172],"researcher":[121],"workflows":[122],"disciplines":[124],"different":[127,197],"datasets":[128],"makes":[129],"ArchiveSpark":[130,192],"universal":[132],"framework.":[135],"In":[136],"this":[137],"paper,":[138],"we":[139],"describe":[140],"ArchiveSpark's":[141],"design":[142],"extensions":[143],"along":[144],"an":[145,191,200],"example":[146],"how":[148,169],"it":[149],"can":[150,171],"be":[151],"leveraged":[152],"analyze":[154],"symptoms":[155],"Polio":[157],"mentioned":[158],"journals":[160],"Medical":[163],"Heritage":[164],"Library.":[165],"Our":[166],"experiments":[167],"demonstrate":[168],"users":[170],"large":[173],"portions":[174],"their":[176],"job":[177,193],"pipeline":[178],"accomplish":[180],"specific":[182],"task":[183],"diverse":[185],"sources.":[189],"Migrating":[190],"dataset":[198],"introduces":[199],"additional":[201],"average":[202],"code":[203],"complexity":[204],"only":[206],"4.8%.":[207],"Its":[208],"expressiveness,":[209],"scalability,":[210],"extensibility,":[211],"reusability,":[212],"efficiency":[214],"has":[215],"potential":[217],"advance":[219],"novel":[220],"rich":[222],"methods":[223],"scholarly":[225],"inquiry.":[226]},"counts_by_year":[{"year":2022,"cited_by_count":1},{"year":2021,"cited_by_count":2}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
