{"id":"https://openalex.org/W4285100304","doi":"https://doi.org/10.1145/3539781.3539795","title":"Toward a big data analysis system for historical newspaper collections research","display_name":"Toward a big data analysis system for historical newspaper collections research","publication_year":2022,"publication_date":"2022-06-27","ids":{"openalex":"https://openalex.org/W4285100304","doi":"https://doi.org/10.1145/3539781.3539795"},"language":"en","primary_location":{"id":"doi:10.1145/3539781.3539795","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3539781.3539795","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Platform for Advanced Scientific Computing Conference","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5076155332","display_name":"Sandeep Puthanveetil Satheesan","orcid":"https://orcid.org/0000-0001-9075-3740"},"institutions":[{"id":"https://openalex.org/I157725225","display_name":"University of Illinois Urbana-Champaign","ror":"https://ror.org/047426m28","country_code":"US","type":"education","lineage":["https://openalex.org/I157725225"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Sandeep Puthanveetil Satheesan","raw_affiliation_strings":["University of Illinois at Urbana-Champaign"],"affiliations":[{"raw_affiliation_string":"University of Illinois at Urbana-Champaign","institution_ids":["https://openalex.org/I157725225"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5054430588","display_name":"Bhavya","orcid":null},"institutions":[{"id":"https://openalex.org/I157725225","display_name":"University of Illinois Urbana-Champaign","ror":"https://ror.org/047426m28","country_code":"US","type":"education","lineage":["https://openalex.org/I157725225"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Bhavya","raw_affiliation_strings":["University of Illinois at Urbana-Champaign"],"affiliations":[{"raw_affiliation_string":"University of Illinois at Urbana-Champaign","institution_ids":["https://openalex.org/I157725225"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5000722483","display_name":"Adam Davies","orcid":"https://orcid.org/0000-0002-0610-2732"},"institutions":[{"id":"https://openalex.org/I157725225","display_name":"University of Illinois Urbana-Champaign","ror":"https://ror.org/047426m28","country_code":"US","type":"education","lineage":["https://openalex.org/I157725225"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Adam Davies","raw_affiliation_strings":["University of Illinois at Urbana-Champaign"],"affiliations":[{"raw_affiliation_string":"University of Illinois at Urbana-Champaign","institution_ids":["https://openalex.org/I157725225"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5034113478","display_name":"Alan B. Craig","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Alan B. Craig","raw_affiliation_strings":["Discovery Environment"],"affiliations":[{"raw_affiliation_string":"Discovery Environment","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100433414","display_name":"Yu Zhang","orcid":"https://orcid.org/0000-0001-7889-2676"},"institutions":[{"id":"https://openalex.org/I127339247","display_name":"California State University System","ror":"https://ror.org/020qm1538","country_code":"US","type":"education","lineage":["https://openalex.org/I127339247"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yu Zhang","raw_affiliation_strings":["California State University"],"affiliations":[{"raw_affiliation_string":"California State University","institution_ids":["https://openalex.org/I127339247"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5028518494","display_name":"ChengXiang Zhai","orcid":"https://orcid.org/0000-0002-6434-3702"},"institutions":[{"id":"https://openalex.org/I157725225","display_name":"University of Illinois Urbana-Champaign","ror":"https://ror.org/047426m28","country_code":"US","type":"education","lineage":["https://openalex.org/I157725225"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"ChengXiang Zhai","raw_affiliation_strings":["University of Illinois at Urbana-Champaign"],"affiliations":[{"raw_affiliation_string":"University of Illinois at Urbana-Champaign","institution_ids":["https://openalex.org/I157725225"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5076155332"],"corresponding_institution_ids":["https://openalex.org/I157725225"],"apc_list":null,"apc_paid":null,"fwci":1.5157,"has_fulltext":false,"cited_by_count":5,"citation_normalized_percentile":{"value":0.86046426,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":96},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"11"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12034","display_name":"Digital and Cyber Forensics","score":0.9901999831199646,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12034","display_name":"Digital and Cyber Forensics","score":0.9901999831199646,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.984000027179718,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11241","display_name":"Advanced Malware Detection Techniques","score":0.9787999987602234,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/newspaper","display_name":"Newspaper","score":0.9167097806930542},{"id":"https://openalex.org/keywords/terabyte","display_name":"Terabyte","score":0.8385167121887207},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7368241548538208},{"id":"https://openalex.org/keywords/big-data","display_name":"Big data","score":0.6856592893600464},{"id":"https://openalex.org/keywords/data-science","display_name":"Data science","score":0.5988646745681763},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.5208523869514465},{"id":"https://openalex.org/keywords/social-media","display_name":"Social media","score":0.45989304780960083},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.44821280241012573},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.37944239377975464},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.1528303623199463},{"id":"https://openalex.org/keywords/database","display_name":"Database","score":0.14048632979393005}],"concepts":[{"id":"https://openalex.org/C201280247","wikidata":"https://www.wikidata.org/wiki/Q11032","display_name":"Newspaper","level":2,"score":0.9167097806930542},{"id":"https://openalex.org/C199683683","wikidata":"https://www.wikidata.org/wiki/Q8799","display_name":"Terabyte","level":2,"score":0.8385167121887207},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7368241548538208},{"id":"https://openalex.org/C75684735","wikidata":"https://www.wikidata.org/wiki/Q858810","display_name":"Big data","level":2,"score":0.6856592893600464},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.5988646745681763},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.5208523869514465},{"id":"https://openalex.org/C518677369","wikidata":"https://www.wikidata.org/wiki/Q202833","display_name":"Social media","level":2,"score":0.45989304780960083},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.44821280241012573},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.37944239377975464},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.1528303623199463},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.14048632979393005},{"id":"https://openalex.org/C112698675","wikidata":"https://www.wikidata.org/wiki/Q37038","display_name":"Advertising","level":1,"score":0.0},{"id":"https://openalex.org/C144133560","wikidata":"https://www.wikidata.org/wiki/Q4830453","display_name":"Business","level":0,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3539781.3539795","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3539781.3539795","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Platform for Advanced Scientific Computing Conference","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.4000000059604645,"display_name":"Peace, Justice and strong institutions","id":"https://metadata.un.org/sdg/16"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":28,"referenced_works":["https://openalex.org/W20077218","https://openalex.org/W89374220","https://openalex.org/W1861492603","https://openalex.org/W1876542659","https://openalex.org/W1966592756","https://openalex.org/W1969767174","https://openalex.org/W2001642682","https://openalex.org/W2031342552","https://openalex.org/W2043701535","https://openalex.org/W2046436249","https://openalex.org/W2108705671","https://openalex.org/W2129154270","https://openalex.org/W2206826311","https://openalex.org/W2250539671","https://openalex.org/W2497005299","https://openalex.org/W2513522215","https://openalex.org/W2875407561","https://openalex.org/W2878884661","https://openalex.org/W2943761765","https://openalex.org/W3008567343","https://openalex.org/W3011870923","https://openalex.org/W3030961769","https://openalex.org/W3045184637","https://openalex.org/W3099178230","https://openalex.org/W3104388511","https://openalex.org/W4230613726","https://openalex.org/W4237342689","https://openalex.org/W4241588219"],"related_works":["https://openalex.org/W2936171637","https://openalex.org/W1586214342","https://openalex.org/W2260589296","https://openalex.org/W3157828377","https://openalex.org/W181157820","https://openalex.org/W3137329302","https://openalex.org/W4241817935","https://openalex.org/W2937168573","https://openalex.org/W2805468299","https://openalex.org/W2990494149"],"abstract_inverted_index":{"The":[0],"availability":[1],"and":[2,42,121,132,153,160,191],"generation":[3],"of":[4,88,102,114,116,174,184],"digitized":[5,24,103],"newspaper":[6,77,104],"collections":[7,147],"have":[8,95],"provided":[9],"researchers":[10,50],"in":[11,60,187,199],"several":[12],"domains":[13],"with":[14,83,163],"a":[15,29,40,85,90],"powerful":[16],"tool":[17],"to":[18,51,74,112,126,140,156,179,196],"advance":[19],"their":[20],"research.":[21],"More":[22],"specifically,":[23],"historical":[25,76,127],"newspapers":[26],"give":[27],"us":[28],"magnifying":[30],"glass":[31],"into":[32],"the":[33,64,97,176,181,188,200],"past.":[34],"In":[35],"this":[36],"paper,":[37],"we":[38,94],"propose":[39],"scalable":[41],"customizable":[43],"big":[44],"data":[45,146],"analysis":[46,72,166],"system":[47,178],"that":[48],"enables":[49],"study":[52,180],"complex":[53],"questions":[54,143],"about":[55],"our":[56,81],"society":[57],"as":[58],"depicted":[59],"news":[61],"media":[62],"for":[63],"past":[65],"few":[66],"centuries":[67],"by":[68,148],"applying":[69,118,149],"cutting-edge":[70],"text":[71],"tools":[73],"large":[75],"collections.":[78],"We":[79,168],"discuss":[80,192],"experience":[82],"building":[84],"preliminary":[86,172],"version":[87],"such":[89,145],"system,":[91],"including":[92],"how":[93],"addressed":[96],"following":[98],"challenges:":[99],"processing":[100],"millions":[101],"pages":[105],"from":[106,144],"various":[107],"publications":[108,134],"worldwide,":[109],"which":[110,129],"amount":[111],"hundreds":[113],"terabytes":[115],"data;":[117],"article":[119],"segmentation":[120],"Optical":[122],"Character":[123],"Recognition":[124],"(OCR)":[125],"newspapers,":[128],"vary":[130],"between":[131],"within":[133],"over":[135],"time;":[136],"retrieving":[137],"relevant":[138],"information":[139],"answer":[141],"research":[142],"human-in-the-loop":[150],"machine":[151],"learning;":[152],"enabling":[154],"users":[155],"analyze":[157],"topic":[158],"evolution":[159],"semantic":[161],"dynamics":[162],"multiple":[164],"compatible":[165],"operators.":[167],"also":[169],"present":[170],"some":[171],"results":[173],"using":[175],"proposed":[177],"social":[182],"construction":[183],"juvenile":[185],"delinquency":[186],"United":[189],"States":[190],"important":[193],"remaining":[194],"challenges":[195],"be":[197],"tackled":[198],"future.":[201]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":2},{"year":2023,"cited_by_count":2}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
