{"id":"https://openalex.org/W4281713691","doi":"https://doi.org/10.1145/3529372.3533298","title":"A prototype gutenberg-hathitrust sentence-level parallel corpus for OCR error analysis","display_name":"A prototype gutenberg-hathitrust sentence-level parallel corpus for OCR error analysis","publication_year":2022,"publication_date":"2022-06-06","ids":{"openalex":"https://openalex.org/W4281713691","doi":"https://doi.org/10.1145/3529372.3533298"},"language":"en","primary_location":{"id":"doi:10.1145/3529372.3533298","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3529372.3533298","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 22nd ACM/IEEE Joint Conference on Digital Libraries","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5050468340","display_name":"Ming Jiang","orcid":"https://orcid.org/0000-0002-3604-166X"},"institutions":[{"id":"https://openalex.org/I157725225","display_name":"University of Illinois Urbana-Champaign","ror":"https://ror.org/047426m28","country_code":"US","type":"education","lineage":["https://openalex.org/I157725225"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ming Jiang","raw_affiliation_strings":["University of Illinois Urbana-Champaign"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Illinois Urbana-Champaign","institution_ids":["https://openalex.org/I157725225"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5026851708","display_name":"Ryan Dubnicek","orcid":"https://orcid.org/0000-0001-7153-7030"},"institutions":[{"id":"https://openalex.org/I157725225","display_name":"University of Illinois Urbana-Champaign","ror":"https://ror.org/047426m28","country_code":"US","type":"education","lineage":["https://openalex.org/I157725225"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ryan C Dubnicek","raw_affiliation_strings":["University of Illinois Urbana-Champaign"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Illinois Urbana-Champaign","institution_ids":["https://openalex.org/I157725225"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5067651242","display_name":"Glen Worthey","orcid":"https://orcid.org/0000-0003-2785-0040"},"institutions":[{"id":"https://openalex.org/I157725225","display_name":"University of Illinois Urbana-Champaign","ror":"https://ror.org/047426m28","country_code":"US","type":"education","lineage":["https://openalex.org/I157725225"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Glen Worthey","raw_affiliation_strings":["University of Illinois Urbana-Champaign"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Illinois Urbana-Champaign","institution_ids":["https://openalex.org/I157725225"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5006779213","display_name":"Ted Underwood","orcid":"https://orcid.org/0000-0001-8960-1846"},"institutions":[{"id":"https://openalex.org/I157725225","display_name":"University of Illinois Urbana-Champaign","ror":"https://ror.org/047426m28","country_code":"US","type":"education","lineage":["https://openalex.org/I157725225"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ted Underwood","raw_affiliation_strings":["University of Illinois Urbana-Champaign"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Illinois Urbana-Champaign","institution_ids":["https://openalex.org/I157725225"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5016957740","display_name":"J. Stephen Downie","orcid":"https://orcid.org/0000-0001-9784-5090"},"institutions":[{"id":"https://openalex.org/I157725225","display_name":"University of Illinois Urbana-Champaign","ror":"https://ror.org/047426m28","country_code":"US","type":"education","lineage":["https://openalex.org/I157725225"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"J. Stephen Downie","raw_affiliation_strings":["University of Illinois Urbana-Champaign"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of Illinois Urbana-Champaign","institution_ids":["https://openalex.org/I157725225"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":3.0581,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.92817155,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":94,"max":96},"biblio":{"volume":"12036","issue":null,"first_page":"1","last_page":"5"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12377","display_name":"Digital Humanities and Scholarship","score":0.9851999878883362,"subfield":{"id":"https://openalex.org/subfields/1208","display_name":"Literature and Literary Theory"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T12377","display_name":"Digital Humanities and Scholarship","score":0.9851999878883362,"subfield":{"id":"https://openalex.org/subfields/1208","display_name":"Literature and Literary Theory"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.978600025177002,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11937","display_name":"Research Data Management Practices","score":0.9736999869346619,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8033925890922546},{"id":"https://openalex.org/keywords/sentence","display_name":"Sentence","score":0.6701935529708862},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.6646907925605774},{"id":"https://openalex.org/keywords/metadata","display_name":"Metadata","score":0.6149333715438843},{"id":"https://openalex.org/keywords/optical-character-recognition","display_name":"Optical character recognition","score":0.6097815036773682},{"id":"https://openalex.org/keywords/word","display_name":"Word (group theory)","score":0.6023660898208618},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.535914421081543},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.4659985303878784},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.4378350079059601},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.43226301670074463},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.21252873539924622},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.19608613848686218}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8033925890922546},{"id":"https://openalex.org/C2777530160","wikidata":"https://www.wikidata.org/wiki/Q41796","display_name":"Sentence","level":2,"score":0.6701935529708862},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.6646907925605774},{"id":"https://openalex.org/C93518851","wikidata":"https://www.wikidata.org/wiki/Q180160","display_name":"Metadata","level":2,"score":0.6149333715438843},{"id":"https://openalex.org/C546480517","wikidata":"https://www.wikidata.org/wiki/Q167555","display_name":"Optical character recognition","level":3,"score":0.6097815036773682},{"id":"https://openalex.org/C90805587","wikidata":"https://www.wikidata.org/wiki/Q10944557","display_name":"Word (group theory)","level":2,"score":0.6023660898208618},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.535914421081543},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.4659985303878784},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.4378350079059601},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.43226301670074463},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.21252873539924622},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.19608613848686218},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C120665830","wikidata":"https://www.wikidata.org/wiki/Q14620","display_name":"Optics","level":1,"score":0.0},{"id":"https://openalex.org/C111472728","wikidata":"https://www.wikidata.org/wiki/Q9471","display_name":"Epistemology","level":1,"score":0.0},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3529372.3533298","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3529372.3533298","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 22nd ACM/IEEE Joint Conference on Digital Libraries","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.75}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":18,"referenced_works":["https://openalex.org/W1482504747","https://openalex.org/W1485880740","https://openalex.org/W1547024747","https://openalex.org/W1990871427","https://openalex.org/W2053050503","https://openalex.org/W2069172670","https://openalex.org/W2158794898","https://openalex.org/W2790740913","https://openalex.org/W2798485145","https://openalex.org/W2968567788","https://openalex.org/W3015310959","https://openalex.org/W3046591412","https://openalex.org/W3100600904","https://openalex.org/W3106274667","https://openalex.org/W3157745755","https://openalex.org/W3178709966","https://openalex.org/W3212092655","https://openalex.org/W4255111644"],"related_works":["https://openalex.org/W2392768766","https://openalex.org/W2058118494","https://openalex.org/W2095118173","https://openalex.org/W2382021449","https://openalex.org/W2104269053","https://openalex.org/W2106424170","https://openalex.org/W2501188010","https://openalex.org/W4299935056","https://openalex.org/W2360644719","https://openalex.org/W2385471969"],"abstract_inverted_index":{"This":[0],"exploratory":[1],"study":[2,197],"proposes":[3],"a":[4,112,131,158],"prototype":[5],"sentence-level":[6],"parallel":[7],"corpus":[8,63],"to":[9,45,95,156,171],"support":[10,98],"studying":[11],"optical":[12],"character":[13],"recognition":[14],"(OCR)":[15],"quality":[16,200],"in":[17,89,128,195],"curated":[18],"digitized":[19],"library":[20],"collections.":[21],"Existing":[22],"data":[23,127,150,190],"resources,":[24],"such":[25,37],"as":[26,38],"ICDAR2019[21]":[27],"and":[28,56,71,81],"GT4HistOCR[23],":[29],"generally":[30],"aligned":[31,82],"content":[32],"by":[33],"artifact":[34],"publishing":[35],"characteristics":[36,191],"documents":[39],"or":[40],"lines,":[41],"which":[42],"is":[43,169],"limited":[44],"explore":[46],"OCR":[47,73,102,109,199],"noise":[48],"concentrating":[49],"on":[50,101,115,192],"natural":[51],"language":[52],"granularity":[53],"like":[54],"sentences":[55,84,141],"chapters.":[57],"Building":[58],"upon":[59],"an":[60,106],"existing":[61],"volume-aligned":[62],"that":[64,125,181],"collected":[65],"human-proofread":[66],"texts":[67],"from":[68,75,85,93,142,152,166],"Project":[69],"Gutenberg":[70],"paired":[72],"views":[74],"HathiTrust":[76],"Digital":[77],"Library,":[78],"we":[79,104,179],"extracted":[80],"167,079":[83],"189":[86],"sampled":[87,126,151],"books":[88],"four":[90],"domains":[91],"published":[92],"1793":[94],"1984.":[96],"To":[97],"downstream":[99],"research":[100],"quality,":[103],"conducted":[105],"analysis":[107],"of":[108,134,161,188,198],"errors":[110,136],"with":[111,118],"specific":[113],"focus":[114],"their":[116,193],"associations":[117],"the":[119,185,196],"source":[120,189],"text":[121],"metadata.":[122],"We":[123],"found":[124],"agriculture":[129],"has":[130],"higher":[132],"ratio":[133,160],"real-word":[135,174],"than":[137],"other":[138],"domains,":[139],"while":[140,164],"social-science":[143],"volumes":[144,154,168],"contain":[145],"more":[146,173],"non-word":[147,162],"errors.":[148,175],"Besides,":[149],"early-age":[153],"tend":[155],"have":[157,172],"high":[159],"errors,":[163],"samples":[165],"recently-published":[167],"likely":[170],"Following":[176],"our":[177],"findings,":[178],"suggest":[180],"scholars":[182],"should":[183],"consider":[184],"potential":[186],"influence":[187],"findings":[194],"issues.":[201]},"counts_by_year":[{"year":2024,"cited_by_count":2}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
