{"id":"https://openalex.org/W3208625967","doi":"https://doi.org/10.1162/tacl_a_00427","title":"Lexically Aware Semi-Supervised Learning for OCR Post-Correction","display_name":"Lexically Aware Semi-Supervised Learning for OCR Post-Correction","publication_year":2021,"publication_date":"2021-01-01","ids":{"openalex":"https://openalex.org/W3208625967","doi":"https://doi.org/10.1162/tacl_a_00427","mag":"3208625967"},"language":"en","primary_location":{"id":"doi:10.1162/tacl_a_00427","is_oa":true,"landing_page_url":"https://doi.org/10.1162/tacl_a_00427","pdf_url":"https://direct.mit.edu/tacl/article-pdf/doi/10.1162/tacl_a_00427/1974763/tacl_a_00427.pdf","source":{"id":"https://openalex.org/S2729999759","display_name":"Transactions of the Association for Computational Linguistics","issn_l":"2307-387X","issn":["2307-387X"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320244","host_organization_name":"Association for Computational Linguistics","host_organization_lineage":["https://openalex.org/P4310320244"],"host_organization_lineage_names":["Association for Computational Linguistics"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Transactions of the Association for Computational Linguistics","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","doaj"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://direct.mit.edu/tacl/article-pdf/doi/10.1162/tacl_a_00427/1974763/tacl_a_00427.pdf","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5014428509","display_name":"Shruti Rijhwani","orcid":null},"institutions":[{"id":"https://openalex.org/I74973139","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33","country_code":"US","type":"education","lineage":["https://openalex.org/I74973139"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Shruti Rijhwani","raw_affiliation_strings":["Language Technologies Institute, Carnegie Mellon University, USA. srijhwan@cs.cmu.edu"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Language Technologies Institute, Carnegie Mellon University, USA. srijhwan@cs.cmu.edu","institution_ids":["https://openalex.org/I74973139"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5063126517","display_name":"Daisy Rosenblum","orcid":null},"institutions":[{"id":"https://openalex.org/I141945490","display_name":"University of British Columbia","ror":"https://ror.org/03rmrcq20","country_code":"CA","type":"education","lineage":["https://openalex.org/I141945490"]}],"countries":["CA"],"is_corresponding":true,"raw_author_name":"Daisy Rosenblum","raw_affiliation_strings":["University of British Columbia, Canada. daisy.rosenblum@ubc.ca"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"University of British Columbia, Canada. daisy.rosenblum@ubc.ca","institution_ids":["https://openalex.org/I141945490"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5013793053","display_name":"Antonios Anastasopoulos","orcid":"https://orcid.org/0000-0002-8544-246X"},"institutions":[{"id":"https://openalex.org/I162714631","display_name":"George Mason University","ror":"https://ror.org/02jqj7156","country_code":"US","type":"education","lineage":["https://openalex.org/I162714631"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Antonios Anastasopoulos","raw_affiliation_strings":["Department of Computer Science, George Mason University, USA. antonis@gmu.edu"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Computer Science, George Mason University, USA. antonis@gmu.edu","institution_ids":["https://openalex.org/I162714631"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5068811427","display_name":"Graham Neubig","orcid":"https://orcid.org/0000-0002-2072-3789"},"institutions":[{"id":"https://openalex.org/I74973139","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33","country_code":"US","type":"education","lineage":["https://openalex.org/I74973139"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Graham Neubig","raw_affiliation_strings":["Language Technologies Institute, Carnegie Mellon University, USA. gneubig@cs.cmu.edu"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Language Technologies Institute, Carnegie Mellon University, USA. gneubig@cs.cmu.edu","institution_ids":["https://openalex.org/I74973139"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5013793053","https://openalex.org/A5014428509","https://openalex.org/A5063126517","https://openalex.org/A5068811427"],"corresponding_institution_ids":["https://openalex.org/I141945490","https://openalex.org/I162714631","https://openalex.org/I74973139"],"apc_list":null,"apc_paid":null,"fwci":0.7761,"has_fulltext":true,"cited_by_count":10,"citation_normalized_percentile":{"value":0.74229665,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":94,"max":99},"biblio":{"volume":"9","issue":null,"first_page":"1285","last_page":"1302"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.9991000294685364,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.9991000294685364,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9976999759674072,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9937000274658203,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.896003246307373},{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.7030824422836304},{"id":"https://openalex.org/keywords/optical-character-recognition","display_name":"Optical character recognition","score":0.6394084692001343},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.6252137422561646},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.6160316467285156},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.542371928691864},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.5124390721321106},{"id":"https://openalex.org/keywords/raw-data","display_name":"Raw data","score":0.4564838409423828},{"id":"https://openalex.org/keywords/vocabulary","display_name":"Vocabulary","score":0.45462939143180847},{"id":"https://openalex.org/keywords/error-detection-and-correction","display_name":"Error detection and correction","score":0.4416988790035248},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.39856576919555664},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.32071638107299805},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.21980896592140198},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.08263885974884033}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.896003246307373},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.7030824422836304},{"id":"https://openalex.org/C546480517","wikidata":"https://www.wikidata.org/wiki/Q167555","display_name":"Optical character recognition","level":3,"score":0.6394084692001343},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.6252137422561646},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6160316467285156},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.542371928691864},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.5124390721321106},{"id":"https://openalex.org/C132964779","wikidata":"https://www.wikidata.org/wiki/Q2110223","display_name":"Raw data","level":2,"score":0.4564838409423828},{"id":"https://openalex.org/C2777601683","wikidata":"https://www.wikidata.org/wiki/Q6499736","display_name":"Vocabulary","level":2,"score":0.45462939143180847},{"id":"https://openalex.org/C103088060","wikidata":"https://www.wikidata.org/wiki/Q1062839","display_name":"Error detection and correction","level":2,"score":0.4416988790035248},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.39856576919555664},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.32071638107299805},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.21980896592140198},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.08263885974884033},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1162/tacl_a_00427","is_oa":true,"landing_page_url":"https://doi.org/10.1162/tacl_a_00427","pdf_url":"https://direct.mit.edu/tacl/article-pdf/doi/10.1162/tacl_a_00427/1974763/tacl_a_00427.pdf","source":{"id":"https://openalex.org/S2729999759","display_name":"Transactions of the Association for Computational Linguistics","issn_l":"2307-387X","issn":["2307-387X"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320244","host_organization_name":"Association for Computational Linguistics","host_organization_lineage":["https://openalex.org/P4310320244"],"host_organization_lineage_names":["Association for Computational Linguistics"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Transactions of the Association for Computational Linguistics","raw_type":"journal-article"},{"id":"pmh:oai:doaj.org/article:36c3cbb49dc746b0a2b9638c326f28c0","is_oa":false,"landing_page_url":"https://doaj.org/article/36c3cbb49dc746b0a2b9638c326f28c0","pdf_url":null,"source":{"id":"https://openalex.org/S4306401280","display_name":"DOAJ (DOAJ: Directory of Open Access Journals)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Transactions of the Association for Computational Linguistics, Vol 9, Pp 1285-1302 (2021)","raw_type":"article"}],"best_oa_location":{"id":"doi:10.1162/tacl_a_00427","is_oa":true,"landing_page_url":"https://doi.org/10.1162/tacl_a_00427","pdf_url":"https://direct.mit.edu/tacl/article-pdf/doi/10.1162/tacl_a_00427/1974763/tacl_a_00427.pdf","source":{"id":"https://openalex.org/S2729999759","display_name":"Transactions of the Association for Computational Linguistics","issn_l":"2307-387X","issn":["2307-387X"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310320244","host_organization_name":"Association for Computational Linguistics","host_organization_lineage":["https://openalex.org/P4310320244"],"host_organization_lineage_names":["Association for Computational Linguistics"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Transactions of the Association for Computational Linguistics","raw_type":"journal-article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.8700000047683716}],"awards":[{"id":"https://openalex.org/G2701169613","display_name":"Discovering and Demonstrating Linguistic Features for Language Documentation","funder_award_id":"1761548","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"},{"id":"https://openalex.org/F4320306100","display_name":"National Endowment for the Humanities","ror":"https://ror.org/02vdm1p28"},{"id":"https://openalex.org/F4320319880","display_name":"Government of Canada","ror":"https://ror.org/010q4q527"}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W3208625967.pdf","grobid_xml":"https://content.openalex.org/works/W3208625967.grobid-xml"},"referenced_works_count":61,"referenced_works":["https://openalex.org/W99399284","https://openalex.org/W1934041838","https://openalex.org/W1990334093","https://openalex.org/W2007624857","https://openalex.org/W2015991179","https://openalex.org/W2046932483","https://openalex.org/W2055563597","https://openalex.org/W2064675550","https://openalex.org/W2101210369","https://openalex.org/W2133485975","https://openalex.org/W2141440284","https://openalex.org/W2153916801","https://openalex.org/W2158195707","https://openalex.org/W2188604753","https://openalex.org/W2304113845","https://openalex.org/W2410539690","https://openalex.org/W2470595162","https://openalex.org/W2555428947","https://openalex.org/W2577255746","https://openalex.org/W2606974598","https://openalex.org/W2756610685","https://openalex.org/W2759083144","https://openalex.org/W2759181158","https://openalex.org/W2786672397","https://openalex.org/W2798485145","https://openalex.org/W2887428522","https://openalex.org/W2891526782","https://openalex.org/W2899575547","https://openalex.org/W2915480215","https://openalex.org/W2946663438","https://openalex.org/W2962708992","https://openalex.org/W2963333747","https://openalex.org/W2963352809","https://openalex.org/W2963699608","https://openalex.org/W2963829526","https://openalex.org/W2964029788","https://openalex.org/W2964050662","https://openalex.org/W2964057904","https://openalex.org/W2964165364","https://openalex.org/W2964308564","https://openalex.org/W2976223659","https://openalex.org/W2980063486","https://openalex.org/W2988898388","https://openalex.org/W3003915459","https://openalex.org/W3035032094","https://openalex.org/W3106274667","https://openalex.org/W3111671284","https://openalex.org/W3186194544","https://openalex.org/W3203948445","https://openalex.org/W4210997624","https://openalex.org/W4297823084","https://openalex.org/W6604065652","https://openalex.org/W6679434410","https://openalex.org/W6680150274","https://openalex.org/W6680768788","https://openalex.org/W6685158001","https://openalex.org/W6728771690","https://openalex.org/W6731862131","https://openalex.org/W6763001404","https://openalex.org/W6768222176","https://openalex.org/W6786971739"],"related_works":["https://openalex.org/W3203142394","https://openalex.org/W4302615923","https://openalex.org/W2292997772","https://openalex.org/W1974101135","https://openalex.org/W2351061015","https://openalex.org/W2017509870","https://openalex.org/W2542937328","https://openalex.org/W2980068837","https://openalex.org/W2098872742","https://openalex.org/W2575782020"],"abstract_inverted_index":{"Abstract":[0],"Much":[1],"of":[2,10,40,48,55,109,174,182,189],"the":[3,11,38,46,75,107,129,141,152,172,175,187],"existing":[4],"linguistic":[5],"data":[6],"in":[7,16,128],"many":[8],"languages":[9,170],"world":[12],"is":[13,116],"locked":[14],"away":[15],"non-":[17],"digitized":[18,31],"books":[19],"and":[20,33,163,191],"documents.":[21],"Optical":[22],"character":[23],"recognition":[24,54],"(OCR)":[25],"can":[26],"be":[27,82],"used":[28],"to":[29,74,81,97,102,125],"produce":[30],"text,":[32],"previous":[34],"work":[35],"has":[36],"demonstrated":[37],"utility":[39,173],"neural":[41,142],"post-correction":[42,143],"methods":[43,61],"that":[44,79,93,139],"improve":[45,103],"results":[47],"general-":[49],"purpose":[50],"OCR":[51],"systems":[52],"on":[53,63,119,167],"less-":[56],"well-resourced":[57],"languages.":[58],"However,":[59],"these":[60,99],"rely":[62],"manually":[64],"curated":[65],"post-":[66],"correction":[67],"data,":[68],"which":[69],"are":[70],"relatively":[71],"scarce":[72],"compared":[73],"non-annotated":[76],"raw":[77,100],"images":[78,101],"need":[80],"digitized.":[83],"In":[84,123],"this":[85],"paper,":[86],"we":[87,132,185],"present":[88],"a":[89,111,114,134,146],"semi-supervised":[90],"learning":[91],"method":[92,138],"makes":[94],"it":[95],"possible":[96],"utilize":[98],"performance,":[104],"specifically":[105],"through":[106],"use":[108],"self-training,":[110],"technique":[112],"where":[113,184],"model":[115,144,149],"iteratively":[117],"trained":[118],"its":[120],"own":[121],"outputs.":[122],"addition,":[124],"enforce":[126],"consistency":[127],"recognized":[130,153],"vocabulary,":[131],"introduce":[133],"lexically":[135,192],"aware":[136,193],"decoding":[137,194],"augments":[140],"with":[145,178],"count-based":[147],"language":[148],"constructed":[150],"from":[151],"texts,":[154],"implemented":[155],"using":[156],"weighted":[157],"finite-state":[158],"automata":[159],"(WFSA)":[160],"for":[161,196],"efficient":[162],"effective":[164],"decoding.":[165],"Results":[166],"four":[168],"endangered":[169],"demonstrate":[171],"proposed":[176],"method,":[177],"relative":[179],"error":[180],"reductions":[181],"15%\u201329%,":[183],"find":[186],"combination":[188],"self-training":[190],"essential":[195],"achieving":[197],"consistent":[198],"improvements.1":[199]},"counts_by_year":[{"year":2026,"cited_by_count":2},{"year":2024,"cited_by_count":2},{"year":2023,"cited_by_count":3},{"year":2022,"cited_by_count":3}],"updated_date":"2026-06-05T09:01:59.212387","created_date":"2025-10-10T00:00:00"}
