{"id":"https://openalex.org/W2126888240","doi":"https://doi.org/10.1145/1390749.1390766","title":"Topic based language models for OCR correction","display_name":"Topic based language models for OCR correction","publication_year":2008,"publication_date":"2008-07-24","ids":{"openalex":"https://openalex.org/W2126888240","doi":"https://doi.org/10.1145/1390749.1390766","mag":"2126888240"},"language":"en","primary_location":{"id":"doi:10.1145/1390749.1390766","is_oa":false,"landing_page_url":"https://doi.org/10.1145/1390749.1390766","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the second workshop on Analytics for noisy unstructured text data","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5102494018","display_name":"Anurag Bhardwaj","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Anurag Bhardwaj","raw_affiliation_strings":["University at Buffalo, Amherst, NY"],"affiliations":[{"raw_affiliation_string":"University at Buffalo, Amherst, NY","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5025508406","display_name":"Faisal Farooq","orcid":"https://orcid.org/0000-0002-3551-7371"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Faisal Farooq","raw_affiliation_strings":["University at Buffalo, Amherst, NY"],"affiliations":[{"raw_affiliation_string":"University at Buffalo, Amherst, NY","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5108504544","display_name":"Huaigu Cao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huaigu Cao","raw_affiliation_strings":["University at Buffalo, Amherst, NY"],"affiliations":[{"raw_affiliation_string":"University at Buffalo, Amherst, NY","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5020354604","display_name":"Venu Govindaraju","orcid":"https://orcid.org/0000-0002-5318-7409"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Venu Govindaraju","raw_affiliation_strings":["University at Buffalo, Amherst, NY"],"affiliations":[{"raw_affiliation_string":"University at Buffalo, Amherst, NY","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5102494018"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":1.7559,"has_fulltext":false,"cited_by_count":14,"citation_normalized_percentile":{"value":0.864759,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"107","last_page":"112"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T14339","display_name":"Image Processing and 3D Reconstruction","score":0.9944999814033508,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9866999983787537,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8717037439346313},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.6966439485549927},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.6603417992591858},{"id":"https://openalex.org/keywords/lexicon","display_name":"Lexicon","score":0.5817746520042419},{"id":"https://openalex.org/keywords/word","display_name":"Word (group theory)","score":0.5653992891311646},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.5365318059921265},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5327408909797668},{"id":"https://openalex.org/keywords/principle-of-maximum-entropy","display_name":"Principle of maximum entropy","score":0.5224936604499817},{"id":"https://openalex.org/keywords/categorization","display_name":"Categorization","score":0.44593149423599243},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.3470950126647949},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.08047646284103394}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8717037439346313},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6966439485549927},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.6603417992591858},{"id":"https://openalex.org/C2778121359","wikidata":"https://www.wikidata.org/wiki/Q8096","display_name":"Lexicon","level":2,"score":0.5817746520042419},{"id":"https://openalex.org/C90805587","wikidata":"https://www.wikidata.org/wiki/Q10944557","display_name":"Word (group theory)","level":2,"score":0.5653992891311646},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.5365318059921265},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5327408909797668},{"id":"https://openalex.org/C9679016","wikidata":"https://www.wikidata.org/wiki/Q1417473","display_name":"Principle of maximum entropy","level":2,"score":0.5224936604499817},{"id":"https://openalex.org/C94124525","wikidata":"https://www.wikidata.org/wiki/Q912550","display_name":"Categorization","level":2,"score":0.44593149423599243},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3470950126647949},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.08047646284103394},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/1390749.1390766","is_oa":false,"landing_page_url":"https://doi.org/10.1145/1390749.1390766","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the second workshop on Analytics for noisy unstructured text data","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Industry, innovation and infrastructure","score":0.5600000023841858,"id":"https://metadata.un.org/sdg/9"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":24,"referenced_works":["https://openalex.org/W561535998","https://openalex.org/W1532325895","https://openalex.org/W1589867711","https://openalex.org/W1909199407","https://openalex.org/W1943140021","https://openalex.org/W1969984959","https://openalex.org/W1974071875","https://openalex.org/W1992152861","https://openalex.org/W2010595692","https://openalex.org/W2038522412","https://openalex.org/W2071003614","https://openalex.org/W2088622394","https://openalex.org/W2099156541","https://openalex.org/W2101911356","https://openalex.org/W2116629521","https://openalex.org/W2152928267","https://openalex.org/W2153231234","https://openalex.org/W2157224915","https://openalex.org/W2163374925","https://openalex.org/W2168868236","https://openalex.org/W2171498274","https://openalex.org/W4210597082","https://openalex.org/W4213009331","https://openalex.org/W6615684021"],"related_works":["https://openalex.org/W2165912799","https://openalex.org/W2735662278","https://openalex.org/W2382615723","https://openalex.org/W4311804456","https://openalex.org/W1987484445","https://openalex.org/W2623658258","https://openalex.org/W2140536630","https://openalex.org/W3195005284","https://openalex.org/W2944691285","https://openalex.org/W2047632477"],"abstract_inverted_index":{"Despite":[0],"several":[1],"decades":[2],"of":[3,9,42,49,59,117,142,182],"research":[4,20,61],"in":[5,21,40,168],"document":[6,88,144,190],"analysis,":[7],"recognition":[8,71,134,171],"unconstrained":[10,51],"handwritten":[11,52,189],"documents":[12],"is":[13,55,62,94,106,110,126,136,154],"still":[14],"considered":[15],"a":[16,36,44,74,81,90,118,157,178],"challenging":[17],"task.":[18],"Previous":[19],"this":[22,60],"area":[23],"has":[24],"shown":[25],"that":[26],"word":[27,53,124,133,170,184],"recognizers":[28],"produce":[29],"reasonably":[30],"clean":[31],"output":[32,48,67],"when":[33],"used":[34,111],"with":[35],"restricted":[37,45],"lexicon.":[38],"But":[39],"absence":[41],"such":[43],"lexicon,":[46],"the":[47,114,129,143,169],"an":[50],"recognizer":[54,66,130],"noisy.":[56],"The":[57,151],"objective":[58],"to":[63,112,175],"process":[64],"noisy":[65],"and":[68,131,145,162],"eliminate":[69],"spurious":[70],"choices":[72],"using":[73,89],"topic":[75,82,98,115,140,146],"based":[76,83,101,147],"language":[77,84,148],"model.":[78],"We":[79],"construct":[80],"model":[85,105,149],"for":[86],"every":[87],"training":[91],"data":[92],"which":[93,109],"manually":[95],"categorized.":[96],"A":[97,121],"categorization":[99],"sub-system":[100],"on":[102,156],"Maximum":[103],"Entropy":[104],"also":[107],"trained":[108],"generate":[113],"distribution":[116,141],"test":[119,123,179],"document.":[120],"given":[122],"image":[125],"processed":[127],"by":[128,138],"its":[132],"likelihood":[135],"refined":[137],"incorporating":[139],"probability.":[150],"proposed":[152],"method":[153],"evaluated":[155],"publicly":[158],"available":[159],"IAM":[160],"dataset":[161],"experimental":[163],"results":[164],"show":[165],"significant":[166],"improvement":[167],"accuracy":[172],"from":[173,187],"32%":[174],"40%":[176],"over":[177],"set":[180],"consisting":[181],"4033":[183],"images":[185],"extracted":[186],"70":[188],"images.":[191]},"counts_by_year":[{"year":2021,"cited_by_count":1},{"year":2016,"cited_by_count":1},{"year":2014,"cited_by_count":3},{"year":2013,"cited_by_count":2},{"year":2012,"cited_by_count":2}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
