{"id":"https://openalex.org/W3011970228","doi":"https://doi.org/10.1109/escience.2019.00020","title":"Quality-Aware Human-Machine Text Extraction for Biocollections using Ensembles of OCRs","display_name":"Quality-Aware Human-Machine Text Extraction for Biocollections using Ensembles of OCRs","publication_year":2019,"publication_date":"2019-09-01","ids":{"openalex":"https://openalex.org/W3011970228","doi":"https://doi.org/10.1109/escience.2019.00020","mag":"3011970228"},"language":"en","primary_location":{"id":"doi:10.1109/escience.2019.00020","is_oa":false,"landing_page_url":"https://doi.org/10.1109/escience.2019.00020","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2019 15th International Conference on eScience (eScience)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5027673766","display_name":"Icaro Alzuru","orcid":null},"institutions":[{"id":"https://openalex.org/I33213144","display_name":"University of Florida","ror":"https://ror.org/02y3ad647","country_code":"US","type":"education","lineage":["https://openalex.org/I33213144"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Icaro Alzuru","raw_affiliation_strings":["University of Florida","CISE Department, University of Florida, Gainesville, US"],"affiliations":[{"raw_affiliation_string":"University of Florida","institution_ids":["https://openalex.org/I33213144"]},{"raw_affiliation_string":"CISE Department, University of Florida, Gainesville, US","institution_ids":["https://openalex.org/I33213144"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5087880697","display_name":"Rhiannon Stephens","orcid":null},"institutions":[{"id":"https://openalex.org/I1328215151","display_name":"Australian Museum","ror":"https://ror.org/02zv4ka60","country_code":"AU","type":"archive","lineage":["https://openalex.org/I1328215151"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Rhiannon Stephens","raw_affiliation_strings":["Australian Museum","Research Institute Australian Museum, Sydney, Australia"],"affiliations":[{"raw_affiliation_string":"Australian Museum","institution_ids":["https://openalex.org/I1328215151"]},{"raw_affiliation_string":"Research Institute Australian Museum, Sydney, Australia","institution_ids":["https://openalex.org/I1328215151"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5003262238","display_name":"Andr\u00e9a Matsunaga","orcid":"https://orcid.org/0000-0001-9036-5895"},"institutions":[{"id":"https://openalex.org/I33213144","display_name":"University of Florida","ror":"https://ror.org/02y3ad647","country_code":"US","type":"education","lineage":["https://openalex.org/I33213144"]},{"id":"https://openalex.org/I4210101698","display_name":"Advanced Systems Laboratory","ror":"https://ror.org/0179ktw44","country_code":"IN","type":"facility","lineage":["https://openalex.org/I1340206300","https://openalex.org/I4210101698","https://openalex.org/I4210150591"]}],"countries":["IN","US"],"is_corresponding":false,"raw_author_name":"Andr\u00e9a Matsunaga","raw_affiliation_strings":["Advanced Computing and Information Systems Laboratory","ACIS Lab., University of Florida, Gainesville, FL, USA"],"affiliations":[{"raw_affiliation_string":"Advanced Computing and Information Systems Laboratory","institution_ids":["https://openalex.org/I4210101698"]},{"raw_affiliation_string":"ACIS Lab., University of Florida, Gainesville, FL, USA","institution_ids":["https://openalex.org/I33213144"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5086302230","display_name":"Maur\u00edcio Tsugawa","orcid":null},"institutions":[{"id":"https://openalex.org/I33213144","display_name":"University of Florida","ror":"https://ror.org/02y3ad647","country_code":"US","type":"education","lineage":["https://openalex.org/I33213144"]},{"id":"https://openalex.org/I4210101698","display_name":"Advanced Systems Laboratory","ror":"https://ror.org/0179ktw44","country_code":"IN","type":"facility","lineage":["https://openalex.org/I1340206300","https://openalex.org/I4210101698","https://openalex.org/I4210150591"]}],"countries":["IN","US"],"is_corresponding":false,"raw_author_name":"Maur\u00edcio Tsugawa","raw_affiliation_strings":["Advanced Computing and Information Systems Laboratory","ACIS Lab., University of Florida, Gainesville, FL, USA"],"affiliations":[{"raw_affiliation_string":"Advanced Computing and Information Systems Laboratory","institution_ids":["https://openalex.org/I4210101698"]},{"raw_affiliation_string":"ACIS Lab., University of Florida, Gainesville, FL, USA","institution_ids":["https://openalex.org/I33213144"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5090493717","display_name":"Paul Flemons","orcid":"https://orcid.org/0000-0002-8261-1942"},"institutions":[{"id":"https://openalex.org/I1328215151","display_name":"Australian Museum","ror":"https://ror.org/02zv4ka60","country_code":"AU","type":"archive","lineage":["https://openalex.org/I1328215151"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Paul Flemons","raw_affiliation_strings":["Australian Museum","Research Institute Australian Museum, Sydney, Australia"],"affiliations":[{"raw_affiliation_string":"Australian Museum","institution_ids":["https://openalex.org/I1328215151"]},{"raw_affiliation_string":"Research Institute Australian Museum, Sydney, Australia","institution_ids":["https://openalex.org/I1328215151"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5003181544","display_name":"J.A.B. Fortes","orcid":"https://orcid.org/0000-0001-8870-5205"},"institutions":[{"id":"https://openalex.org/I33213144","display_name":"University of Florida","ror":"https://ror.org/02y3ad647","country_code":"US","type":"education","lineage":["https://openalex.org/I33213144"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jos\u00e9 A.B. Fortes","raw_affiliation_strings":["University of Florida","CISE Department, University of Florida, Gainesville, FL, USA"],"affiliations":[{"raw_affiliation_string":"University of Florida","institution_ids":["https://openalex.org/I33213144"]},{"raw_affiliation_string":"CISE Department, University of Florida, Gainesville, FL, USA","institution_ids":["https://openalex.org/I33213144"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5027673766"],"corresponding_institution_ids":["https://openalex.org/I33213144"],"apc_list":null,"apc_paid":null,"fwci":0.1012,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.48391477,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"116","last_page":"125"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.9905999898910522,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.9905999898910522,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11710","display_name":"Biomedical Text Mining and Ontologies","score":0.9868999719619751,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T10824","display_name":"Image Retrieval and Classification Techniques","score":0.9847999811172485,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/crowdsourcing","display_name":"Crowdsourcing","score":0.9531091451644897},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7748996019363403},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.603668749332428},{"id":"https://openalex.org/keywords/metadata","display_name":"Metadata","score":0.5095170140266418},{"id":"https://openalex.org/keywords/transcription","display_name":"Transcription (linguistics)","score":0.49187564849853516},{"id":"https://openalex.org/keywords/optical-character-recognition","display_name":"Optical character recognition","score":0.48011088371276855},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.43205732107162476},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.4270256757736206},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.32121628522872925},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.1008000373840332}],"concepts":[{"id":"https://openalex.org/C62230096","wikidata":"https://www.wikidata.org/wiki/Q275969","display_name":"Crowdsourcing","level":2,"score":0.9531091451644897},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7748996019363403},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.603668749332428},{"id":"https://openalex.org/C93518851","wikidata":"https://www.wikidata.org/wiki/Q180160","display_name":"Metadata","level":2,"score":0.5095170140266418},{"id":"https://openalex.org/C179926584","wikidata":"https://www.wikidata.org/wiki/Q207714","display_name":"Transcription (linguistics)","level":2,"score":0.49187564849853516},{"id":"https://openalex.org/C546480517","wikidata":"https://www.wikidata.org/wiki/Q167555","display_name":"Optical character recognition","level":3,"score":0.48011088371276855},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.43205732107162476},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4270256757736206},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.32121628522872925},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.1008000373840332},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/escience.2019.00020","is_oa":false,"landing_page_url":"https://doi.org/10.1109/escience.2019.00020","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2019 15th International Conference on eScience (eScience)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":27,"referenced_works":["https://openalex.org/W1972939083","https://openalex.org/W1993557076","https://openalex.org/W2001642682","https://openalex.org/W2038329042","https://openalex.org/W2060580591","https://openalex.org/W2119179626","https://openalex.org/W2140623652","https://openalex.org/W2171313960","https://openalex.org/W2212003014","https://openalex.org/W2308839768","https://openalex.org/W2327209116","https://openalex.org/W2481930807","https://openalex.org/W2559800592","https://openalex.org/W2614092708","https://openalex.org/W2769563307","https://openalex.org/W2785875661","https://openalex.org/W2800408407","https://openalex.org/W2892857623","https://openalex.org/W2895456681","https://openalex.org/W2904601236","https://openalex.org/W2963360699","https://openalex.org/W2964332134","https://openalex.org/W3003915459","https://openalex.org/W4289422041","https://openalex.org/W6721872216","https://openalex.org/W6730480800","https://openalex.org/W6755284406"],"related_works":["https://openalex.org/W3032998312","https://openalex.org/W135177976","https://openalex.org/W4384486036","https://openalex.org/W1503094549","https://openalex.org/W2337920774","https://openalex.org/W4286908577","https://openalex.org/W2886410948","https://openalex.org/W2025875869","https://openalex.org/W4318823662","https://openalex.org/W3207526114"],"abstract_inverted_index":{"Information":[0],"Extraction":[1],"(IE)":[2],"from":[3],"imaged":[4],"text":[5,19,88,218],"is":[6,32,46,150,165],"affected":[7],"by":[8,64,192,199],"the":[9,13,33,47,54,76,84,87,107,115,129,158,161,168,173,187,200],"output":[10,159],"quality":[11],"of":[12,49,78,86,97,124,142,146,160,163,172,189,202,224],"text-recognition":[14],"process.":[15,176],"Misspelled":[16],"or":[17,23],"missing":[18],"may":[20],"propagate":[21],"errors":[22],"even":[24],"preclude":[25],"IE.":[26],"Low":[27],"confidence":[28,140],"in":[29,59,90,181],"automated":[30],"methods":[31],"reason":[34],"why":[35],"some":[36],"IE":[37],"projects":[38],"rely":[39],"exclusively":[40],"on":[41],"human":[42],"work":[43],"(crowdsourcing).":[44],"That":[45],"case":[48],"biological":[50],"collections":[51],"(biocollections),":[52],"where":[53,157],"metadata":[55],"(Darwin-core":[56],"Terms)":[57],"found":[58,89],"digitized":[60],"labels":[61],"are":[62],"transcribed":[63],"citizen":[65],"scientists.":[66],"In":[67],"this":[68],"paper,":[69],"we":[70],"present":[71],"an":[72,95],"approach":[73,113,178],"to":[74,82,134,148,196,208],"reduce":[75],"number":[77,145,188],"crowdsourcing":[79,156,175,190],"tasks":[80,191],"required":[81],"obtain":[83],"transcription":[85,133,171],"biocollections'":[91],"images.":[92],"By":[93],"using":[94,212],"ensemble":[96,162,201],"Optical":[98],"Character":[99],"Recognition":[100],"(OCR)":[101],"engines":[102],"-":[103,111],"OCRopus,":[104],"Tesseract,":[105],"and":[106,117,204],"Google":[108],"Cloud":[109],"OCR":[110],"our":[112],"identifies":[114],"lines":[116,147,197],"characters":[118],"that":[119],"have":[120],"a":[121,220],"high":[122],"probability":[123],"being":[125],"correct.":[126],"This":[127],"reduces":[128],"need":[130],"for":[131,137],"crowdsourced":[132],"be":[135],"done":[136],"only":[138],"low":[139],"fragments":[141],"text.":[143],"The":[144,215],"transcribe":[149],"also":[151],"reduced":[152],"through":[153],"hybrid":[154,213],"human-machine":[155],"OCRs":[164,203],"used":[166],"as":[167],"first":[169],"\"human\"":[170],"redundant":[174],"Our":[177],"was":[179],"tested":[180],"six":[182],"biocollections":[183],"(2,966":[184],"images),":[185],"reducing":[186],"76%":[193],"(58%":[194],"due":[195,207],"accepted":[198],"about":[205],"18%":[206],"accelerated":[209],"convergence":[210],"when":[211],"crowdsourcing).":[214],"automatically":[216],"extracted":[217],"presented":[219],"character":[221],"error":[222],"rate":[223],"0.001":[225],"(0.1%).":[226]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2020,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
