{"id":"https://openalex.org/W2076531274","doi":"https://doi.org/10.1145/1815330.1815386","title":"Safely selecting subsets of training data","display_name":"Safely selecting subsets of training data","publication_year":2010,"publication_date":"2010-06-09","ids":{"openalex":"https://openalex.org/W2076531274","doi":"https://doi.org/10.1145/1815330.1815386","mag":"2076531274"},"language":"en","primary_location":{"id":"doi:10.1145/1815330.1815386","is_oa":false,"landing_page_url":"https://doi.org/10.1145/1815330.1815386","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 9th IAPR International Workshop on Document Analysis Systems","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5054482111","display_name":"Dawei Yin","orcid":"https://orcid.org/0000-0002-8846-2001"},"institutions":[{"id":"https://openalex.org/I186143895","display_name":"Lehigh University","ror":"https://ror.org/012afjb06","country_code":"US","type":"education","lineage":["https://openalex.org/I186143895"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Dawei Yin","raw_affiliation_strings":["Lehigh University, Bethlehem, PA"],"affiliations":[{"raw_affiliation_string":"Lehigh University, Bethlehem, PA","institution_ids":["https://openalex.org/I186143895"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101884853","display_name":"An Chang","orcid":"https://orcid.org/0000-0001-5179-5833"},"institutions":[{"id":"https://openalex.org/I186143895","display_name":"Lehigh University","ror":"https://ror.org/012afjb06","country_code":"US","type":"education","lineage":["https://openalex.org/I186143895"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Chang An","raw_affiliation_strings":["Lehigh University, Bethlehem, PA"],"affiliations":[{"raw_affiliation_string":"Lehigh University, Bethlehem, PA","institution_ids":["https://openalex.org/I186143895"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5011850885","display_name":"Henry S. Baird","orcid":null},"institutions":[{"id":"https://openalex.org/I186143895","display_name":"Lehigh University","ror":"https://ror.org/012afjb06","country_code":"US","type":"education","lineage":["https://openalex.org/I186143895"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Henry S. Baird","raw_affiliation_strings":["Lehigh University, Bethlehem, PA"],"affiliations":[{"raw_affiliation_string":"Lehigh University, Bethlehem, PA","institution_ids":["https://openalex.org/I186143895"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5054482111"],"corresponding_institution_ids":["https://openalex.org/I186143895"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.15513107,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"433","last_page":"440"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10824","display_name":"Image Retrieval and Classification Techniques","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8377313613891602},{"id":"https://openalex.org/keywords/oracle","display_name":"Oracle","score":0.774405837059021},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.6618814468383789},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.642920196056366},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.5560191869735718},{"id":"https://openalex.org/keywords/classifier","display_name":"Classifier (UML)","score":0.5261903405189514},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.5000932216644287},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.4643775522708893},{"id":"https://openalex.org/keywords/selection","display_name":"Selection (genetic algorithm)","score":0.4629557728767395},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.4610065817832947},{"id":"https://openalex.org/keywords/test-set","display_name":"Test set","score":0.43812164664268494},{"id":"https://openalex.org/keywords/contextual-image-classification","display_name":"Contextual image classification","score":0.43678462505340576},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.4359979033470154},{"id":"https://openalex.org/keywords/k-nearest-neighbors-algorithm","display_name":"k-nearest neighbors algorithm","score":0.4130907356739044},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.35956352949142456}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8377313613891602},{"id":"https://openalex.org/C55166926","wikidata":"https://www.wikidata.org/wiki/Q2892946","display_name":"Oracle","level":2,"score":0.774405837059021},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6618814468383789},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.642920196056366},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.5560191869735718},{"id":"https://openalex.org/C95623464","wikidata":"https://www.wikidata.org/wiki/Q1096149","display_name":"Classifier (UML)","level":2,"score":0.5261903405189514},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.5000932216644287},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.4643775522708893},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.4629557728767395},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4610065817832947},{"id":"https://openalex.org/C169903167","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Test set","level":2,"score":0.43812164664268494},{"id":"https://openalex.org/C75294576","wikidata":"https://www.wikidata.org/wiki/Q5165192","display_name":"Contextual image classification","level":3,"score":0.43678462505340576},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.4359979033470154},{"id":"https://openalex.org/C113238511","wikidata":"https://www.wikidata.org/wiki/Q1071612","display_name":"k-nearest neighbors algorithm","level":2,"score":0.4130907356739044},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.35956352949142456},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/1815330.1815386","is_oa":false,"landing_page_url":"https://doi.org/10.1145/1815330.1815386","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 9th IAPR International Workshop on Document Analysis Systems","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":16,"referenced_works":["https://openalex.org/W195783868","https://openalex.org/W1490760466","https://openalex.org/W1502916507","https://openalex.org/W1526692684","https://openalex.org/W1970319631","https://openalex.org/W1998347547","https://openalex.org/W2024668293","https://openalex.org/W2061845601","https://openalex.org/W2071866949","https://openalex.org/W2109044256","https://openalex.org/W2135637682","https://openalex.org/W2146221113","https://openalex.org/W2147717514","https://openalex.org/W2162006472","https://openalex.org/W2165558283","https://openalex.org/W2427881153"],"related_works":["https://openalex.org/W1999699871","https://openalex.org/W4225124612","https://openalex.org/W2043806667","https://openalex.org/W2021633306","https://openalex.org/W2006801911","https://openalex.org/W2033669961","https://openalex.org/W2971899271","https://openalex.org/W1972167985","https://openalex.org/W2350644419","https://openalex.org/W3165517033"],"abstract_inverted_index":{"Highly":[0],"versatile":[1],"classifiers":[2,56,64],"for":[3,105,130],"document":[4,95,114],"analysis":[5],"systems":[6],"demand":[7],"representative":[8],"training":[9,30,103,126],"sets":[10],"which":[11],"can":[12,86],"be":[13,87,155],"dauntingly":[14],"large,":[15],"often":[16],"challenging":[17],"conventional":[18],"trainable":[19],"classifier":[20],"technologies.":[21],"We":[22],"propose":[23],"to":[24,33,89,123,128,157],"select":[25,98],"a":[26,70,113,131,134],"small":[27],"subset":[28],"of":[29,40,45,81,133,136,141],"data,":[31],"matched":[32],"each":[34],"particular":[35],"test":[36,108],"set,":[37],"in":[38,83],"hopes":[39],"improved":[41],"speed":[42],"without":[43],"loss":[44,140],"accuracy.":[46,142],"Since":[47],"selection":[48,150],"must":[49],"occur":[50],"on":[51,112],"line,":[52],"we":[53,68,97],"cannot":[54],"use":[55,69],"that":[57,152],"require":[58],"off-line":[59],"training.":[60],"Fortunately,":[61],"Nearest":[62],"Neighbors":[63],"support":[65],"on-line":[66],"training;":[67],"fast":[71],"approximate":[72],"kNN":[73],"technology":[74],"using":[75],"hashed":[76],"k-D":[77,84],"trees.":[78],"The":[79],"distribution":[80],"samples":[82],"bins":[85],"used":[88],"measure":[90],"similarity":[91],"between":[92],"any":[93,106],"two":[94],"images:":[96],"the":[99],"three":[100],"most":[101],"similar":[102],"images":[104,127],"given":[107],"image.":[109],"In":[110],"experiments":[111,144],"image":[115],"content":[116],"extraction":[117],"system,":[118],"our":[119],"algorithm":[120],"was":[121],"able":[122],"prune":[124],"118":[125],"three,":[129],"speedup":[132],"factor":[135],"17":[137],"with":[138,145],"no":[139],"Other":[143],"an":[146],"oracle":[147],"and":[148],"manual":[149],"suggest":[151],"it":[153],"may":[154],"possible":[156],"improve":[158],"accuracy":[159],"as":[160],"well.":[161]},"counts_by_year":[],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
