{"id":"https://openalex.org/W2762763087","doi":"https://doi.org/10.1145/3125643","title":"Estimating the Unseen","display_name":"Estimating the Unseen","publication_year":2017,"publication_date":"2017-10-04","ids":{"openalex":"https://openalex.org/W2762763087","doi":"https://doi.org/10.1145/3125643","mag":"2762763087"},"language":"en","primary_location":{"id":"doi:10.1145/3125643","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3125643","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3125643","source":{"id":"https://openalex.org/S118992489","display_name":"Journal of the ACM","issn_l":"0004-5411","issn":["0004-5411","1557-735X"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of the ACM","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"bronze","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3125643","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5079503799","display_name":"Gregory Valiant","orcid":"https://orcid.org/0000-0002-2211-1073"},"institutions":[{"id":"https://openalex.org/I97018004","display_name":"Stanford University","ror":"https://ror.org/00f54p054","country_code":"US","type":"education","lineage":["https://openalex.org/I97018004"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Gregory Valiant","raw_affiliation_strings":["Stanford University, Stanford, CA"],"affiliations":[{"raw_affiliation_string":"Stanford University, Stanford, CA","institution_ids":["https://openalex.org/I97018004"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5036230157","display_name":"Paul Valiant","orcid":"https://orcid.org/0000-0001-5523-2974"},"institutions":[{"id":"https://openalex.org/I27804330","display_name":"Brown University","ror":"https://ror.org/05gq02987","country_code":"US","type":"education","lineage":["https://openalex.org/I27804330"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Paul Valiant","raw_affiliation_strings":["Brown University, Providence, RI"],"affiliations":[{"raw_affiliation_string":"Brown University, Providence, RI","institution_ids":["https://openalex.org/I27804330"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5079503799"],"corresponding_institution_ids":["https://openalex.org/I97018004"],"apc_list":null,"apc_paid":null,"fwci":3.1181,"has_fulltext":true,"cited_by_count":59,"citation_normalized_percentile":{"value":0.93470431,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":94,"max":99},"biblio":{"volume":"64","issue":"6","first_page":"1","last_page":"41"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11269","display_name":"Algorithms and Data Compression","score":0.9976999759674072,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11269","display_name":"Algorithms and Data Compression","score":0.9976999759674072,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12072","display_name":"Machine Learning and Algorithms","score":0.995199978351593,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11447","display_name":"Blind Source Separation Techniques","score":0.9894000291824341,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/estimator","display_name":"Estimator","score":0.6488032341003418},{"id":"https://openalex.org/keywords/logarithm","display_name":"Logarithm","score":0.5601029992103577},{"id":"https://openalex.org/keywords/range","display_name":"Range (aeronautics)","score":0.5231224894523621},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5195522904396057},{"id":"https://openalex.org/keywords/sublinear-function","display_name":"Sublinear function","score":0.4965851902961731},{"id":"https://openalex.org/keywords/sample-size-determination","display_name":"Sample size determination","score":0.4932548403739929},{"id":"https://openalex.org/keywords/entropy","display_name":"Entropy (arrow of time)","score":0.4747388958930969},{"id":"https://openalex.org/keywords/sample","display_name":"Sample (material)","score":0.4717855155467987},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.44532105326652527},{"id":"https://openalex.org/keywords/probability-distribution","display_name":"Probability distribution","score":0.4392387270927429},{"id":"https://openalex.org/keywords/constant","display_name":"Constant (computer programming)","score":0.4392277002334595},{"id":"https://openalex.org/keywords/variety","display_name":"Variety (cybernetics)","score":0.41310855746269226},{"id":"https://openalex.org/keywords/statistics","display_name":"Statistics","score":0.3464096188545227},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.3413352370262146},{"id":"https://openalex.org/keywords/discrete-mathematics","display_name":"Discrete mathematics","score":0.08932262659072876}],"concepts":[{"id":"https://openalex.org/C185429906","wikidata":"https://www.wikidata.org/wiki/Q1130160","display_name":"Estimator","level":2,"score":0.6488032341003418},{"id":"https://openalex.org/C39927690","wikidata":"https://www.wikidata.org/wiki/Q11197","display_name":"Logarithm","level":2,"score":0.5601029992103577},{"id":"https://openalex.org/C204323151","wikidata":"https://www.wikidata.org/wiki/Q905424","display_name":"Range (aeronautics)","level":2,"score":0.5231224894523621},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5195522904396057},{"id":"https://openalex.org/C117160843","wikidata":"https://www.wikidata.org/wiki/Q338652","display_name":"Sublinear function","level":2,"score":0.4965851902961731},{"id":"https://openalex.org/C129848803","wikidata":"https://www.wikidata.org/wiki/Q2564360","display_name":"Sample size determination","level":2,"score":0.4932548403739929},{"id":"https://openalex.org/C106301342","wikidata":"https://www.wikidata.org/wiki/Q4117933","display_name":"Entropy (arrow of time)","level":2,"score":0.4747388958930969},{"id":"https://openalex.org/C198531522","wikidata":"https://www.wikidata.org/wiki/Q485146","display_name":"Sample (material)","level":2,"score":0.4717855155467987},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.44532105326652527},{"id":"https://openalex.org/C149441793","wikidata":"https://www.wikidata.org/wiki/Q200726","display_name":"Probability distribution","level":2,"score":0.4392387270927429},{"id":"https://openalex.org/C2777027219","wikidata":"https://www.wikidata.org/wiki/Q1284190","display_name":"Constant (computer programming)","level":2,"score":0.4392277002334595},{"id":"https://openalex.org/C136197465","wikidata":"https://www.wikidata.org/wiki/Q1729295","display_name":"Variety (cybernetics)","level":2,"score":0.41310855746269226},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.3464096188545227},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.3413352370262146},{"id":"https://openalex.org/C118615104","wikidata":"https://www.wikidata.org/wiki/Q121416","display_name":"Discrete mathematics","level":1,"score":0.08932262659072876},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C43617362","wikidata":"https://www.wikidata.org/wiki/Q170050","display_name":"Chromatography","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C159985019","wikidata":"https://www.wikidata.org/wiki/Q181790","display_name":"Composite material","level":1,"score":0.0},{"id":"https://openalex.org/C134306372","wikidata":"https://www.wikidata.org/wiki/Q7754","display_name":"Mathematical analysis","level":1,"score":0.0},{"id":"https://openalex.org/C192562407","wikidata":"https://www.wikidata.org/wiki/Q228736","display_name":"Materials science","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3125643","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3125643","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3125643","source":{"id":"https://openalex.org/S118992489","display_name":"Journal of the ACM","issn_l":"0004-5411","issn":["0004-5411","1557-735X"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of the ACM","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1145/3125643","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3125643","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3125643","source":{"id":"https://openalex.org/S118992489","display_name":"Journal of the ACM","issn_l":"0004-5411","issn":["0004-5411","1557-735X"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of the ACM","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G2723285636","display_name":"III: Medium: Quantifying the Unknown Unknowns for Data Integration","funder_award_id":"1562657","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G3882767772","display_name":null,"funder_award_id":"IIS-1562657","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G6541910588","display_name":"CAREER: Algorithms for understanding data","funder_award_id":"1351108","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G6671297155","display_name":null,"funder_award_id":"CAREER","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G6851672665","display_name":null,"funder_award_id":"CCF-1351108","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"}],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W2762763087.pdf","grobid_xml":"https://content.openalex.org/works/W2762763087.grobid-xml"},"referenced_works_count":53,"referenced_works":["https://openalex.org/W1538444754","https://openalex.org/W1585701772","https://openalex.org/W1595409123","https://openalex.org/W1595687138","https://openalex.org/W1654945559","https://openalex.org/W1873671735","https://openalex.org/W1947089506","https://openalex.org/W1971405816","https://openalex.org/W1980179247","https://openalex.org/W1982516282","https://openalex.org/W1982918157","https://openalex.org/W1987754412","https://openalex.org/W1988624553","https://openalex.org/W1989151402","https://openalex.org/W1992068214","https://openalex.org/W2001947543","https://openalex.org/W2002001845","https://openalex.org/W2022257958","https://openalex.org/W2045740840","https://openalex.org/W2058991275","https://openalex.org/W2066257886","https://openalex.org/W2069241007","https://openalex.org/W2073479529","https://openalex.org/W2076381458","https://openalex.org/W2078764670","https://openalex.org/W2079473986","https://openalex.org/W2082092506","https://openalex.org/W2094608047","https://openalex.org/W2095306947","https://openalex.org/W2097580994","https://openalex.org/W2101985079","https://openalex.org/W2114771311","https://openalex.org/W2124055802","https://openalex.org/W2127090196","https://openalex.org/W2134169350","https://openalex.org/W2135247172","https://openalex.org/W2135827220","https://openalex.org/W2146368895","https://openalex.org/W2151835418","https://openalex.org/W2159784709","https://openalex.org/W2187447858","https://openalex.org/W2411951720","https://openalex.org/W2419099043","https://openalex.org/W2510474575","https://openalex.org/W2545606300","https://openalex.org/W2554091414","https://openalex.org/W2899702797","https://openalex.org/W2949196898","https://openalex.org/W2963608890","https://openalex.org/W4233471163","https://openalex.org/W4255005259","https://openalex.org/W4256422854","https://openalex.org/W4285719527"],"related_works":["https://openalex.org/W90906771","https://openalex.org/W2018828772","https://openalex.org/W2529185025","https://openalex.org/W2052708136","https://openalex.org/W2809723425","https://openalex.org/W2005302727","https://openalex.org/W4289097813","https://openalex.org/W3082028334","https://openalex.org/W1973725449","https://openalex.org/W4310831791"],"abstract_inverted_index":{"We":[0,178],"show":[1],"that":[2,91,221],"a":[3,34,40,62,100,106,112,149,198,216,228],"class":[4],"of":[5,8,20,28,43,64,102,108,115,134,140,171,175,184,188],"statistical":[6,235],"properties":[7,15,56],"distributions,":[9,29,110],"which":[10,165],"includes":[11],"such":[12,157],"practically":[13],"relevant":[14],"as":[16,143,145,159,196,227],"entropy,":[17],"the":[18,127,131,135,141,160,167,172,176,182,185,189,212],"number":[19],"distinct":[21,53],"elements,":[22,54],"and":[23,201,234],"distance":[24],"metrics":[25],"between":[26],"pairs":[27],"can":[30,57,193],"be":[31,58,194,224],"estimated":[32,59],"given":[33,39],"sublinear":[35],"sized":[36],"sample.":[37,153],"Specifically,":[38],"sample":[41,63,128,213],"consisting":[42],"independent":[44],"draws":[45],"from":[46],"any":[47],"distribution":[48,142],"over":[49],"at":[50],"most":[51],"k":[52,68,70],"these":[55,73,85],"accurately":[60,144],"using":[61],"size":[65,214],"O":[66],"(":[67],"log":[69],").":[71],"For":[72],"estimation":[74,103,163],"tasks,":[75,104],"this":[76,138],"performance":[77],"is":[78,123],"optimal":[79],",":[80],"to":[81,124,129,180],"constant":[82],"factors.":[83],"Complementing":[84],"theoretical":[86],"results,":[87],"we":[88,219],"also":[89],"demonstrate":[90],"our":[92,121],"estimators":[93],"perform":[94],"exceptionally":[95],"well,":[96],"in":[97,120],"practice,":[98],"for":[99,111,206],"variety":[101,107],"on":[105],"natural":[109],"wide":[113],"range":[114],"parameters.":[116],"The":[117],"key":[118],"step":[119],"approach":[122],"first":[125],"use":[126],"characterize":[130],"\u201cunseen\u201d":[132],"portion":[133,139,174,187],"distribution\u2014effectively":[136],"reconstructing":[137],"if":[146],"one":[147],"had":[148],"logarithmic":[150,217],"factor":[151],"larger":[152,231],"This":[154,191],"goes":[155],"beyond":[156],"tools":[158],"Good-Turing":[161],"frequency":[162],"scheme,":[164],"estimates":[166],"total":[168],"probability":[169],"mass":[170],"unobserved":[173,186],"distribution:":[177],"seek":[179],"estimate":[181],"shape":[183],"distribution.":[190],"work":[192],"seen":[195],"introducing":[197],"robust,":[199],"general,":[200],"theoretically":[202],"principled":[203],"framework":[204],"that,":[205],"many":[207],"practical":[208],"applications,":[209],"essentially":[210],"amplifies":[211],"by":[215],"factor;":[218],"expect":[220],"it":[222],"may":[223],"fruitfully":[225],"used":[226],"component":[229],"within":[230],"machine":[232],"learning":[233],"analysis":[236],"systems.":[237]},"counts_by_year":[{"year":2025,"cited_by_count":11},{"year":2024,"cited_by_count":11},{"year":2023,"cited_by_count":8},{"year":2022,"cited_by_count":6},{"year":2021,"cited_by_count":6},{"year":2020,"cited_by_count":5},{"year":2019,"cited_by_count":8},{"year":2018,"cited_by_count":2},{"year":2016,"cited_by_count":2}],"updated_date":"2026-04-21T08:09:41.155169","created_date":"2025-10-10T00:00:00"}
