{"id":"https://openalex.org/W2041912938","doi":"https://doi.org/10.1145/1007568.1007601","title":"A bi-level Bernoulli scheme for database sampling","display_name":"A bi-level Bernoulli scheme for database sampling","publication_year":2004,"publication_date":"2004-06-13","ids":{"openalex":"https://openalex.org/W2041912938","doi":"https://doi.org/10.1145/1007568.1007601","mag":"2041912938"},"language":"en","primary_location":{"id":"doi:10.1145/1007568.1007601","is_oa":false,"landing_page_url":"https://doi.org/10.1145/1007568.1007601","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2004 ACM SIGMOD international conference on Management of data","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5090729930","display_name":"Peter J. Haas","orcid":"https://orcid.org/0000-0001-5694-3065"},"institutions":[{"id":"https://openalex.org/I1341412227","display_name":"IBM (United States)","ror":"https://ror.org/05hh8d621","country_code":"US","type":"company","lineage":["https://openalex.org/I1341412227"]},{"id":"https://openalex.org/I4210085935","display_name":"IBM Research - Almaden","ror":"https://ror.org/005w8dd04","country_code":"US","type":"facility","lineage":["https://openalex.org/I1341412227","https://openalex.org/I4210085935","https://openalex.org/I4210114115"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Peter J. Haas","raw_affiliation_strings":["IBM Almaden Research Center","IBM Almaden Research Center,#TAB#"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"IBM Almaden Research Center","institution_ids":["https://openalex.org/I4210085935"]},{"raw_affiliation_string":"IBM Almaden Research Center,#TAB#","institution_ids":["https://openalex.org/I1341412227"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5053657322","display_name":"Christian K\u00f6nig","orcid":"https://orcid.org/0000-0003-0585-5983"},"institutions":[{"id":"https://openalex.org/I1341412227","display_name":"IBM (United States)","ror":"https://ror.org/05hh8d621","country_code":"US","type":"company","lineage":["https://openalex.org/I1341412227"]},{"id":"https://openalex.org/I4210102616","display_name":"Merz Akademie","ror":"https://ror.org/0167r2g29","country_code":"DE","type":"education","lineage":["https://openalex.org/I4210102616"]}],"countries":["DE","US"],"is_corresponding":false,"raw_author_name":"Christian K\u00f6nig","raw_affiliation_strings":["Berufsakademie Stuttgart/IBM Germany","Berufsakademie Stuttgart/IBM Germany#TAB#"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Berufsakademie Stuttgart/IBM Germany","institution_ids":["https://openalex.org/I4210102616"]},{"raw_affiliation_string":"Berufsakademie Stuttgart/IBM Germany#TAB#","institution_ids":["https://openalex.org/I1341412227"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":2,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":5.24,"has_fulltext":false,"cited_by_count":78,"citation_normalized_percentile":{"value":0.95792624,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":90,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"275","last_page":"286"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10317","display_name":"Advanced Database Systems and Queries","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10317","display_name":"Advanced Database Systems and Queries","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11106","display_name":"Data Management and Algorithms","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12761","display_name":"Data Stream Mining Techniques","score":0.9842000007629395,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/sampling","display_name":"Sampling (signal processing)","score":0.7794725298881531},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.725698709487915},{"id":"https://openalex.org/keywords/heuristic","display_name":"Heuristic","score":0.5125886797904968},{"id":"https://openalex.org/keywords/sample","display_name":"Sample (material)","score":0.4767911434173584},{"id":"https://openalex.org/keywords/stratified-sampling","display_name":"Stratified sampling","score":0.47361111640930176},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.42919665575027466},{"id":"https://openalex.org/keywords/database","display_name":"Database","score":0.38829588890075684},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.3766986131668091},{"id":"https://openalex.org/keywords/statistics","display_name":"Statistics","score":0.3252245783805847},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.20120248198509216},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.12147855758666992}],"concepts":[{"id":"https://openalex.org/C140779682","wikidata":"https://www.wikidata.org/wiki/Q210868","display_name":"Sampling (signal processing)","level":3,"score":0.7794725298881531},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.725698709487915},{"id":"https://openalex.org/C173801870","wikidata":"https://www.wikidata.org/wiki/Q201413","display_name":"Heuristic","level":2,"score":0.5125886797904968},{"id":"https://openalex.org/C198531522","wikidata":"https://www.wikidata.org/wiki/Q485146","display_name":"Sample (material)","level":2,"score":0.4767911434173584},{"id":"https://openalex.org/C49898467","wikidata":"https://www.wikidata.org/wiki/Q1517706","display_name":"Stratified sampling","level":2,"score":0.47361111640930176},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.42919665575027466},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.38829588890075684},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.3766986131668091},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.3252245783805847},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.20120248198509216},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.12147855758666992},{"id":"https://openalex.org/C106131492","wikidata":"https://www.wikidata.org/wiki/Q3072260","display_name":"Filter (signal processing)","level":2,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.0},{"id":"https://openalex.org/C43617362","wikidata":"https://www.wikidata.org/wiki/Q170050","display_name":"Chromatography","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1145/1007568.1007601","is_oa":false,"landing_page_url":"https://doi.org/10.1145/1007568.1007601","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2004 ACM SIGMOD international conference on Management of data","raw_type":"proceedings-article"},{"id":"pmh:oai:CiteSeerX.psu:10.1.1.130.5947","is_oa":false,"landing_page_url":"http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.130.5947","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"http://www.cs.uiuc.edu/class/fa05/cs591han/sigmodpods04/sigmod/pdf/r-148.pdf","raw_type":"text"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Partnerships for the goals","id":"https://metadata.un.org/sdg/17","score":0.4099999964237213}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":14,"referenced_works":["https://openalex.org/W1592355944","https://openalex.org/W1964857063","https://openalex.org/W1988144572","https://openalex.org/W2020584928","https://openalex.org/W2029685080","https://openalex.org/W2062506051","https://openalex.org/W2090403603","https://openalex.org/W2152828142","https://openalex.org/W2169982181","https://openalex.org/W2296677182","https://openalex.org/W2799250126","https://openalex.org/W4237172715","https://openalex.org/W4241185933","https://openalex.org/W6635371082"],"related_works":["https://openalex.org/W2387471420","https://openalex.org/W3042964584","https://openalex.org/W2356880469","https://openalex.org/W2383809451","https://openalex.org/W4206840145","https://openalex.org/W2370014976","https://openalex.org/W2350399852","https://openalex.org/W2378407977","https://openalex.org/W3047864323","https://openalex.org/W2075598034"],"abstract_inverted_index":{"Current":[0],"database":[1,201],"sampling":[2,12,23,31,76,88,101,114,136,152,193,212,218,260,284],"methods":[3,32],"give":[4,205],"the":[5,27,41,44,46,69,79,134,140,159,176,179,211,214,231,248,256,272,290,295],"user":[6,47],"insufficient":[7],"control":[8],"when":[9,104,118],"processing":[10,52],"ISO-style":[11],"queries.":[13],"To":[14],"address":[15],"this":[16],"problem,":[17],"we":[18,128,143,203],"provide":[19],"a":[20,63,122,145,188,206,221,263,280],"bi-level":[21,75],"Bernoulli":[22],"scheme":[24],"that":[25,130,150,227,247],"combines":[26],"row-level":[28,100],"and":[29,54,77,165,178,181,242,258,286],"page-level":[30,113],"currently":[33],"used":[34,117],"in":[35,131,199,230,289],"most":[36,291],"commercial":[37,200],"systems.":[38],"By":[39],"adjusting":[40],"parameters":[42],"of":[43,59,87,139,187,224,255,265,267,271,283],"method,":[45],"can":[48,182,194],"systematically":[49],"trade":[50],"off":[51],"speed":[53],"statistical":[55],"precision---the":[56],"appropriate":[57],"choice":[58],"parameter":[60,81],"settings":[61,82],"becomes":[62],"query":[64,177],"optimization":[65],"problem.":[66],"We":[67],"indicate":[68],"SQL":[70],"extensions":[71],"needed":[72],"to":[73,197],"support":[74],"determine":[78],"optimal":[80,135,151,268],"for":[83,209],"an":[84],"important":[85],"class":[86],"queries":[89],"with":[90],"explicit":[91],"time":[92],"or":[93,300],"accuracy":[94],"constraints.":[95],"As":[96],"might":[97],"be":[98,116,183,195],"expected,":[99],"is":[102,138,153,161,277,297],"preferable":[103],"data":[105,119,244,296],"values":[106,120],"on":[107,121,239],"each":[108],"page":[109,123],"are":[110,228],"homogeneous,":[111],"whereas":[112],"should":[115],"vary":[124],"widely.":[125],"Perhaps":[126],"surprisingly,":[127],"show":[129,246],"many":[132],"cases":[133],"policy":[137],"\"bang-bang\"":[141],"type:":[142],"identify":[144],"\"page-heterogeneity":[146],"index\"":[147],"(PHI)":[148],"such":[149],"as":[154,156,166,168],"\"row-like\"":[155],"possible":[157,169],"if":[158],"PHI":[160,172],"less":[162],"than":[163],"1":[164],"\"page-like\"":[167],"otherwise.":[170],"The":[171,274],"depends":[173],"upon":[174],"both":[175],"data,":[180],"estimated":[184],"by":[185,219],"means":[186],"pilot":[189,192,217],"sample.":[190],"Because":[191],"nontrivial":[196],"implement":[198],"systems,":[202],"also":[204],"heuristic":[207,249,275],"method":[208,215,250,276],"setting":[210],"parameters;":[213],"avoids":[216],"using":[220],"small":[222],"number":[223],"summary":[225],"statistics":[226],"maintained":[229],"system":[232],"catalog.":[233],"Results":[234],"from":[235],"over":[236,279],"1100":[237],"experiments":[238],"372":[240],"real":[241],"synthetic":[243],"sets":[245],"performs":[251,287],"optimally":[252],"about":[253,269],"half":[254],"time,":[257],"yields":[259],"errors":[261],"within":[262],"factor":[264],"2.2":[266],"93%":[270],"time.":[273],"stable":[278],"wide":[281],"range":[282],"rates":[285],"best":[288],"critical":[292],"cases,":[293],"where":[294],"highly":[298],"clustered":[299],"skewed.":[301]},"counts_by_year":[{"year":2026,"cited_by_count":2},{"year":2025,"cited_by_count":3},{"year":2024,"cited_by_count":3},{"year":2023,"cited_by_count":1},{"year":2022,"cited_by_count":1},{"year":2020,"cited_by_count":6},{"year":2019,"cited_by_count":6},{"year":2018,"cited_by_count":1},{"year":2017,"cited_by_count":3},{"year":2016,"cited_by_count":4},{"year":2015,"cited_by_count":2},{"year":2014,"cited_by_count":5},{"year":2013,"cited_by_count":1},{"year":2012,"cited_by_count":5}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
