{"id":"https://openalex.org/W2106592976","doi":"https://doi.org/10.1145/1526709.1526866","title":"Threshold selection for web-page classification with highly skewed class distribution","display_name":"Threshold selection for web-page classification with highly skewed class distribution","publication_year":2009,"publication_date":"2009-04-20","ids":{"openalex":"https://openalex.org/W2106592976","doi":"https://doi.org/10.1145/1526709.1526866","mag":"2106592976"},"language":"en","primary_location":{"id":"doi:10.1145/1526709.1526866","is_oa":false,"landing_page_url":"https://doi.org/10.1145/1526709.1526866","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 18th international conference on World wide web","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101434435","display_name":"Xiaofeng He","orcid":"https://orcid.org/0000-0002-6911-348X"},"institutions":[{"id":"https://openalex.org/I4210134091","display_name":"Yahoo (United States)","ror":"https://ror.org/040dkzz12","country_code":"US","type":"company","lineage":["https://openalex.org/I4210134091"]},{"id":"https://openalex.org/I1325784139","display_name":"Yahoo (United Kingdom)","ror":"https://ror.org/038p3gq39","country_code":"GB","type":"company","lineage":["https://openalex.org/I1325784139","https://openalex.org/I4210134091"]}],"countries":["GB","US"],"is_corresponding":true,"raw_author_name":"Xiaofeng He","raw_affiliation_strings":["Yahoo! Inc., Santa Clara, CA, USA","[Yahoo, Inc., Santa Clara, CA, USA]"],"affiliations":[{"raw_affiliation_string":"Yahoo! Inc., Santa Clara, CA, USA","institution_ids":["https://openalex.org/I4210134091"]},{"raw_affiliation_string":"[Yahoo, Inc., Santa Clara, CA, USA]","institution_ids":["https://openalex.org/I1325784139"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103153652","display_name":"Lei Duan","orcid":"https://orcid.org/0000-0001-7254-1832"},"institutions":[{"id":"https://openalex.org/I1325784139","display_name":"Yahoo (United Kingdom)","ror":"https://ror.org/038p3gq39","country_code":"GB","type":"company","lineage":["https://openalex.org/I1325784139","https://openalex.org/I4210134091"]},{"id":"https://openalex.org/I4210134091","display_name":"Yahoo (United States)","ror":"https://ror.org/040dkzz12","country_code":"US","type":"company","lineage":["https://openalex.org/I4210134091"]}],"countries":["GB","US"],"is_corresponding":false,"raw_author_name":"Lei Duan","raw_affiliation_strings":["Yahoo! Inc., Santa Clara, CA, USA","[Yahoo, Inc., Santa Clara, CA, USA]"],"affiliations":[{"raw_affiliation_string":"Yahoo! Inc., Santa Clara, CA, USA","institution_ids":["https://openalex.org/I4210134091"]},{"raw_affiliation_string":"[Yahoo, Inc., Santa Clara, CA, USA]","institution_ids":["https://openalex.org/I1325784139"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5026132447","display_name":"Yiping Zhou","orcid":"https://orcid.org/0000-0002-0486-758X"},"institutions":[{"id":"https://openalex.org/I1325784139","display_name":"Yahoo (United Kingdom)","ror":"https://ror.org/038p3gq39","country_code":"GB","type":"company","lineage":["https://openalex.org/I1325784139","https://openalex.org/I4210134091"]},{"id":"https://openalex.org/I4210134091","display_name":"Yahoo (United States)","ror":"https://ror.org/040dkzz12","country_code":"US","type":"company","lineage":["https://openalex.org/I4210134091"]}],"countries":["GB","US"],"is_corresponding":false,"raw_author_name":"Yiping Zhou","raw_affiliation_strings":["Yahoo! Inc., Santa Clara, CA, USA","[Yahoo, Inc., Santa Clara, CA, USA]"],"affiliations":[{"raw_affiliation_string":"Yahoo! Inc., Santa Clara, CA, USA","institution_ids":["https://openalex.org/I4210134091"]},{"raw_affiliation_string":"[Yahoo, Inc., Santa Clara, CA, USA]","institution_ids":["https://openalex.org/I1325784139"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5031458509","display_name":"Byron Dom","orcid":null},"institutions":[{"id":"https://openalex.org/I4210134091","display_name":"Yahoo (United States)","ror":"https://ror.org/040dkzz12","country_code":"US","type":"company","lineage":["https://openalex.org/I4210134091"]},{"id":"https://openalex.org/I1325784139","display_name":"Yahoo (United Kingdom)","ror":"https://ror.org/038p3gq39","country_code":"GB","type":"company","lineage":["https://openalex.org/I1325784139","https://openalex.org/I4210134091"]}],"countries":["GB","US"],"is_corresponding":false,"raw_author_name":"Byron Dom","raw_affiliation_strings":["Yahoo! Inc., Santa Clara, CA, USA","[Yahoo, Inc., Santa Clara, CA, USA]"],"affiliations":[{"raw_affiliation_string":"Yahoo! Inc., Santa Clara, CA, USA","institution_ids":["https://openalex.org/I4210134091"]},{"raw_affiliation_string":"[Yahoo, Inc., Santa Clara, CA, USA]","institution_ids":["https://openalex.org/I1325784139"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5101434435"],"corresponding_institution_ids":["https://openalex.org/I1325784139","https://openalex.org/I4210134091"],"apc_list":null,"apc_paid":null,"fwci":1.3549,"has_fulltext":false,"cited_by_count":7,"citation_normalized_percentile":{"value":0.85018774,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":96},"biblio":{"volume":null,"issue":null,"first_page":"1081","last_page":"1082"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11550","display_name":"Text and Document Classification Technologies","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11550","display_name":"Text and Document Classification Technologies","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11652","display_name":"Imbalanced Data Classification Techniques","score":0.9947999715805054,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11644","display_name":"Spam and Phishing Detection","score":0.991100013256073,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6302649974822998},{"id":"https://openalex.org/keywords/selection","display_name":"Selection (genetic algorithm)","score":0.6101363897323608},{"id":"https://openalex.org/keywords/binary-classification","display_name":"Binary classification","score":0.5374079346656799},{"id":"https://openalex.org/keywords/sample","display_name":"Sample (material)","score":0.5294777154922485},{"id":"https://openalex.org/keywords/variance","display_name":"Variance (accounting)","score":0.5268912315368652},{"id":"https://openalex.org/keywords/sampling","display_name":"Sampling (signal processing)","score":0.5165156722068787},{"id":"https://openalex.org/keywords/class","display_name":"Class (philosophy)","score":0.5086160898208618},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.5029410719871521},{"id":"https://openalex.org/keywords/stratified-sampling","display_name":"Stratified sampling","score":0.47446209192276},{"id":"https://openalex.org/keywords/binary-number","display_name":"Binary number","score":0.4734484851360321},{"id":"https://openalex.org/keywords/sample-size-determination","display_name":"Sample size determination","score":0.4658281207084656},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.45617246627807617},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.451386034488678},{"id":"https://openalex.org/keywords/statistics","display_name":"Statistics","score":0.3715638816356659},{"id":"https://openalex.org/keywords/pattern-recognition","display_name":"Pattern recognition (psychology)","score":0.35764771699905396},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.3284671902656555},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.3236387372016907},{"id":"https://openalex.org/keywords/support-vector-machine","display_name":"Support vector machine","score":0.08014199137687683}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6302649974822998},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.6101363897323608},{"id":"https://openalex.org/C66905080","wikidata":"https://www.wikidata.org/wiki/Q17005494","display_name":"Binary classification","level":3,"score":0.5374079346656799},{"id":"https://openalex.org/C198531522","wikidata":"https://www.wikidata.org/wiki/Q485146","display_name":"Sample (material)","level":2,"score":0.5294777154922485},{"id":"https://openalex.org/C196083921","wikidata":"https://www.wikidata.org/wiki/Q7915758","display_name":"Variance (accounting)","level":2,"score":0.5268912315368652},{"id":"https://openalex.org/C140779682","wikidata":"https://www.wikidata.org/wiki/Q210868","display_name":"Sampling (signal processing)","level":3,"score":0.5165156722068787},{"id":"https://openalex.org/C2777212361","wikidata":"https://www.wikidata.org/wiki/Q5127848","display_name":"Class (philosophy)","level":2,"score":0.5086160898208618},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.5029410719871521},{"id":"https://openalex.org/C49898467","wikidata":"https://www.wikidata.org/wiki/Q1517706","display_name":"Stratified sampling","level":2,"score":0.47446209192276},{"id":"https://openalex.org/C48372109","wikidata":"https://www.wikidata.org/wiki/Q3913","display_name":"Binary number","level":2,"score":0.4734484851360321},{"id":"https://openalex.org/C129848803","wikidata":"https://www.wikidata.org/wiki/Q2564360","display_name":"Sample size determination","level":2,"score":0.4658281207084656},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.45617246627807617},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.451386034488678},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.3715638816356659},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.35764771699905396},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3284671902656555},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.3236387372016907},{"id":"https://openalex.org/C12267149","wikidata":"https://www.wikidata.org/wiki/Q282453","display_name":"Support vector machine","level":2,"score":0.08014199137687683},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C94375191","wikidata":"https://www.wikidata.org/wiki/Q11205","display_name":"Arithmetic","level":1,"score":0.0},{"id":"https://openalex.org/C106131492","wikidata":"https://www.wikidata.org/wiki/Q3072260","display_name":"Filter (signal processing)","level":2,"score":0.0},{"id":"https://openalex.org/C144133560","wikidata":"https://www.wikidata.org/wiki/Q4830453","display_name":"Business","level":0,"score":0.0},{"id":"https://openalex.org/C121955636","wikidata":"https://www.wikidata.org/wiki/Q4116214","display_name":"Accounting","level":1,"score":0.0},{"id":"https://openalex.org/C43617362","wikidata":"https://www.wikidata.org/wiki/Q170050","display_name":"Chromatography","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1145/1526709.1526866","is_oa":false,"landing_page_url":"https://doi.org/10.1145/1526709.1526866","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 18th international conference on World wide web","raw_type":"proceedings-article"},{"id":"pmh:oai:CiteSeerX.psu:10.1.1.215.1523","is_oa":false,"landing_page_url":"http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.215.1523","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"http://www2009.org/proceedings/pdf/p1081.pdf","raw_type":"text"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":1,"referenced_works":["https://openalex.org/W1979495886"],"related_works":["https://openalex.org/W2142706325","https://openalex.org/W2373268560","https://openalex.org/W1495904142","https://openalex.org/W2356880469","https://openalex.org/W2348862395","https://openalex.org/W2164051890","https://openalex.org/W2770035612","https://openalex.org/W4293088233","https://openalex.org/W2154622657","https://openalex.org/W2221502901"],"abstract_inverted_index":{"We":[0],"propose":[1],"a":[2,51,88,97],"novel":[3,98],"cost-efficient":[4],"approach":[5,99],"to":[6,49,111],"threshold":[7,41,86,134,139],"selection":[8,87],"for":[9,40,133],"binary":[10],"web-page":[11,119],"classification":[12],"problems":[13],"with":[14],"imbalanced":[15],"class":[16,115],"distributions.":[17],"In":[18,29],"many":[19],"binary-classification":[20],"tasks":[21],"the":[22,58,62,75,113,118,131],"distribution":[23,116],"of":[24,55,57,77,93,117],"classes":[25],"is":[26,68,96],"highly":[27],"skewed.":[28],"such":[30,148],"problems,":[31],"using":[32,126,142],"uniform":[33],"random":[34],"sampling":[35],"in":[36,47,104,137],"constructing":[37],"sample":[38,45,78],"sets":[39,79],"setting":[42,135],"requires":[43],"large":[44],"sizes":[46],"order":[48],"include":[50],"statistically":[52],"sufficient":[53],"number":[54],"examples":[56,67,108],"minority":[59],"class.":[60],"On":[61],"other":[63,143],"hand,":[64],"manually":[65,106],"labeling":[66],"expensive":[69],"and":[70,151],"budgetary":[71],"considerations":[72],"require":[73],"that":[74,125],"size":[76],"be":[80],"limited.":[81],"These":[82],"conflicting":[83],"requirements":[84],"make":[85],"challenging":[89],"problem.":[90],"Our":[91,121],"method":[92],"sample-set":[94],"construction":[95],"based":[100],"on":[101],"stratified":[102],"sampling,":[103],"which":[105],"labeled":[107],"are":[109],"expanded":[110],"reflect":[112],"true":[114],"population.":[120],"experimental":[122],"results":[123,136],"show":[124],"false":[127],"positive":[128],"rate":[129],"as":[130,149],"criterion":[132],"lower-variance":[138],"estimates":[140],"than":[141],"widely":[144],"used":[145],"accuracy":[146],"measures":[147],"F1":[150],"precision.":[152]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2014,"cited_by_count":2},{"year":2013,"cited_by_count":1},{"year":2012,"cited_by_count":2}],"updated_date":"2026-04-05T17:49:38.594831","created_date":"2025-10-10T00:00:00"}
