{"id":"https://openalex.org/W3035303465","doi":"https://doi.org/10.1145/3397271.3401212","title":"Sampling Bias Due to Near-Duplicates in Learning to Rank","display_name":"Sampling Bias Due to Near-Duplicates in Learning to Rank","publication_year":2020,"publication_date":"2020-07-25","ids":{"openalex":"https://openalex.org/W3035303465","doi":"https://doi.org/10.1145/3397271.3401212","mag":"3035303465"},"language":"en","primary_location":{"id":"doi:10.1145/3397271.3401212","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3397271.3401212","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 43rd International ACM SIGIR Conference on Research and Development in Information Retrieval","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5068616222","display_name":"Maik Fr\u00f6be","orcid":"https://orcid.org/0000-0002-1003-981X"},"institutions":[{"id":"https://openalex.org/I68956291","display_name":"Martin Luther University Halle-Wittenberg","ror":"https://ror.org/05gqaka33","country_code":"DE","type":"education","lineage":["https://openalex.org/I68956291"]}],"countries":["DE"],"is_corresponding":true,"raw_author_name":"Maik Fr\u00f6be","raw_affiliation_strings":["Martin-Luther-Universit\u00e4t Halle-Wittenberg, Halle, Germany"],"affiliations":[{"raw_affiliation_string":"Martin-Luther-Universit\u00e4t Halle-Wittenberg, Halle, Germany","institution_ids":["https://openalex.org/I68956291"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5079524270","display_name":"Janek Bevendorff","orcid":"https://orcid.org/0000-0002-3797-0559"},"institutions":[{"id":"https://openalex.org/I51441396","display_name":"Bauhaus-Universit\u00e4t Weimar","ror":"https://ror.org/033bb5z47","country_code":"DE","type":"education","lineage":["https://openalex.org/I51441396"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Janek Bevendorff","raw_affiliation_strings":["Bauhaus-Universit\u00e4t Weimar, Weimar, Germany"],"affiliations":[{"raw_affiliation_string":"Bauhaus-Universit\u00e4t Weimar, Weimar, Germany","institution_ids":["https://openalex.org/I51441396"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5058247097","display_name":"Jan Heinrich Reimer","orcid":"https://orcid.org/0000-0003-1992-8696"},"institutions":[{"id":"https://openalex.org/I68956291","display_name":"Martin Luther University Halle-Wittenberg","ror":"https://ror.org/05gqaka33","country_code":"DE","type":"education","lineage":["https://openalex.org/I68956291"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Jan Heinrich Reimer","raw_affiliation_strings":["Martin-Luther-Universit\u00e4t Halle-Wittenberg, Halle, Germany"],"affiliations":[{"raw_affiliation_string":"Martin-Luther-Universit\u00e4t Halle-Wittenberg, Halle, Germany","institution_ids":["https://openalex.org/I68956291"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5083712311","display_name":"Martin Potthast","orcid":"https://orcid.org/0000-0003-2451-0665"},"institutions":[{"id":"https://openalex.org/I926574661","display_name":"Leipzig University","ror":"https://ror.org/03s7gtk40","country_code":"DE","type":"education","lineage":["https://openalex.org/I926574661"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Martin Potthast","raw_affiliation_strings":["Leipzig University, Leipzig, Germany"],"affiliations":[{"raw_affiliation_string":"Leipzig University, Leipzig, Germany","institution_ids":["https://openalex.org/I926574661"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5014322854","display_name":"Matthias Hagen","orcid":"https://orcid.org/0000-0002-9733-2890"},"institutions":[{"id":"https://openalex.org/I68956291","display_name":"Martin Luther University Halle-Wittenberg","ror":"https://ror.org/05gqaka33","country_code":"DE","type":"education","lineage":["https://openalex.org/I68956291"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Matthias Hagen","raw_affiliation_strings":["Martin-Luther-Universit\u00e4t Halle-Wittenberg, Halle, Germany"],"affiliations":[{"raw_affiliation_string":"Martin-Luther-Universit\u00e4t Halle-Wittenberg, Halle, Germany","institution_ids":["https://openalex.org/I68956291"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5068616222"],"corresponding_institution_ids":["https://openalex.org/I68956291"],"apc_list":null,"apc_paid":null,"fwci":3.4698,"has_fulltext":false,"cited_by_count":15,"citation_normalized_percentile":{"value":0.93749446,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":90,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"1997","last_page":"2000"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10286","display_name":"Information Retrieval and Search Behavior","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10286","display_name":"Information Retrieval and Search Behavior","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9976999759674072,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.9936000108718872,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/overfitting","display_name":"Overfitting","score":0.7819689512252808},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7779666185379028},{"id":"https://openalex.org/keywords/pairwise-comparison","display_name":"Pairwise comparison","score":0.7623540759086609},{"id":"https://openalex.org/keywords/pointwise","display_name":"Pointwise","score":0.7132371664047241},{"id":"https://openalex.org/keywords/relevance","display_name":"Relevance (law)","score":0.6081916689872742},{"id":"https://openalex.org/keywords/ranking","display_name":"Ranking (information retrieval)","score":0.6028687953948975},{"id":"https://openalex.org/keywords/sampling","display_name":"Sampling (signal processing)","score":0.577966034412384},{"id":"https://openalex.org/keywords/learning-to-rank","display_name":"Learning to rank","score":0.49011215567588806},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.47210636734962463},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.46252474188804626},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4391290545463562},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.4296036958694458},{"id":"https://openalex.org/keywords/construct","display_name":"Construct (python library)","score":0.4140564203262329},{"id":"https://openalex.org/keywords/rank","display_name":"Rank (graph theory)","score":0.4133393168449402},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.10427600145339966}],"concepts":[{"id":"https://openalex.org/C22019652","wikidata":"https://www.wikidata.org/wiki/Q331309","display_name":"Overfitting","level":3,"score":0.7819689512252808},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7779666185379028},{"id":"https://openalex.org/C184898388","wikidata":"https://www.wikidata.org/wiki/Q1435712","display_name":"Pairwise comparison","level":2,"score":0.7623540759086609},{"id":"https://openalex.org/C2777984123","wikidata":"https://www.wikidata.org/wiki/Q9248237","display_name":"Pointwise","level":2,"score":0.7132371664047241},{"id":"https://openalex.org/C158154518","wikidata":"https://www.wikidata.org/wiki/Q7310970","display_name":"Relevance (law)","level":2,"score":0.6081916689872742},{"id":"https://openalex.org/C189430467","wikidata":"https://www.wikidata.org/wiki/Q7293293","display_name":"Ranking (information retrieval)","level":2,"score":0.6028687953948975},{"id":"https://openalex.org/C140779682","wikidata":"https://www.wikidata.org/wiki/Q210868","display_name":"Sampling (signal processing)","level":3,"score":0.577966034412384},{"id":"https://openalex.org/C86037889","wikidata":"https://www.wikidata.org/wiki/Q4330127","display_name":"Learning to rank","level":3,"score":0.49011215567588806},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.47210636734962463},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.46252474188804626},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4391290545463562},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.4296036958694458},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.4140564203262329},{"id":"https://openalex.org/C164226766","wikidata":"https://www.wikidata.org/wiki/Q7293202","display_name":"Rank (graph theory)","level":2,"score":0.4133393168449402},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.10427600145339966},{"id":"https://openalex.org/C134306372","wikidata":"https://www.wikidata.org/wiki/Q7754","display_name":"Mathematical analysis","level":1,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C114614502","wikidata":"https://www.wikidata.org/wiki/Q76592","display_name":"Combinatorics","level":1,"score":0.0},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.0},{"id":"https://openalex.org/C106131492","wikidata":"https://www.wikidata.org/wiki/Q3072260","display_name":"Filter (signal processing)","level":2,"score":0.0},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.0},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3397271.3401212","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3397271.3401212","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 43rd International ACM SIGIR Conference on Research and Development in Information Retrieval","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","score":0.4399999976158142,"id":"https://metadata.un.org/sdg/4"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":31,"referenced_works":["https://openalex.org/W60779023","https://openalex.org/W361155034","https://openalex.org/W1530210183","https://openalex.org/W1685426458","https://openalex.org/W1990589796","https://openalex.org/W2031160476","https://openalex.org/W2032536435","https://openalex.org/W2037022968","https://openalex.org/W2047804176","https://openalex.org/W2064392127","https://openalex.org/W2082146655","https://openalex.org/W2083730062","https://openalex.org/W2139532006","https://openalex.org/W2162059449","https://openalex.org/W2340526403","https://openalex.org/W2507134384","https://openalex.org/W2740321901","https://openalex.org/W2769473018","https://openalex.org/W2884475480","https://openalex.org/W2889742741","https://openalex.org/W2891508226","https://openalex.org/W2945127593","https://openalex.org/W2951534261","https://openalex.org/W2953290265","https://openalex.org/W3000055169","https://openalex.org/W3012600133","https://openalex.org/W3015779532","https://openalex.org/W3099446234","https://openalex.org/W3101148092","https://openalex.org/W3102882112","https://openalex.org/W4285719527"],"related_works":["https://openalex.org/W2112815677","https://openalex.org/W2767338541","https://openalex.org/W3216317163","https://openalex.org/W2011472225","https://openalex.org/W3000057026","https://openalex.org/W4313908216","https://openalex.org/W3048565508","https://openalex.org/W3163984363","https://openalex.org/W3127142483","https://openalex.org/W4385565564"],"abstract_inverted_index":{"Learning":[0],"to":[1,42],"rank~(LTR)":[2],"is":[3],"the":[4,30,33,43,51,78,120],"de":[5],"facto":[6],"standard":[7],"for":[8,29,119],"web":[9,48],"search,":[10],"improving":[11],"upon":[12],"classical":[13],"retrieval":[14],"models":[15],"by":[16],"exploiting":[17],"(in)direct":[18],"relevance":[19,57],"feedback":[20,58],"from":[21],"user":[22],"judgments,":[23],"interaction":[24],"logs,":[25],"etc.":[26],"We":[27,86],"investigate":[28],"first":[31],"time":[32],"effect":[34],"of":[35,46,59,73,84,122],"a":[36,71,116],"sampling":[37],"bias":[38],"on":[39,77,96],"LTR~models":[40],"due":[41],"potential":[44],"presence":[45],"near-duplicate":[47],"pages":[49],"in":[50],"training":[52,126],"data,":[53],"and":[54,89,100,110,127],"how":[55],"(in)consistent":[56],"duplicates":[60],"influences":[61],"an":[62],"LTR~model's":[63],"decisions.":[64],"To":[65],"examine":[66],"this":[67],"bias,":[68],"we":[69],"construct":[70],"series":[72],"specialized":[74],"LTR~datasets":[75],"based":[76],"ClueWeb09":[79],"corpus":[80],"with":[81],"varying":[82],"amounts":[83],"near-duplicates.":[85],"devise":[87],"worst-case":[88],"average-case":[90],"train/test":[91],"splits":[92],"that":[93,106],"are":[94],"evaluated":[95],"popular":[97],"pointwise,":[98],"pairwise,":[99],"listwise":[101],"LTR~models.":[102],"Our":[103],"experiments":[104],"demonstrate":[105],"duplication":[107],"causes":[108],"overfitting":[109],"thus":[111],"less":[112],"effective":[113],"models,":[114],"making":[115],"strong":[117],"case":[118],"benefits":[121],"systematic":[123],"deduplication":[124],"before":[125],"model":[128],"evaluation.":[129]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":3},{"year":2022,"cited_by_count":3},{"year":2021,"cited_by_count":7}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
