{"id":"https://openalex.org/W3133594109","doi":"https://doi.org/10.1145/3404835.3463254","title":"Simplified Data Wrangling with ir_datasets","display_name":"Simplified Data Wrangling with ir_datasets","publication_year":2021,"publication_date":"2021-07-11","ids":{"openalex":"https://openalex.org/W3133594109","doi":"https://doi.org/10.1145/3404835.3463254","mag":"3133594109"},"language":"en","primary_location":{"id":"doi:10.1145/3404835.3463254","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3404835.3463254","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 44th International ACM SIGIR Conference on Research and Development in Information Retrieval","raw_type":"proceedings-article"},"type":"article","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2103.02280","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Sean MacAvaney","orcid":null},"institutions":[{"id":"https://openalex.org/I7882870","display_name":"University of Glasgow","ror":"https://ror.org/00vtgdb53","country_code":"GB","type":"education","lineage":["https://openalex.org/I7882870"]},{"id":"https://openalex.org/I184565670","display_name":"Georgetown University","ror":"https://ror.org/05vzafd60","country_code":"US","type":"education","lineage":["https://openalex.org/I184565670"]}],"countries":["GB","US"],"is_corresponding":true,"raw_author_name":"Sean MacAvaney","raw_affiliation_strings":["University of Glasgow &amp; Georgetown University, Glasgow, United Kingdom"],"affiliations":[{"raw_affiliation_string":"University of Glasgow &amp; Georgetown University, Glasgow, United Kingdom","institution_ids":["https://openalex.org/I184565670","https://openalex.org/I7882870"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Andrew Yates","orcid":null},"institutions":[{"id":"https://openalex.org/I4210109712","display_name":"Max Planck Institute for Informatics","ror":"https://ror.org/01w19ak89","country_code":"DE","type":"facility","lineage":["https://openalex.org/I149899117","https://openalex.org/I4210109712"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Andrew Yates","raw_affiliation_strings":["Max Planck Institute for Informatics, Saarbr\u00fccken, Germany"],"affiliations":[{"raw_affiliation_string":"Max Planck Institute for Informatics, Saarbr\u00fccken, Germany","institution_ids":["https://openalex.org/I4210109712"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Sergey Feldman","orcid":null},"institutions":[{"id":"https://openalex.org/I4210140341","display_name":"Allen Institute","ror":"https://ror.org/03cpe7c52","country_code":"US","type":"nonprofit","lineage":["https://openalex.org/I4210140341"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Sergey Feldman","raw_affiliation_strings":["Allen Institute for AI, Seattle, WA, USA"],"affiliations":[{"raw_affiliation_string":"Allen Institute for AI, Seattle, WA, USA","institution_ids":["https://openalex.org/I4210140341"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Doug Downey","orcid":null},"institutions":[{"id":"https://openalex.org/I4210140341","display_name":"Allen Institute","ror":"https://ror.org/03cpe7c52","country_code":"US","type":"nonprofit","lineage":["https://openalex.org/I4210140341"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Doug Downey","raw_affiliation_strings":["Allen Institute for AI, Seattle, WA, USA"],"affiliations":[{"raw_affiliation_string":"Allen Institute for AI, Seattle, WA, USA","institution_ids":["https://openalex.org/I4210140341"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Arman Cohan","orcid":null},"institutions":[{"id":"https://openalex.org/I4210140341","display_name":"Allen Institute","ror":"https://ror.org/03cpe7c52","country_code":"US","type":"nonprofit","lineage":["https://openalex.org/I4210140341"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Arman Cohan","raw_affiliation_strings":["Allen Institute for AI, Seattle, WA, USA"],"affiliations":[{"raw_affiliation_string":"Allen Institute for AI, Seattle, WA, USA","institution_ids":["https://openalex.org/I4210140341"]}]},{"author_position":"last","author":{"id":null,"display_name":"Nazli Goharian","orcid":null},"institutions":[{"id":"https://openalex.org/I184565670","display_name":"Georgetown University","ror":"https://ror.org/05vzafd60","country_code":"US","type":"education","lineage":["https://openalex.org/I184565670"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Nazli Goharian","raw_affiliation_strings":["Georgetown University, Washington, DC, USA"],"affiliations":[{"raw_affiliation_string":"Georgetown University, Washington, DC, USA","institution_ids":["https://openalex.org/I184565670"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I184565670","https://openalex.org/I7882870"],"apc_list":null,"apc_paid":null,"fwci":7.0404,"has_fulltext":true,"cited_by_count":82,"citation_normalized_percentile":{"value":0.974371,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":94,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"2429","last_page":"2436"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9968000054359436,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9968000054359436,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11986","display_name":"Scientific Computing and Data Management","score":0.992900013923645,"subfield":{"id":"https://openalex.org/subfields/1802","display_name":"Information Systems and Management"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9926999807357788,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/python","display_name":"Python (programming language)","score":0.7789999842643738},{"id":"https://openalex.org/keywords/documentation","display_name":"Documentation","score":0.7282999753952026},{"id":"https://openalex.org/keywords/search-engine-indexing","display_name":"Search engine indexing","score":0.6359999775886536},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.5408999919891357},{"id":"https://openalex.org/keywords/the-internet","display_name":"The Internet","score":0.4758000075817108},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.447299987077713},{"id":"https://openalex.org/keywords/interface","display_name":"Interface (matter)","score":0.33739998936653137}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7986999750137329},{"id":"https://openalex.org/C519991488","wikidata":"https://www.wikidata.org/wiki/Q28865","display_name":"Python (programming language)","level":2,"score":0.7789999842643738},{"id":"https://openalex.org/C56666940","wikidata":"https://www.wikidata.org/wiki/Q788790","display_name":"Documentation","level":2,"score":0.7282999753952026},{"id":"https://openalex.org/C75165309","wikidata":"https://www.wikidata.org/wiki/Q2258979","display_name":"Search engine indexing","level":2,"score":0.6359999775886536},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.6161999702453613},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.5408999919891357},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.4839000105857849},{"id":"https://openalex.org/C110875604","wikidata":"https://www.wikidata.org/wiki/Q75","display_name":"The Internet","level":2,"score":0.4758000075817108},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.447299987077713},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.40560001134872437},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.4011000096797943},{"id":"https://openalex.org/C113843644","wikidata":"https://www.wikidata.org/wiki/Q901882","display_name":"Interface (matter)","level":4,"score":0.33739998936653137},{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.26969999074935913},{"id":"https://openalex.org/C2780977526","wikidata":"https://www.wikidata.org/wiki/Q42417149","display_name":"Data exploration","level":3,"score":0.265500009059906},{"id":"https://openalex.org/C89505385","wikidata":"https://www.wikidata.org/wiki/Q47146","display_name":"User interface","level":2,"score":0.26190000772476196},{"id":"https://openalex.org/C138958017","wikidata":"https://www.wikidata.org/wiki/Q190087","display_name":"Data type","level":2,"score":0.2540999948978424},{"id":"https://openalex.org/C180198813","wikidata":"https://www.wikidata.org/wiki/Q121182","display_name":"Information system","level":2,"score":0.2540999948978424},{"id":"https://openalex.org/C2780129039","wikidata":"https://www.wikidata.org/wiki/Q1931107","display_name":"Section (typography)","level":2,"score":0.25380000472068787},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.25209999084472656},{"id":"https://openalex.org/C99613125","wikidata":"https://www.wikidata.org/wiki/Q165194","display_name":"Application programming interface","level":2,"score":0.25130000710487366}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1145/3404835.3463254","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3404835.3463254","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 44th International ACM SIGIR Conference on Research and Development in Information Retrieval","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2103.02280","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2103.02280","pdf_url":"https://arxiv.org/pdf/2103.02280","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"pmh:oai:eprints.gla.ac.uk:240149","is_oa":true,"landing_page_url":"https://eprints.gla.ac.uk/view/author/60888.html>","pdf_url":"https://eprints.gla.ac.uk/240149/2/240149.pdf","source":{"id":"https://openalex.org/S4210235606","display_name":"ENLIGHTEN (Jurnal Bimbingan dan Konseling Islam)","issn_l":"2622-8912","issn":["2622-8912","2622-8920"],"is_oa":true,"is_in_doaj":true,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":"cc-by-sa","license_id":"https://openalex.org/licenses/cc-by-sa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"PeerReviewed"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2103.02280","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2103.02280","pdf_url":"https://arxiv.org/pdf/2103.02280","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320310358","display_name":"Achievement Rewards for College Scientists Foundation","ror":"https://ror.org/054awkm93"},{"id":"https://openalex.org/F4320317052","display_name":"Allen Institute for Artificial Intelligence","ror":"https://ror.org/05w520734"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":22,"referenced_works":["https://openalex.org/W2154884126","https://openalex.org/W2256784804","https://openalex.org/W2470818894","https://openalex.org/W2740321901","https://openalex.org/W2740492458","https://openalex.org/W2741544939","https://openalex.org/W2798329376","https://openalex.org/W2887107689","https://openalex.org/W2955732934","https://openalex.org/W2963339397","https://openalex.org/W3001344098","https://openalex.org/W3021244424","https://openalex.org/W3045745713","https://openalex.org/W3085914694","https://openalex.org/W3094297635","https://openalex.org/W3128581554","https://openalex.org/W3136473512","https://openalex.org/W3152887675","https://openalex.org/W3156817235","https://openalex.org/W6629418870","https://openalex.org/W6795224213","https://openalex.org/W6969259846"],"related_works":[],"abstract_inverted_index":{"Managing":[0],"the":[1,16,25,109,125,136],"data":[2,31,158],"for":[3,49,66,85,146],"Information":[4],"Retrieval":[5],"(IR)":[6],"experiments":[7],"can":[8,39],"be":[9,47],"challenging.":[10],"Dataset":[11],"documentation":[12,131],"is":[13,108],"scattered":[14],"across":[15],"Internet":[17],"and":[18,62,69,94,102,121,174,180],"once":[19],"one":[20],"obtains":[21],"a":[22,59,92,144],"copy":[23],"of":[24,113,132],"data,":[26],"there":[27],"are":[28],"numerous":[29,99],"different":[30],"formats":[32,38],"to":[33,46,98,166,176,178],"work":[34],"with.":[35],"Even":[36],"basic":[37],"have":[40],"subtle":[41],"dataset-specific":[42],"nuances":[43],"that":[44],"need":[45],"considered":[48],"proper":[50],"use.":[51],"To":[52,104],"help":[53],"mitigate":[54],"these":[55,133],"challenges,":[56],"we":[57],"introduce":[58],"new":[60],"robust":[61],"lightweight":[63],"tool":[64,89,112],"(ir_datasets)":[65],"acquiring,":[67],"managing,":[68],"performing":[70],"typical":[71],"operations":[72],"over":[73],"datasets":[74,83,101,134,149],"used":[75,84,150],"in":[76,151],"IR.":[77],"We":[78,128,170],"primarily":[79],"focus":[80],"on":[81,148],"textual":[82],"ad-hoc":[86],"search.":[87],"This":[88],"provides":[90,161],"both":[91],"Python":[93],"command":[95],"line":[96],"interface":[97],"IR":[100,119],"benchmarks.":[103],"our":[105],"knowledge,":[106],"this":[107,182],"most":[110],"extensive":[111],"its":[114],"kind.":[115],"Integrations":[116],"with":[117],"popular":[118],"indexing":[120],"experimentation":[122],"toolkits":[123],"demonstrate":[124],"tool's":[126],"utility.":[127],"also":[129],"provide":[130],"through":[135],"\\sys":[137],"catalog:":[138],"https://ir-datasets.com/.":[139],"The":[140],"catalog":[141],"acts":[142],"as":[143,162,164],"hub":[145],"information":[147,155],"IR,":[152],"providing":[153],"core":[154],"about":[156],"what":[157],"each":[159],"benchmark":[160],"well":[163],"links":[165],"more":[167],"detailed":[168],"information.":[169],"welcome":[171],"community":[172],"contributions":[173],"intend":[175],"continue":[177],"maintain":[179],"grow":[181],"tool.":[183]},"counts_by_year":[{"year":2026,"cited_by_count":4},{"year":2025,"cited_by_count":28},{"year":2024,"cited_by_count":19},{"year":2023,"cited_by_count":9},{"year":2022,"cited_by_count":20},{"year":2021,"cited_by_count":2}],"updated_date":"2026-04-02T15:55:50.835912","created_date":"2021-03-15T00:00:00"}
