{"id":"https://openalex.org/W4393695576","doi":"https://doi.org/10.5281/zenodo.3372485","title":"Webis Wikipedia Text Reuse Corpus 2018 (Webis-Wikipedia-Text-Reuse-18)","display_name":"Webis Wikipedia Text Reuse Corpus 2018 (Webis-Wikipedia-Text-Reuse-18)","publication_year":2018,"publication_date":"2018-07-05","ids":{"openalex":"https://openalex.org/W4393695576","doi":"https://doi.org/10.5281/zenodo.3372485"},"language":"en","primary_location":{"id":"pmh:oai:zenodo.org:3372485","is_oa":true,"landing_page_url":"https://zenodo.org/record/3372485","pdf_url":null,"source":null,"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"info:eu-repo/semantics/other"},"type":"dataset","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://zenodo.org/record/3372485","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5025513989","display_name":"Milad Alshomary","orcid":"https://orcid.org/0000-0001-6142-9124"},"institutions":[{"id":"https://openalex.org/I206945453","display_name":"Paderborn University","ror":"https://ror.org/058kzsd48","country_code":"DE","type":"education","lineage":["https://openalex.org/I206945453"]}],"countries":["DE"],"is_corresponding":true,"raw_author_name":"Alshomary, Milad","raw_affiliation_strings":["Universit\u00e4t Paderborn"],"affiliations":[{"raw_affiliation_string":"Universit\u00e4t Paderborn","institution_ids":["https://openalex.org/I206945453"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5037682495","display_name":"Michael V\u00f6lske","orcid":"https://orcid.org/0000-0002-9283-6846"},"institutions":[{"id":"https://openalex.org/I51441396","display_name":"Bauhaus-Universit\u00e4t Weimar","ror":"https://ror.org/033bb5z47","country_code":"DE","type":"education","lineage":["https://openalex.org/I51441396"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"V\u00f6lske, Michael","raw_affiliation_strings":["Bauhaus-Universit\u00e4t Weimar"],"affiliations":[{"raw_affiliation_string":"Bauhaus-Universit\u00e4t Weimar","institution_ids":["https://openalex.org/I51441396"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5014375244","display_name":"Henning Wachsmuth","orcid":"https://orcid.org/0000-0003-2792-621X"},"institutions":[{"id":"https://openalex.org/I206945453","display_name":"Paderborn University","ror":"https://ror.org/058kzsd48","country_code":"DE","type":"education","lineage":["https://openalex.org/I206945453"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Wachsmuth, Henning","raw_affiliation_strings":["Universit\u00e4t Paderborn"],"affiliations":[{"raw_affiliation_string":"Universit\u00e4t Paderborn","institution_ids":["https://openalex.org/I206945453"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5027915931","display_name":"Benno Stein","orcid":"https://orcid.org/0000-0001-9033-2217"},"institutions":[{"id":"https://openalex.org/I51441396","display_name":"Bauhaus-Universit\u00e4t Weimar","ror":"https://ror.org/033bb5z47","country_code":"DE","type":"education","lineage":["https://openalex.org/I51441396"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Stein, Benno","raw_affiliation_strings":["Bauhaus-Universit\u00e4t Weimar"],"affiliations":[{"raw_affiliation_string":"Bauhaus-Universit\u00e4t Weimar","institution_ids":["https://openalex.org/I51441396"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5014322854","display_name":"Matthias Hagen","orcid":"https://orcid.org/0000-0002-9733-2890"},"institutions":[{"id":"https://openalex.org/I68956291","display_name":"Martin Luther University Halle-Wittenberg","ror":"https://ror.org/05gqaka33","country_code":"DE","type":"education","lineage":["https://openalex.org/I68956291"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Hagen, Matthias","raw_affiliation_strings":["Martin-Luther-Universit\u00e4t Halle-Wittenberg"],"affiliations":[{"raw_affiliation_string":"Martin-Luther-Universit\u00e4t Halle-Wittenberg","institution_ids":["https://openalex.org/I68956291"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5083712311","display_name":"Martin Potthast","orcid":"https://orcid.org/0000-0003-2451-0665"},"institutions":[{"id":"https://openalex.org/I926574661","display_name":"Leipzig University","ror":"https://ror.org/03s7gtk40","country_code":"DE","type":"education","lineage":["https://openalex.org/I926574661"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Potthast, Martin","raw_affiliation_strings":["Universit\u00e4t Leipzig"],"affiliations":[{"raw_affiliation_string":"Universit\u00e4t Leipzig","institution_ids":["https://openalex.org/I926574661"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5025513989"],"corresponding_institution_ids":["https://openalex.org/I206945453"],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12478","display_name":"Wikis in Education and Collaboration","score":0.9693999886512756,"subfield":{"id":"https://openalex.org/subfields/3315","display_name":"Communication"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T12478","display_name":"Wikis in Education and Collaboration","score":0.9693999886512756,"subfield":{"id":"https://openalex.org/subfields/3315","display_name":"Communication"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/reuse","display_name":"Reuse","score":0.8632157444953918},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6692189574241638},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.6464004516601562},{"id":"https://openalex.org/keywords/world-wide-web","display_name":"World Wide Web","score":0.5993618965148926},{"id":"https://openalex.org/keywords/biology","display_name":"Biology","score":0.12772518396377563},{"id":"https://openalex.org/keywords/ecology","display_name":"Ecology","score":0.04881402850151062}],"concepts":[{"id":"https://openalex.org/C206588197","wikidata":"https://www.wikidata.org/wiki/Q846574","display_name":"Reuse","level":2,"score":0.8632157444953918},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6692189574241638},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.6464004516601562},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.5993618965148926},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.12772518396377563},{"id":"https://openalex.org/C18903297","wikidata":"https://www.wikidata.org/wiki/Q7150","display_name":"Ecology","level":1,"score":0.04881402850151062}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:zenodo.org:3372485","is_oa":true,"landing_page_url":"https://zenodo.org/record/3372485","pdf_url":null,"source":null,"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"info:eu-repo/semantics/other"},{"id":"doi:10.5281/zenodo.3372485","is_oa":true,"landing_page_url":"https://doi.org/10.5281/zenodo.3372485","pdf_url":null,"source":{"id":"https://openalex.org/S4306400562","display_name":"Zenodo (CERN European Organization for Nuclear Research)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I67311998","host_organization_name":"European Organization for Nuclear Research","host_organization_lineage":["https://openalex.org/I67311998"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"dataset"}],"best_oa_location":{"id":"pmh:oai:zenodo.org:3372485","is_oa":true,"landing_page_url":"https://zenodo.org/record/3372485","pdf_url":null,"source":null,"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"info:eu-repo/semantics/other"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W2384475851","https://openalex.org/W2000444236","https://openalex.org/W2358668433","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W2353602216","https://openalex.org/W2382290278","https://openalex.org/W2478288626"],"abstract_inverted_index":{"The":[0,26,101],"Wikipedia":[1,14,18,36,123,139],"Text":[2],"Reuse":[3],"Corpus":[4],"2018":[5,112],"(Webis-Wikipedia-Text-Reuse-18)":[6],"containing":[7],"text":[8,52,79,118,131],"reuse":[9,53,80,119,132],"cases":[10],"extracted":[11,104],"from":[12],"within":[13],"and":[15,19,45,136,140],"in":[16,105],"between":[17],"a":[20,35,39,51,56,78,83],"sample":[21],"of":[22,42,59,86],"the":[23,106,117,141],"Common":[24],"Crawl.":[25],"corpus":[27],"has":[28],"following":[29],"structure:":[30],"wikipedia.tar.gz:":[31],"Each":[32,48,75],"line,":[33,49,76],"representing":[34,50,77],"article,":[37],"contains":[38,55,82],"json":[40,57,84],"array":[41,58,85],"article_id,":[43],"article_title,":[44],"article_body":[46],"within-wikipedia-tr-01.gz:":[47],"case,":[54,81],"s_id":[60,87],"(source":[61,69,88,96],"article":[62,66,89,93],"id),":[63,67,90,94],"t_id":[64,91],"(target":[65,72,92,99],"s_text":[68,95],"text),":[70,97],"t_text":[71,98],"text)":[73,100],"within-wikipedia-tr-02.gz:":[74],"datasets":[102],"were":[103],"work":[107],"by":[108],"Alshomary":[109],"et":[110],"al.":[111],"that":[113],"aimed":[114],"to":[115,122],"study":[116],"phenomena":[120],"related":[121],"at":[124],"scale.":[125],"A":[126],"pipeline":[127],"for":[128],"large":[129],"scale":[130],"extraction":[133],"was":[134],"developed":[135],"used":[137],"on":[138],"CommonCrawl.":[142]},"counts_by_year":[],"updated_date":"2026-02-05T00:54:17.221276","created_date":"2025-10-10T00:00:00"}
