{"id":"https://openalex.org/W7139965095","doi":"https://doi.org/10.1016/j.procs.2026.01.005","title":"PAN-KK: A Language Resource for Plagiarism Detection in Low-Resource Kazakh","display_name":"PAN-KK: A Language Resource for Plagiarism Detection in Low-Resource Kazakh","publication_year":2026,"publication_date":"2026-01-01","ids":{"openalex":"https://openalex.org/W7139965095","doi":"https://doi.org/10.1016/j.procs.2026.01.005"},"language":"en","primary_location":{"id":"doi:10.1016/j.procs.2026.01.005","is_oa":true,"landing_page_url":"https://doi.org/10.1016/j.procs.2026.01.005","pdf_url":null,"source":{"id":"https://openalex.org/S120348307","display_name":"Procedia Computer Science","issn_l":"1877-0509","issn":["1877-0509"],"is_oa":true,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320990","host_organization_name":"Elsevier BV","host_organization_lineage":["https://openalex.org/P4310320990"],"host_organization_lineage_names":["Elsevier BV"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Procedia Computer Science","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://doi.org/10.1016/j.procs.2026.01.005","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5005951511","display_name":"Bakhyt Bakiyev","orcid":"https://orcid.org/0000-0002-7585-6626"},"institutions":[{"id":"https://openalex.org/I79619799","display_name":"University of Birmingham","ror":"https://ror.org/03angcq70","country_code":"GB","type":"education","lineage":["https://openalex.org/I79619799"]}],"countries":["GB"],"is_corresponding":true,"raw_author_name":"Bakhyt Bakiyev","raw_affiliation_strings":["School of Computer Science, University of Birmingham, Birmingham, UK"],"affiliations":[{"raw_affiliation_string":"School of Computer Science, University of Birmingham, Birmingham, UK","institution_ids":["https://openalex.org/I79619799"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130238141","display_name":"Venelin Kovatchev","orcid":null},"institutions":[{"id":"https://openalex.org/I79619799","display_name":"University of Birmingham","ror":"https://ror.org/03angcq70","country_code":"GB","type":"education","lineage":["https://openalex.org/I79619799"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Venelin Kovatchev","raw_affiliation_strings":["School of Computer Science, University of Birmingham, Birmingham, UK"],"affiliations":[{"raw_affiliation_string":"School of Computer Science, University of Birmingham, Birmingham, UK","institution_ids":["https://openalex.org/I79619799"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5122802966","display_name":"Mubashir Ali","orcid":null},"institutions":[{"id":"https://openalex.org/I79619799","display_name":"University of Birmingham","ror":"https://ror.org/03angcq70","country_code":"GB","type":"education","lineage":["https://openalex.org/I79619799"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Mubashir Ali","raw_affiliation_strings":["School of Computer Science, University of Birmingham, Birmingham, UK"],"affiliations":[{"raw_affiliation_string":"School of Computer Science, University of Birmingham, Birmingham, UK","institution_ids":["https://openalex.org/I79619799"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5005951511"],"corresponding_institution_ids":["https://openalex.org/I79619799"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.88527521,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"275","issue":null,"first_page":"28","last_page":"37"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12380","display_name":"Authorship Attribution and Profiling","score":0.5206999778747559,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12380","display_name":"Authorship Attribution and Profiling","score":0.5206999778747559,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11492","display_name":"Academic integrity and plagiarism","score":0.2770000100135803,"subfield":{"id":"https://openalex.org/subfields/3311","display_name":"Safety Research"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T13629","display_name":"Text Readability and Simplification","score":0.02710000053048134,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/kazakh","display_name":"Kazakh","score":0.8712999820709229},{"id":"https://openalex.org/keywords/resource","display_name":"Resource (disambiguation)","score":0.6811000108718872},{"id":"https://openalex.org/keywords/plagiarism-detection","display_name":"Plagiarism detection","score":0.5328999757766724},{"id":"https://openalex.org/keywords/word","display_name":"Word (group theory)","score":0.2694000005722046}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.9067000150680542},{"id":"https://openalex.org/C2781297163","wikidata":"https://www.wikidata.org/wiki/Q9252","display_name":"Kazakh","level":2,"score":0.8712999820709229},{"id":"https://openalex.org/C206345919","wikidata":"https://www.wikidata.org/wiki/Q20380951","display_name":"Resource (disambiguation)","level":2,"score":0.6811000108718872},{"id":"https://openalex.org/C2780907237","wikidata":"https://www.wikidata.org/wiki/Q2986238","display_name":"Plagiarism detection","level":2,"score":0.5328999757766724},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.4408999979496002},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.41589999198913574},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.36970001459121704},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.36910000443458557},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3472000062465668},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.27720001339912415},{"id":"https://openalex.org/C90805587","wikidata":"https://www.wikidata.org/wiki/Q10944557","display_name":"Word (group theory)","level":2,"score":0.2694000005722046}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1016/j.procs.2026.01.005","is_oa":true,"landing_page_url":"https://doi.org/10.1016/j.procs.2026.01.005","pdf_url":null,"source":{"id":"https://openalex.org/S120348307","display_name":"Procedia Computer Science","issn_l":"1877-0509","issn":["1877-0509"],"is_oa":true,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320990","host_organization_name":"Elsevier BV","host_organization_lineage":["https://openalex.org/P4310320990"],"host_organization_lineage_names":["Elsevier BV"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Procedia Computer Science","raw_type":"journal-article"},{"id":"pmh:oai:pure.atira.dk:publications/d75c427c-b634-4d35-adfe-b097c0edefca","is_oa":false,"landing_page_url":"https://research.birmingham.ac.uk/en/publications/d75c427c-b634-4d35-adfe-b097c0edefca","pdf_url":null,"source":{"id":"https://openalex.org/S4306402634","display_name":"University of Birmingham Research Portal (University of Birmingham)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I79619799","host_organization_name":"University of Birmingham","host_organization_lineage":["https://openalex.org/I79619799"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Bakiyev, B, Kovatchev, V & Ali, M 2026, 'PAN-KK : A Language Resource for Plagiarism Detection in Low-Resource Kazakh', Procedia Computer Science, vol. 275, 18, pp. 28-37. https://doi.org/10.1016/j.procs.2026.01.005","raw_type":"info:eu-repo/semantics/publishedVersion"},{"id":"pmh:oai:pure.atira.dk:openaire/d75c427c-b634-4d35-adfe-b097c0edefca","is_oa":true,"landing_page_url":"https://www.sciencedirect.com/science/article/pii/S1877050926000050","pdf_url":null,"source":{"id":"https://openalex.org/S4306402634","display_name":"University of Birmingham Research Portal (University of Birmingham)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I79619799","host_organization_name":"University of Birmingham","host_organization_lineage":["https://openalex.org/I79619799"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Bakiyev, B, Kovatchev, V & Ali, M 2026, 'PAN-KK : A Language Resource for Plagiarism Detection in Low-Resource Kazakh', Procedia Computer Science, vol. 275, 18, pp. 28-37. https://doi.org/10.1016/j.procs.2026.01.005","raw_type":"info:eu-repo/semantics/publishedVersion"}],"best_oa_location":{"id":"doi:10.1016/j.procs.2026.01.005","is_oa":true,"landing_page_url":"https://doi.org/10.1016/j.procs.2026.01.005","pdf_url":null,"source":{"id":"https://openalex.org/S120348307","display_name":"Procedia Computer Science","issn_l":"1877-0509","issn":["1877-0509"],"is_oa":true,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320990","host_organization_name":"Elsevier BV","host_organization_lineage":["https://openalex.org/P4310320990"],"host_organization_lineage_names":["Elsevier BV"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Procedia Computer Science","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":16,"referenced_works":["https://openalex.org/W1757621696","https://openalex.org/W2028776121","https://openalex.org/W2141766660","https://openalex.org/W2462305634","https://openalex.org/W2970641574","https://openalex.org/W2970752815","https://openalex.org/W2979826702","https://openalex.org/W3035390927","https://openalex.org/W4309876079","https://openalex.org/W4400680016","https://openalex.org/W4405112722","https://openalex.org/W4408040045","https://openalex.org/W4408209280","https://openalex.org/W4412945357","https://openalex.org/W7131824790","https://openalex.org/W7131904167"],"related_works":[],"abstract_inverted_index":{"To":[0],"the":[1,8,15,73,112,115,122,125,133,141,144],"best":[2],"of":[3,17,29,33,49,66,83,114,124,135,143,146],"our":[4],"knowledge,":[5],"we":[6,86],"introduce":[7],"first":[9],"dataset,":[10],"which":[11,95],"was":[12,25],"created":[13],"with":[14,99],"intention":[16],"extrinsic":[18],"plagiarism":[19,147],"detection":[20,148],"in":[21,36,40,91,149],"Kazakh.":[22,154],"The":[23],"PAN-KK":[24,71],"developed":[26],"through":[27],"translation":[28],"over":[30],"20,000":[31],"pairs":[32,107],"plagiarised":[34],"text":[35],"English":[37],"that":[38,108],"were":[39],"PAN":[41],"2010":[42],"and":[43,62,72,102,121,137],"it":[44],"includes":[45],"a":[46,64,92],"human-verified":[47],"subsample":[48],"2,000":[50],"samples,":[51],"each":[52],"being":[53],"scrutinised":[54],"by":[55],"10":[56],"native":[57],"Kazakh":[58],"speakers.":[59],"We":[60],"prepare":[61],"evaluate":[63],"range":[65],"different":[67],"transformer-based":[68],"models":[69],"using":[70],"most":[74],"successful":[75],"model":[76],"is":[77,117,127],"XLM-RoBERTa":[78,89],"Large.":[79],"As":[80],"an":[81],"example":[82],"practical":[84],"use,":[85],"next":[87],"use":[88],"Large":[90],"two-stage":[93],"pipeline":[94,139],"integrates":[96],"candidate":[97],"retrieval":[98],"semantic":[100],"verification":[101],"span":[103],"localisation.":[104],"In":[105,131],"1000":[106],"are":[109],"manually":[110],"annotated,":[111],"localisation":[113,123],"tokens":[116],"P=0.929,":[118],"R=0.843,":[119],"F1=0.868,":[120],"spans":[126],"P=0.718,":[128],"R=0.649,":[129],"F1=0.667.":[130],"general,":[132],"aid":[134],"PAN\u2013KK":[136],"its":[138],"facilitates":[140],"development":[142],"field":[145],"low-resource":[150],"languages":[151],"such":[152],"as":[153]},"counts_by_year":[],"updated_date":"2026-03-27T14:29:43.386196","created_date":"2026-03-21T00:00:00"}
