{"id":"https://openalex.org/W7147261298","doi":"https://doi.org/10.48550/arxiv.2603.29221","title":"SiPaKosa: A Comprehensive Corpus of Canonical and Classical Buddhist Texts in Sinhala and Pali","display_name":"SiPaKosa: A Comprehensive Corpus of Canonical and Classical Buddhist Texts in Sinhala and Pali","publication_year":2026,"publication_date":"2026-03-31","ids":{"openalex":"https://openalex.org/W7147261298","doi":"https://doi.org/10.48550/arxiv.2603.29221"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.29221","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.29221","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.29221","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5132558024","display_name":"Ranidu Gurusinghe","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Gurusinghe, Ranidu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5115552982","display_name":"Nevidu Jayatilleke","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jayatilleke, Nevidu","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5132558024"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11563","display_name":"Indian and Buddhist Studies","score":0.2513999938964844,"subfield":{"id":"https://openalex.org/subfields/1212","display_name":"Religious studies"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11563","display_name":"Indian and Buddhist Studies","score":0.2513999938964844,"subfield":{"id":"https://openalex.org/subfields/1212","display_name":"Religious studies"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.17139999568462372,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12377","display_name":"Digital Humanities and Scholarship","score":0.05079999938607216,"subfield":{"id":"https://openalex.org/subfields/1208","display_name":"Literature and Literary Theory"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/perplexity","display_name":"Perplexity","score":0.7177000045776367},{"id":"https://openalex.org/keywords/buddhism","display_name":"Buddhism","score":0.6190999746322632},{"id":"https://openalex.org/keywords/metadata","display_name":"Metadata","score":0.46639999747276306},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.423799991607666},{"id":"https://openalex.org/keywords/scholarship","display_name":"Scholarship","score":0.38690000772476196},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.36820000410079956}],"concepts":[{"id":"https://openalex.org/C100279451","wikidata":"https://www.wikidata.org/wiki/Q372193","display_name":"Perplexity","level":3,"score":0.7177000045776367},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6363999843597412},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.6265000104904175},{"id":"https://openalex.org/C75699723","wikidata":"https://www.wikidata.org/wiki/Q748","display_name":"Buddhism","level":2,"score":0.6190999746322632},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5501000285148621},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.5317999720573425},{"id":"https://openalex.org/C93518851","wikidata":"https://www.wikidata.org/wiki/Q180160","display_name":"Metadata","level":2,"score":0.46639999747276306},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.423799991607666},{"id":"https://openalex.org/C2778061430","wikidata":"https://www.wikidata.org/wiki/Q188823","display_name":"Scholarship","level":2,"score":0.38690000772476196},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.36820000410079956},{"id":"https://openalex.org/C2777231075","wikidata":"https://www.wikidata.org/wiki/Q178715","display_name":"Rhyme","level":3,"score":0.33820000290870667},{"id":"https://openalex.org/C2776321320","wikidata":"https://www.wikidata.org/wiki/Q857525","display_name":"Annotation","level":2,"score":0.3301999866962433},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.31940001249313354},{"id":"https://openalex.org/C158154518","wikidata":"https://www.wikidata.org/wiki/Q7310970","display_name":"Relevance (law)","level":2,"score":0.3163999915122986},{"id":"https://openalex.org/C532629269","wikidata":"https://www.wikidata.org/wiki/Q865083","display_name":"Corpus linguistics","level":2,"score":0.3156000077724457},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.30070000886917114},{"id":"https://openalex.org/C95457728","wikidata":"https://www.wikidata.org/wiki/Q309","display_name":"History","level":0,"score":0.2915000021457672},{"id":"https://openalex.org/C2474386","wikidata":"https://www.wikidata.org/wiki/Q461183","display_name":"Text corpus","level":2,"score":0.2711000144481659},{"id":"https://openalex.org/C124952713","wikidata":"https://www.wikidata.org/wiki/Q8242","display_name":"Literature","level":1,"score":0.2614000141620636}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.29221","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.29221","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.29221","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.29221","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Sustainable cities and communities","score":0.5543058514595032,"id":"https://metadata.un.org/sdg/11"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"SiPaKosa":[0],"is":[1,63],"a":[2],"comprehensive":[3],"corpus":[4,32,62,112],"of":[5,50,76,106,116,129],"Sinhala":[6,68,138],"and":[7,15,58,69,124],"Pali":[8],"doctrinal":[9],"texts":[10],"comprising":[11],"approximately":[12],"786K":[13],"sentences":[14],"9.25M":[16],"words,":[17],"incorporating":[18],"16":[19],"copyright-cleared":[20],"historical":[21,43,121],"Buddhist":[22,134],"documents":[23],"alongside":[24],"the":[25,74,114,127],"complete":[26],"web-scraped":[27],"Tripitaka":[28],"canonical":[29,51],"texts.":[30],"The":[31,61],"was":[33],"created":[34],"through":[35],"high-quality":[36],"OCR":[37],"using":[38,79],"Google":[39],"Document":[40],"AI":[41],"on":[42,91],"manuscripts,":[44],"combined":[45],"with":[46,83],"systematic":[47],"web":[48],"scraping":[49],"repositories,":[52],"followed":[53],"by":[54,104],"rigorous":[55],"quality":[56],"control":[57],"metadata":[59],"annotation.":[60],"organised":[64],"into":[65],"language-specific":[66],"subcorpora:":[67],"Mixed":[70],"Sinhala-Pali.":[71],"We":[72],"evaluate":[73],"performance":[75],"language":[77,118,122],"models":[78,99],"ten":[80],"pretrained":[81],"models,":[82,119],"perplexity":[84],"scores":[85],"ranging":[86],"from":[87],"1.09":[88],"to":[89,108],"189.67":[90],"our":[92],"corpus.":[93],"This":[94,111],"analysis":[95],"shows":[96],"that":[97],"proprietary":[98],"significantly":[100],"outperform":[101],"open-source":[102],"alternatives":[103],"factors":[105],"three":[107],"six":[109],"times.":[110],"supports":[113],"pretraining":[115],"domain-adapted":[117],"facilitates":[120],"analysis,":[123],"aids":[125],"in":[126],"development":[128],"information":[130],"retrieval":[131],"systems":[132],"for":[133],"scholarship":[135],"while":[136],"preserving":[137],"cultural":[139],"heritage.":[140]},"counts_by_year":[],"updated_date":"2026-04-02T13:53:19.096889","created_date":"2026-04-02T00:00:00"}
