{"id":"https://openalex.org/W3088908336","doi":"https://doi.org/10.3233/faia200620","title":"Evaluating Sentence Segmentation and Word Tokenization Systems on Estonian Web Texts","display_name":"Evaluating Sentence Segmentation and Word Tokenization Systems on Estonian Web Texts","publication_year":2020,"publication_date":"2020-09-15","ids":{"openalex":"https://openalex.org/W3088908336","doi":"https://doi.org/10.3233/faia200620","mag":"3088908336"},"language":"en","primary_location":{"id":"doi:10.3233/faia200620","is_oa":true,"landing_page_url":"https://doi.org/10.3233/faia200620","pdf_url":"https://ebooks.iospress.nl/pdf/doi/10.3233/FAIA200620","source":{"id":"https://openalex.org/S4210201731","display_name":"Frontiers in artificial intelligence and applications","issn_l":"0922-6389","issn":["0922-6389","1879-8314"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":"cc-by-nc","license_id":"https://openalex.org/licenses/cc-by-nc","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Frontiers in Artificial Intelligence and Applications","raw_type":"book-chapter"},"type":"book-chapter","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://ebooks.iospress.nl/pdf/doi/10.3233/FAIA200620","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5035358986","display_name":"Kairit Sirts","orcid":"https://orcid.org/0000-0001-7388-2583"},"institutions":[{"id":"https://openalex.org/I56085075","display_name":"University of Tartu","ror":"https://ror.org/03z77qz90","country_code":"EE","type":"education","lineage":["https://openalex.org/I56085075"]}],"countries":["EE"],"is_corresponding":true,"raw_author_name":"Kairit Sirts","raw_affiliation_strings":["Institute of Computer Science, University of Tartu, Estonia"],"affiliations":[{"raw_affiliation_string":"Institute of Computer Science, University of Tartu, Estonia","institution_ids":["https://openalex.org/I56085075"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5053613876","display_name":"Kairit Peekman","orcid":null},"institutions":[{"id":"https://openalex.org/I56085075","display_name":"University of Tartu","ror":"https://ror.org/03z77qz90","country_code":"EE","type":"education","lineage":["https://openalex.org/I56085075"]}],"countries":["EE"],"is_corresponding":false,"raw_author_name":"Kairit Peekman","raw_affiliation_strings":["Institute of Computer Science, University of Tartu, Estonia"],"affiliations":[{"raw_affiliation_string":"Institute of Computer Science, University of Tartu, Estonia","institution_ids":["https://openalex.org/I56085075"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5035358986"],"corresponding_institution_ids":["https://openalex.org/I56085075"],"apc_list":null,"apc_paid":null,"fwci":2.1975,"has_fulltext":true,"cited_by_count":11,"citation_normalized_percentile":{"value":0.90305375,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":89,"max":98},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9991000294685364,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13629","display_name":"Text Readability and Simplification","score":0.994700014591217,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/sentence","display_name":"Sentence","score":0.7875609397888184},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7514276504516602},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.7064404487609863},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.6910764575004578},{"id":"https://openalex.org/keywords/estonian","display_name":"Estonian","score":0.6300058364868164},{"id":"https://openalex.org/keywords/word","display_name":"Word (group theory)","score":0.524834930896759},{"id":"https://openalex.org/keywords/lexical-analysis","display_name":"Lexical analysis","score":0.49453994631767273},{"id":"https://openalex.org/keywords/segmentation","display_name":"Segmentation","score":0.46463799476623535},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.3534218668937683},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.33227741718292236}],"concepts":[{"id":"https://openalex.org/C2777530160","wikidata":"https://www.wikidata.org/wiki/Q41796","display_name":"Sentence","level":2,"score":0.7875609397888184},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7514276504516602},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.7064404487609863},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6910764575004578},{"id":"https://openalex.org/C2776092919","wikidata":"https://www.wikidata.org/wiki/Q9072","display_name":"Estonian","level":2,"score":0.6300058364868164},{"id":"https://openalex.org/C90805587","wikidata":"https://www.wikidata.org/wiki/Q10944557","display_name":"Word (group theory)","level":2,"score":0.524834930896759},{"id":"https://openalex.org/C176982825","wikidata":"https://www.wikidata.org/wiki/Q835922","display_name":"Lexical analysis","level":2,"score":0.49453994631767273},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.46463799476623535},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.3534218668937683},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.33227741718292236},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.3233/faia200620","is_oa":true,"landing_page_url":"https://doi.org/10.3233/faia200620","pdf_url":"https://ebooks.iospress.nl/pdf/doi/10.3233/FAIA200620","source":{"id":"https://openalex.org/S4210201731","display_name":"Frontiers in artificial intelligence and applications","issn_l":"0922-6389","issn":["0922-6389","1879-8314"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":"cc-by-nc","license_id":"https://openalex.org/licenses/cc-by-nc","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Frontiers in Artificial Intelligence and Applications","raw_type":"book-chapter"},{"id":"pmh:oai:arXiv.org:2011.07868","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2011.07868","pdf_url":"https://arxiv.org/pdf/2011.07868","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"}],"best_oa_location":{"id":"doi:10.3233/faia200620","is_oa":true,"landing_page_url":"https://doi.org/10.3233/faia200620","pdf_url":"https://ebooks.iospress.nl/pdf/doi/10.3233/FAIA200620","source":{"id":"https://openalex.org/S4210201731","display_name":"Frontiers in artificial intelligence and applications","issn_l":"0922-6389","issn":["0922-6389","1879-8314"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":"cc-by-nc","license_id":"https://openalex.org/licenses/cc-by-nc","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Frontiers in Artificial Intelligence and Applications","raw_type":"book-chapter"},"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.5899999737739563}],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W3088908336.pdf","grobid_xml":"https://content.openalex.org/works/W3088908336.grobid-xml"},"referenced_works_count":13,"referenced_works":["https://openalex.org/W217017982","https://openalex.org/W1901600440","https://openalex.org/W2121764873","https://openalex.org/W2250978584","https://openalex.org/W2509343702","https://openalex.org/W2513522215","https://openalex.org/W2574640638","https://openalex.org/W2575742401","https://openalex.org/W2740840489","https://openalex.org/W2889106471","https://openalex.org/W3032382737","https://openalex.org/W3037109418","https://openalex.org/W6732432023"],"related_works":["https://openalex.org/W1555924552","https://openalex.org/W2483134411","https://openalex.org/W2970103878","https://openalex.org/W3092317133","https://openalex.org/W4379520044","https://openalex.org/W4237028562","https://openalex.org/W2102248890","https://openalex.org/W2275097958","https://openalex.org/W4316007230","https://openalex.org/W3113881915"],"abstract_inverted_index":{"Texts":[0],"obtained":[1,109],"from":[2],"web":[3,39,56],"are":[4],"noisy":[5],"and":[6,14,21,58,69,78,102],"do":[7],"not":[8,33],"necessarily":[9],"follow":[10],"the":[11,47,61,83,96,107,111],"orthographic":[12],"sentence":[13,19,51,67,91,97],"word":[15,22,70],"boundary":[16],"rules.":[17],"Thus,":[18],"segmentation":[20,68,92,98],"tokenization":[23,71],"systems":[24,72,89],"that":[25],"have":[26],"been":[27],"developed":[28],"on":[29,37,73,90,93,110],"well-formed":[30,113],"texts":[31],"might":[32],"perform":[34],"so":[35],"well":[36,105],"unedited":[38],"texts.":[40],"In":[41],"this":[42,74,94],"paper,":[43],"we":[44],"first":[45],"describe":[46],"manual":[48],"annotation":[49],"of":[50,53,64,100],"boundaries":[52],"an":[54],"Estonian":[55,114],"dataset":[57],"then":[59],"present":[60],"evaluation":[62],"results":[63,108],"three":[65],"existing":[66],"corpus:":[75],"EstNLTK,":[76],"Stanza":[77,101],"UDPipe.":[79],"While":[80],"EstNLTK":[81],"obtains":[82],"highest":[84],"performance":[85,99],"compared":[86],"to":[87],"other":[88],"dataset,":[95],"UDPipe":[103],"remains":[104],"below":[106],"more":[112],"UD":[115],"test":[116],"set.":[117]},"counts_by_year":[{"year":2024,"cited_by_count":3},{"year":2023,"cited_by_count":6},{"year":2022,"cited_by_count":1},{"year":2021,"cited_by_count":1}],"updated_date":"2026-03-20T23:20:44.827607","created_date":"2020-10-01T00:00:00"}
