{"id":"https://openalex.org/W7139961056","doi":"https://doi.org/10.1016/j.procs.2026.01.056","title":"SciMDIX: A dataset for aspect extraction from multi-domain scientific documents in Kazakh and Russian","display_name":"SciMDIX: A dataset for aspect extraction from multi-domain scientific documents in Kazakh and Russian","publication_year":2026,"publication_date":"2026-01-01","ids":{"openalex":"https://openalex.org/W7139961056","doi":"https://doi.org/10.1016/j.procs.2026.01.056"},"language":"en","primary_location":{"id":"doi:10.1016/j.procs.2026.01.056","is_oa":true,"landing_page_url":"https://doi.org/10.1016/j.procs.2026.01.056","pdf_url":null,"source":{"id":"https://openalex.org/S120348307","display_name":"Procedia Computer Science","issn_l":"1877-0509","issn":["1877-0509"],"is_oa":true,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320990","host_organization_name":"Elsevier BV","host_organization_lineage":["https://openalex.org/P4310320990"],"host_organization_lineage_names":["Elsevier BV"],"type":"journal"},"license":"cc-by-nc","license_id":"https://openalex.org/licenses/cc-by-nc","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Procedia Computer Science","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://doi.org/10.1016/j.procs.2026.01.056","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5047223617","display_name":"Nikita Shvarts","orcid":null},"institutions":[{"id":"https://openalex.org/I4210135055","display_name":"Institute of Information and Computational Technologies","ror":"https://ror.org/03v6e0k54","country_code":"KZ","type":"facility","lineage":["https://openalex.org/I4210135055"]},{"id":"https://openalex.org/I4210152232","display_name":"Institute of Informatics of the Slovak Academy of Sciences","ror":"https://ror.org/04jgqpc26","country_code":"SK","type":"facility","lineage":["https://openalex.org/I207624831","https://openalex.org/I4210152232"]}],"countries":["KZ","SK"],"is_corresponding":false,"raw_author_name":"Nikita Shvarts","raw_affiliation_strings":["A.P. Ershov Institute of Informatics Systems, Novosibirsk 630090, Russia","Institute of Information and Computational Technologies, Almaty 050010, Kazakhstan"],"affiliations":[{"raw_affiliation_string":"A.P. Ershov Institute of Informatics Systems, Novosibirsk 630090, Russia","institution_ids":["https://openalex.org/I4210152232"]},{"raw_affiliation_string":"Institute of Information and Computational Technologies, Almaty 050010, Kazakhstan","institution_ids":["https://openalex.org/I4210135055"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130238112","display_name":"Tatiana Batura","orcid":null},"institutions":[{"id":"https://openalex.org/I4210135055","display_name":"Institute of Information and Computational Technologies","ror":"https://ror.org/03v6e0k54","country_code":"KZ","type":"facility","lineage":["https://openalex.org/I4210135055"]},{"id":"https://openalex.org/I4210152232","display_name":"Institute of Informatics of the Slovak Academy of Sciences","ror":"https://ror.org/04jgqpc26","country_code":"SK","type":"facility","lineage":["https://openalex.org/I207624831","https://openalex.org/I4210152232"]}],"countries":["KZ","SK"],"is_corresponding":true,"raw_author_name":"Tatiana Batura","raw_affiliation_strings":["A.P. Ershov Institute of Informatics Systems, Novosibirsk 630090, Russia","Institute of Information and Computational Technologies, Almaty 050010, Kazakhstan"],"affiliations":[{"raw_affiliation_string":"A.P. Ershov Institute of Informatics Systems, Novosibirsk 630090, Russia","institution_ids":["https://openalex.org/I4210152232"]},{"raw_affiliation_string":"Institute of Information and Computational Technologies, Almaty 050010, Kazakhstan","institution_ids":["https://openalex.org/I4210135055"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130245652","display_name":"Nurzhan Mukazhanov","orcid":null},"institutions":[{"id":"https://openalex.org/I3130577743","display_name":"Narxoz University","ror":"https://ror.org/0523w7v09","country_code":"KZ","type":"education","lineage":["https://openalex.org/I3130577743"]},{"id":"https://openalex.org/I4210135055","display_name":"Institute of Information and Computational Technologies","ror":"https://ror.org/03v6e0k54","country_code":"KZ","type":"facility","lineage":["https://openalex.org/I4210135055"]}],"countries":["KZ"],"is_corresponding":false,"raw_author_name":"Nurzhan Mukazhanov","raw_affiliation_strings":["Institute of Information and Computational Technologies, Almaty 050010, Kazakhstan","Narxoz University, Almaty 050057, Kazakhstan"],"affiliations":[{"raw_affiliation_string":"Institute of Information and Computational Technologies, Almaty 050010, Kazakhstan","institution_ids":["https://openalex.org/I4210135055"]},{"raw_affiliation_string":"Narxoz University, Almaty 050057, Kazakhstan","institution_ids":["https://openalex.org/I3130577743"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5017370698","display_name":"Aigerim Yerimbetova","orcid":"https://orcid.org/0000-0002-2013-1513"},"institutions":[{"id":"https://openalex.org/I4210114545","display_name":"Almaty Technological University","ror":"https://ror.org/01xeb1c73","country_code":"KZ","type":"education","lineage":["https://openalex.org/I4210114545"]},{"id":"https://openalex.org/I4210135055","display_name":"Institute of Information and Computational Technologies","ror":"https://ror.org/03v6e0k54","country_code":"KZ","type":"facility","lineage":["https://openalex.org/I4210135055"]}],"countries":["KZ"],"is_corresponding":false,"raw_author_name":"Aigerim Yerimbetova","raw_affiliation_strings":["Eurasian Technological University, Almaty 050012, Kazakhstan","Institute of Information and Computational Technologies, Almaty 050010, Kazakhstan"],"affiliations":[{"raw_affiliation_string":"Eurasian Technological University, Almaty 050012, Kazakhstan","institution_ids":["https://openalex.org/I4210114545"]},{"raw_affiliation_string":"Institute of Information and Computational Technologies, Almaty 050010, Kazakhstan","institution_ids":["https://openalex.org/I4210135055"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5046228742","display_name":"Mussa Turdalyuly","orcid":"https://orcid.org/0000-0002-1470-3706"},"institutions":[{"id":"https://openalex.org/I4210114545","display_name":"Almaty Technological University","ror":"https://ror.org/01xeb1c73","country_code":"KZ","type":"education","lineage":["https://openalex.org/I4210114545"]},{"id":"https://openalex.org/I4210135055","display_name":"Institute of Information and Computational Technologies","ror":"https://ror.org/03v6e0k54","country_code":"KZ","type":"facility","lineage":["https://openalex.org/I4210135055"]}],"countries":["KZ"],"is_corresponding":false,"raw_author_name":"Mussa Turdalyuly","raw_affiliation_strings":["Eurasian Technological University, Almaty 050012, Kazakhstan","Institute of Information and Computational Technologies, Almaty 050010, Kazakhstan"],"affiliations":[{"raw_affiliation_string":"Eurasian Technological University, Almaty 050012, Kazakhstan","institution_ids":["https://openalex.org/I4210114545"]},{"raw_affiliation_string":"Institute of Information and Computational Technologies, Almaty 050010, Kazakhstan","institution_ids":["https://openalex.org/I4210135055"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5046791061","display_name":"Bakzhan Sakenov","orcid":"https://orcid.org/0000-0002-9849-6176"},"institutions":[{"id":"https://openalex.org/I4210135055","display_name":"Institute of Information and Computational Technologies","ror":"https://ror.org/03v6e0k54","country_code":"KZ","type":"facility","lineage":["https://openalex.org/I4210135055"]}],"countries":["KZ"],"is_corresponding":false,"raw_author_name":"Bakzhan Sakenov","raw_affiliation_strings":["Institute of Information and Computational Technologies, Almaty 050010, Kazakhstan"],"affiliations":[{"raw_affiliation_string":"Institute of Information and Computational Technologies, Almaty 050010, Kazakhstan","institution_ids":["https://openalex.org/I4210135055"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5130238112"],"corresponding_institution_ids":["https://openalex.org/I4210135055","https://openalex.org/I4210152232"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.93316763,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":null,"biblio":{"volume":"275","issue":null,"first_page":"474","last_page":"483"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11710","display_name":"Biomedical Text Mining and Ontologies","score":0.39730000495910645,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},"topics":[{"id":"https://openalex.org/T11710","display_name":"Biomedical Text Mining and Ontologies","score":0.39730000495910645,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T13083","display_name":"Advanced Text Analysis Techniques","score":0.09399999678134918,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13523","display_name":"Mathematics, Computing, and Information Processing","score":0.05209999904036522,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/kazakh","display_name":"Kazakh","score":0.9491000175476074},{"id":"https://openalex.org/keywords/information-extraction","display_name":"Information extraction","score":0.4796000123023987},{"id":"https://openalex.org/keywords/extraction","display_name":"Extraction (chemistry)","score":0.44279998540878296},{"id":"https://openalex.org/keywords/data-extraction","display_name":"Data extraction","score":0.298799991607666}],"concepts":[{"id":"https://openalex.org/C2781297163","wikidata":"https://www.wikidata.org/wiki/Q9252","display_name":"Kazakh","level":2,"score":0.9491000175476074},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8934999704360962},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.5633000135421753},{"id":"https://openalex.org/C195807954","wikidata":"https://www.wikidata.org/wiki/Q1662562","display_name":"Information extraction","level":2,"score":0.4796000123023987},{"id":"https://openalex.org/C4725764","wikidata":"https://www.wikidata.org/wiki/Q844704","display_name":"Extraction (chemistry)","level":2,"score":0.44279998540878296},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3562999963760376},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3547999858856201},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.34139999747276306},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.31769999861717224},{"id":"https://openalex.org/C2777466982","wikidata":"https://www.wikidata.org/wiki/Q5227287","display_name":"Data extraction","level":3,"score":0.298799991607666},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.26649999618530273}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1016/j.procs.2026.01.056","is_oa":true,"landing_page_url":"https://doi.org/10.1016/j.procs.2026.01.056","pdf_url":null,"source":{"id":"https://openalex.org/S120348307","display_name":"Procedia Computer Science","issn_l":"1877-0509","issn":["1877-0509"],"is_oa":true,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320990","host_organization_name":"Elsevier BV","host_organization_lineage":["https://openalex.org/P4310320990"],"host_organization_lineage_names":["Elsevier BV"],"type":"journal"},"license":"cc-by-nc","license_id":"https://openalex.org/licenses/cc-by-nc","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Procedia Computer Science","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1016/j.procs.2026.01.056","is_oa":true,"landing_page_url":"https://doi.org/10.1016/j.procs.2026.01.056","pdf_url":null,"source":{"id":"https://openalex.org/S120348307","display_name":"Procedia Computer Science","issn_l":"1877-0509","issn":["1877-0509"],"is_oa":true,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320990","host_organization_name":"Elsevier BV","host_organization_lineage":["https://openalex.org/P4310320990"],"host_organization_lineage_names":["Elsevier BV"],"type":"journal"},"license":"cc-by-nc","license_id":"https://openalex.org/licenses/cc-by-nc","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Procedia Computer Science","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":11,"referenced_works":["https://openalex.org/W2962903510","https://openalex.org/W2963718112","https://openalex.org/W2963793519","https://openalex.org/W4226116726","https://openalex.org/W4389099428","https://openalex.org/W4391836235","https://openalex.org/W4400002812","https://openalex.org/W4402601975","https://openalex.org/W4408326925","https://openalex.org/W4413331108","https://openalex.org/W4413405337"],"related_works":[],"abstract_inverted_index":{"The":[0,37,88,108,123,163,205],"objective":[1],"of":[2,45,61,129,169,192,197,225],"aspect":[3,25,82,175],"extraction":[4,26,83,176],"is":[5,58],"to":[6,112,200],"identify":[7],"the":[8,127,167,190,195],"key":[9],"informational":[10],"elements":[11,110],"in":[12,27,74,84,94,172,182],"a":[13,41,156],"text.":[14],"Although":[15],"aspect-based":[16,53],"sentiment":[17],"analysis":[18],"(ABSA)":[19],"has":[20,34],"extensively":[21],"explored":[22],"this":[23],"field,":[24],"scientific":[28,49,133,226],"texts":[29],"remains":[30],"an":[31,59],"area":[32],"that":[33],"been":[35],"underexplored.":[36],"present":[38,124],"paper":[39],"introduces":[40],"new":[42,157],"multi-domain":[43],"corpus":[44,90],"Russian":[46,95,178],"and":[47,68,77,96,101,121,138,140,155,179,194,208,214,229],"Kazakh":[48],"texts,":[50],"annotated":[51,98],"for":[52,64,81],"information":[54,231],"extraction.":[55,70],"This":[56],"dataset":[57,209],"expansion":[60],"existing":[62],"resources":[63],"named":[65],"entity":[66],"recognition":[67],"relation":[69],"It":[71],"facilitates":[72],"research":[73,186],"cross-lingual":[75],"transfer":[76],"establishes":[78],"initial":[79],"benchmarks":[80],"low-resource":[85,183],"linguistic":[86],"contexts.":[87],"presented":[89],"includes":[91],"412":[92],"abstracts":[93],"Kazakh,":[97,180],"with":[99],"2,129":[100],"2,027":[102],"aspects":[103,130],"respectively":[104],"across":[105,131],"seven":[106],"categories:":[107],"following":[109],"are":[111,210],"be":[113],"considered:":[114],"AIM,":[115],"METHOD,":[116],"MATERIAL,":[117],"TASK,":[118],"TOOL,":[119],"RESULT,":[120],"USAGE.":[122],"study":[125],"analyses":[126],"distribution":[128],"four":[132],"domains":[134],"(IT,":[135],"linguistics,":[136],"medicine,":[137],"psychology)":[139],"conducts":[141],"experiments":[142],"using":[143],"multiple":[144],"methodological":[145],"classes,":[146],"including":[147],"classical":[148],"deep":[149],"learning,":[150],"contextual":[151],"Transformer":[152],"encoder":[153],"(mBERT),":[154],"multilingual":[158,170],"XLM-RoBERTa":[159],"+":[160],"CRF":[161],"architecture.":[162],"experimental":[164],"results":[165],"demonstrate":[166],"efficacy":[168],"models":[171,207],"performing":[173],"zero-shot":[174],"between":[177],"even":[181],"conditions.":[184],"Future":[185],"will":[187],"focus":[188],"on":[189],"optimisation":[191],"tokenisation":[193],"exploration":[196],"semi-supervised":[198],"approaches":[199],"further":[201],"enhance":[202],"model":[203],"performance.":[204],"resulting":[206],"available":[211],"at":[212],"https://github.com/nikitashvarts/scimdix_aspect_extraction":[213],"can":[215],"support":[216],"downstream":[217],"applications":[218],"such":[219],"as":[220],"automatic":[221],"metadata":[222],"generation,":[223],"construction":[224],"knowledge":[227],"graphs,":[228],"domain-specific":[230],"retrieval.":[232]},"counts_by_year":[],"updated_date":"2026-03-22T06:25:25.174409","created_date":"2026-03-21T00:00:00"}
