{"id":"https://openalex.org/W7131403241","doi":"https://doi.org/10.48550/arxiv.2602.20336","title":"Natural Language Processing Models for Robust Document Categorization","display_name":"Natural Language Processing Models for Robust Document Categorization","publication_year":2026,"publication_date":"2026-02-23","ids":{"openalex":"https://openalex.org/W7131403241","doi":"https://doi.org/10.48550/arxiv.2602.20336"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.20336","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5126786618","display_name":"Radoslaw Roszczyk","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Roszczyk, Radoslaw","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126815300","display_name":"Pawel Tecza","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tecza, Pawel","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126806195","display_name":"Maciej Stodolski","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Stodolski, Maciej","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5126803934","display_name":"Krzysztof Siwek","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Siwek, Krzysztof","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5126786618"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11550","display_name":"Text and Document Classification Technologies","score":0.8529000282287598,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11550","display_name":"Text and Document Classification Technologies","score":0.8529000282287598,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11652","display_name":"Imbalanced Data Classification Techniques","score":0.03920000046491623,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.01269999984651804,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/categorization","display_name":"Categorization","score":0.6574000120162964},{"id":"https://openalex.org/keywords/naive-bayes-classifier","display_name":"Naive Bayes classifier","score":0.6384999752044678},{"id":"https://openalex.org/keywords/automation","display_name":"Automation","score":0.6043999791145325},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.5181999802589417},{"id":"https://openalex.org/keywords/machine-translation","display_name":"Machine translation","score":0.3776000142097473},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.37689998745918274},{"id":"https://openalex.org/keywords/class","display_name":"Class (philosophy)","score":0.3425999879837036}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7533000111579895},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.65829998254776},{"id":"https://openalex.org/C94124525","wikidata":"https://www.wikidata.org/wiki/Q912550","display_name":"Categorization","level":2,"score":0.6574000120162964},{"id":"https://openalex.org/C52001869","wikidata":"https://www.wikidata.org/wiki/Q812530","display_name":"Naive Bayes classifier","level":3,"score":0.6384999752044678},{"id":"https://openalex.org/C115901376","wikidata":"https://www.wikidata.org/wiki/Q184199","display_name":"Automation","level":2,"score":0.6043999791145325},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.5993000268936157},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.5181999802589417},{"id":"https://openalex.org/C203005215","wikidata":"https://www.wikidata.org/wiki/Q79798","display_name":"Machine translation","level":2,"score":0.3776000142097473},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.37689998745918274},{"id":"https://openalex.org/C2777212361","wikidata":"https://www.wikidata.org/wiki/Q5127848","display_name":"Class (philosophy)","level":2,"score":0.3425999879837036},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3384999930858612},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.31220000982284546},{"id":"https://openalex.org/C207201462","wikidata":"https://www.wikidata.org/wiki/Q182505","display_name":"Bayes' theorem","level":3,"score":0.3109000027179718},{"id":"https://openalex.org/C155092808","wikidata":"https://www.wikidata.org/wiki/Q182557","display_name":"Computational linguistics","level":2,"score":0.29319998621940613},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.29159998893737793},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.2840000092983246},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.2802000045776367},{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.2791999876499176},{"id":"https://openalex.org/C169258074","wikidata":"https://www.wikidata.org/wiki/Q245748","display_name":"Random forest","level":2,"score":0.26179999113082886}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.20336","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.20336","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.20336","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.20336","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.5750388503074646,"display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"This":[0],"article":[1],"presents":[2],"an":[3],"evaluation":[4],"of":[5,18,51,130,149,166,199],"several":[6],"machine":[7],"learning":[8],"methods":[9],"applied":[10],"to":[11,121,125,159],"automated":[12,164],"text":[13],"classification,":[14],"alongside":[15],"the":[16,81,123,128,134,147,181,186],"design":[17],"a":[19,38,56,60,65,101],"demonstrative":[20,155],"system":[21,156],"for":[22,185,193],"unbalanced":[23],"document":[24],"categorization":[25],"and":[26,64,93,113,196],"distribution.":[27],"The":[28,72,97,175],"study":[29,176],"focuses":[30],"on":[31,127],"balancing":[32],"classification":[33],"accuracy":[34,107],"with":[35,169],"computational":[36,95],"efficiency,":[37],"key":[39],"consideration":[40],"when":[41],"integrating":[42],"AI":[43],"into":[44],"real":[45],"world":[46],"automation":[47],"pipelines.":[48],"Three":[49],"models":[50],"varying":[52],"complexity":[53],"were":[54],"examined:":[55],"Naive":[57,118],"Bayes":[58,119],"classifier,":[59],"bidirectional":[61],"LSTM":[62],"network,":[63],"fine":[66],"tuned":[67],"transformer":[68,200],"based":[69],"BERT":[70,79],"model.":[71],"experiments":[73],"reveal":[74],"substantial":[75],"differences":[76],"in":[77,146],"performance.":[78],"achieved":[80],"highest":[82],"accuracy,":[83,136],"consistently":[84],"exceeding":[85],"99\\%,":[86],"but":[87],"required":[88],"significantly":[89],"longer":[90],"training":[91,111],"times":[92],"greater":[94],"resources.":[96],"BiLSTM":[98,179],"model":[99],"provided":[100],"strong":[102],"compromise,":[103],"reaching":[104],"approximately":[105],"98.56\\%":[106],"while":[108,189],"maintaining":[109],"moderate":[110],"costs":[112],"offering":[114],"robust":[115],"contextual":[116],"understanding.":[117],"proved":[120],"be":[122],"fastest":[124],"train,":[126],"order":[129],"milliseconds,":[131],"yet":[132],"delivered":[133],"lowest":[135],"averaging":[137],"around":[138],"94.5\\%.":[139],"Class":[140],"imbalance":[141],"influenced":[142],"all":[143],"methods,":[144],"particularly":[145],"recognition":[148],"minority":[150],"categories.":[151],"A":[152],"fully":[153],"functional":[154],"was":[157],"implemented":[158],"validate":[160],"practical":[161],"applicability,":[162],"enabling":[163],"routing":[165],"technical":[167],"requests":[168],"throughput":[170],"unattainable":[171],"through":[172],"manual":[173],"processing.":[174],"concludes":[177],"that":[178],"offers":[180],"most":[182],"balanced":[183],"solution":[184],"examined":[187],"scenario,":[188],"also":[190],"outlining":[191],"opportunities":[192],"future":[194],"improvements":[195],"further":[197],"exploration":[198],"architectures.":[201]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-02-26T00:00:00"}
