{"id":"https://openalex.org/W7151404253","doi":"https://doi.org/10.48550/arxiv.2604.04168","title":"A Semi-Automated Annotation Workflow for Paediatric Histopathology Reports Using Small Language Models","display_name":"A Semi-Automated Annotation Workflow for Paediatric Histopathology Reports Using Small Language Models","publication_year":2026,"publication_date":"2026-04-05","ids":{"openalex":"https://openalex.org/W7151404253","doi":"https://doi.org/10.48550/arxiv.2604.04168"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.04168","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.04168","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.04168","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5133109063","display_name":"Avish Vijayaraghavan","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Vijayaraghavan, Avish","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5115913118","display_name":"Jaskaran Singh Kawatra","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kawatra, Jaskaran Singh","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5093676690","display_name":"Sebin Sabu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sabu, Sebin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133092652","display_name":"Jonny Sheldon","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sheldon, Jonny","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133121615","display_name":"Will Poulett","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Poulett, Will","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123904071","display_name":"Alex Eze","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Eze, Alex","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5025962709","display_name":"Daniel Key","orcid":"https://orcid.org/0000-0002-9559-1784"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Key, Daniel","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5038122440","display_name":"John Booth","orcid":"https://orcid.org/0000-0003-4357-1324"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Booth, John","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123990389","display_name":"Shiren Patel","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Patel, Shiren","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133123741","display_name":"Jonny Pearson","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pearson, Jonny","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5094350112","display_name":"Dan Schofield","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Schofield, Dan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133074735","display_name":"Jonathan Hope","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hope, Jonathan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133069740","display_name":"Pavithra Rajendran","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rajendran, Pavithra","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5123921271","display_name":"Neil Sebire","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sebire, Neil","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":14,"corresponding_author_ids":["https://openalex.org/A5133109063"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11710","display_name":"Biomedical Text Mining and Ontologies","score":0.4456999897956848,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},"topics":[{"id":"https://openalex.org/T11710","display_name":"Biomedical Text Mining and Ontologies","score":0.4456999897956848,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.2888999879360199,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11636","display_name":"Artificial Intelligence in Healthcare and Education","score":0.0494999997317791,"subfield":{"id":"https://openalex.org/subfields/2718","display_name":"Health Informatics"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/workflow","display_name":"Workflow","score":0.8396999835968018},{"id":"https://openalex.org/keywords/information-extraction","display_name":"Information extraction","score":0.6694999933242798},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5601999759674072},{"id":"https://openalex.org/keywords/protected-health-information","display_name":"Protected health information","score":0.5490000247955322},{"id":"https://openalex.org/keywords/annotation","display_name":"Annotation","score":0.48739999532699585},{"id":"https://openalex.org/keywords/domain","display_name":"Domain (mathematical analysis)","score":0.4643000066280365},{"id":"https://openalex.org/keywords/health-informatics","display_name":"Health informatics","score":0.4171999990940094},{"id":"https://openalex.org/keywords/data-extraction","display_name":"Data extraction","score":0.40549999475479126},{"id":"https://openalex.org/keywords/unstructured-data","display_name":"Unstructured data","score":0.38510000705718994}],"concepts":[{"id":"https://openalex.org/C177212765","wikidata":"https://www.wikidata.org/wiki/Q627335","display_name":"Workflow","level":2,"score":0.8396999835968018},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7524999976158142},{"id":"https://openalex.org/C195807954","wikidata":"https://www.wikidata.org/wiki/Q1662562","display_name":"Information extraction","level":2,"score":0.6694999933242798},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5601999759674072},{"id":"https://openalex.org/C133652896","wikidata":"https://www.wikidata.org/wiki/Q7251300","display_name":"Protected health information","level":5,"score":0.5490000247955322},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.4909000098705292},{"id":"https://openalex.org/C2776321320","wikidata":"https://www.wikidata.org/wiki/Q857525","display_name":"Annotation","level":2,"score":0.48739999532699585},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.4643000066280365},{"id":"https://openalex.org/C145642194","wikidata":"https://www.wikidata.org/wiki/Q870895","display_name":"Health informatics","level":3,"score":0.4171999990940094},{"id":"https://openalex.org/C2777466982","wikidata":"https://www.wikidata.org/wiki/Q5227287","display_name":"Data extraction","level":3,"score":0.40549999475479126},{"id":"https://openalex.org/C2781252014","wikidata":"https://www.wikidata.org/wiki/Q1141900","display_name":"Unstructured data","level":3,"score":0.38510000705718994},{"id":"https://openalex.org/C2779135771","wikidata":"https://www.wikidata.org/wiki/Q403574","display_name":"Named-entity recognition","level":3,"score":0.38019999861717224},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.3714999854564667},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.3693000078201294},{"id":"https://openalex.org/C2778012447","wikidata":"https://www.wikidata.org/wiki/Q1034415","display_name":"Scope (computer science)","level":2,"score":0.35339999198913574},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.349700003862381},{"id":"https://openalex.org/C91632574","wikidata":"https://www.wikidata.org/wiki/Q15088675","display_name":"Data curation","level":2,"score":0.34130001068115234},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3305000066757202},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.3240000009536743},{"id":"https://openalex.org/C534262118","wikidata":"https://www.wikidata.org/wiki/Q177719","display_name":"Medical diagnosis","level":2,"score":0.299699991941452},{"id":"https://openalex.org/C20136886","wikidata":"https://www.wikidata.org/wiki/Q749647","display_name":"Interoperability","level":2,"score":0.2948000133037567},{"id":"https://openalex.org/C105002631","wikidata":"https://www.wikidata.org/wiki/Q4833645","display_name":"Subject-matter expert","level":3,"score":0.29010000824928284},{"id":"https://openalex.org/C180198813","wikidata":"https://www.wikidata.org/wiki/Q121182","display_name":"Information system","level":2,"score":0.2831999957561493},{"id":"https://openalex.org/C133462117","wikidata":"https://www.wikidata.org/wiki/Q4929239","display_name":"Data collection","level":2,"score":0.2736999988555908},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.27239999175071716},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.2721000015735626},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.27079999446868896},{"id":"https://openalex.org/C115901376","wikidata":"https://www.wikidata.org/wiki/Q184199","display_name":"Automation","level":2,"score":0.26570001244544983},{"id":"https://openalex.org/C21338462","wikidata":"https://www.wikidata.org/wiki/Q1662581","display_name":"Information model","level":2,"score":0.265500009059906},{"id":"https://openalex.org/C69505689","wikidata":"https://www.wikidata.org/wiki/Q455338","display_name":"Unified Medical Language System","level":2,"score":0.25519999861717224},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.25099998712539673}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.04168","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.04168","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.04168","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.04168","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Industry, innovation and infrastructure","id":"https://metadata.un.org/sdg/9","score":0.5909237861633301}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Electronic":[0],"Patient":[1],"Record":[2],"(EPR)":[3],"systems":[4],"contain":[5],"valuable":[6],"clinical":[7,43,114,174,232],"information,":[8],"but":[9,32],"much":[10],"of":[11,126],"it":[12],"is":[13,243],"trapped":[14],"in":[15],"unstructured":[16,76],"text,":[17],"limiting":[18],"its":[19,100],"use":[20],"for":[21,99,173],"research":[22],"and":[23,40,104,158,195,208],"decision-making.":[24],"Large":[25],"language":[26,68],"models":[27,69,187],"can":[28,226],"extract":[29,72,227],"such":[30],"information":[31,74,141,229],"require":[33],"substantial":[34],"computational":[35],"resources":[36],"to":[37,45,71,91,170],"run":[38],"locally,":[39],"sending":[41],"sensitive":[42],"data":[44],"cloud-based":[46],"services,":[47],"even":[48],"when":[49,219],"deidentified,":[50],"raises":[51],"significant":[52],"patient":[53],"privacy":[54],"concerns.":[55],"In":[56],"this":[57],"study,":[58],"we":[59,87],"develop":[60,109],"a":[61,85,96,124,134,150,166],"resource-efficient":[62],"semi-automated":[63],"annotation":[64],"workflow":[65,90,111],"using":[66,144],"small":[67],"(SLMs)":[70],"structured":[73,228],"from":[75,123,230],"EPR":[77],"data,":[78],"focusing":[79],"on":[80,234],"paediatric":[81,92],"histopathology":[82],"reports.":[83],"As":[84],"proof-of-concept,":[86],"apply":[88],"the":[89,110,180,205],"renal":[93],"biopsy":[94],"reports,":[95],"domain":[97],"chosen":[98],"constrained":[101],"diagnostic":[102],"scope":[103],"well-defined":[105],"underlying":[106],"biology.":[107],"We":[108,146],"iteratively":[112],"with":[113,165,237],"oversight":[115],"across":[116],"three":[117],"meetings,":[118],"manually":[119],"annotating":[120],"400":[121],"reports":[122,172],"dataset":[125],"2,111":[127],"at":[128,183,245],"Great":[129],"Ormond":[130],"Street":[131],"Hospital":[132],"as":[133,149],"gold":[135],"standard,":[136],"while":[137],"developing":[138],"an":[139],"automated":[140],"extraction":[142,148],"approach":[143],"SLMs.":[145],"frame":[147],"Question-Answering":[151],"task":[152],"grounded":[153],"by":[154,202,211],"clinician-guided":[155],"entity":[156],"guidelines":[157,199],"few-shot":[159,209],"examples,":[160],"evaluating":[161],"five":[162],"instruction-tuned":[163],"SLMs":[164,225],"disagreement":[167],"modelling":[168],"framework":[169],"prioritise":[171],"review.":[175],"Gemma":[176],"2":[177],"2B":[178],"achieves":[179],"highest":[181],"accuracy":[182],"84.3%,":[184],"outperforming":[185],"off-the-shelf":[186],"including":[188],"spaCy":[189],"(74.3%),":[190],"BioBERT-SQuAD":[191],"(62.3%),":[192],"RoBERTa-SQuAD":[193],"(59.7%),":[194],"GLiNER":[196],"(60.2%).":[197],"Entity":[198],"improved":[200],"performance":[201],"7-19%":[203],"over":[204],"zero-shot":[206],"baseline,":[207],"examples":[210],"6-38%,":[212],"though":[213],"their":[214],"benefits":[215],"do":[216],"not":[217],"compound":[218],"combined.":[220],"These":[221],"results":[222],"demonstrate":[223],"that":[224],"specialised":[231],"domains":[233],"CPU-only":[235],"infrastructure":[236],"minimal":[238],"clinician":[239],"involvement.":[240],"Our":[241],"code":[242],"available":[244],"https://github.com/gosh-dre/nlp_renal_biopsy.":[246]},"counts_by_year":[],"updated_date":"2026-05-05T08:41:31.759640","created_date":"2026-04-08T00:00:00"}
