{"id":"https://openalex.org/W7134895536","doi":"https://doi.org/10.48550/arxiv.2603.08989","title":"Automated Thematic Analysis for Clinical Qualitative Data: Iterative Codebook Refinement with Full Provenance","display_name":"Automated Thematic Analysis for Clinical Qualitative Data: Iterative Codebook Refinement with Full Provenance","publication_year":2026,"publication_date":"2026-03-09","ids":{"openalex":"https://openalex.org/W7134895536","doi":"https://doi.org/10.48550/arxiv.2603.08989"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.08989","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.08989","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.08989","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5112055667","display_name":"Seungjun Yi","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Yi, Seungjun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5045611808","display_name":"Joakim Nguyen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Nguyen, Joakim","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128737189","display_name":"Huimin Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Huimin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5063224515","display_name":"Terence Lim","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lim, Terence","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128756683","display_name":"Joseph Skrovan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Skrovan, Joseph","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128729329","display_name":"Mehak Beri","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Beri, Mehak","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128775744","display_name":"Hitakshi Modi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Modi, Hitakshi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5054449239","display_name":"Andrew Well","orcid":"https://orcid.org/0000-0002-0725-2791"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Well, Andrew","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5002938900","display_name":"Carlos M. Mery","orcid":"https://orcid.org/0000-0003-0779-2381"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mery, Carlos M.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128712142","display_name":"Yan Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Yan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5062714971","display_name":"Mia K. Markey","orcid":"https://orcid.org/0000-0001-8186-4959"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Markey, Mia K.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5128716547","display_name":"Ying Ding","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ding, Ying","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":12,"corresponding_author_ids":["https://openalex.org/A5112055667"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.19460000097751617,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.19460000097751617,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13910","display_name":"Computational and Text Analysis Methods","score":0.1356000006198883,"subfield":{"id":"https://openalex.org/subfields/3300","display_name":"General Social Sciences"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11710","display_name":"Biomedical Text Mining and Ontologies","score":0.10010000318288803,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/codebook","display_name":"Codebook","score":0.8385000228881836},{"id":"https://openalex.org/keywords/generalizability-theory","display_name":"Generalizability theory","score":0.6876000165939331},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.46639999747276306},{"id":"https://openalex.org/keywords/benchmarking","display_name":"Benchmarking","score":0.4462999999523163},{"id":"https://openalex.org/keywords/thematic-map","display_name":"Thematic map","score":0.40689998865127563},{"id":"https://openalex.org/keywords/automation","display_name":"Automation","score":0.3955000042915344},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.392300009727478},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.37790000438690186},{"id":"https://openalex.org/keywords/discriminative-model","display_name":"Discriminative model","score":0.3612000048160553}],"concepts":[{"id":"https://openalex.org/C127759330","wikidata":"https://www.wikidata.org/wiki/Q637416","display_name":"Codebook","level":2,"score":0.8385000228881836},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6912000179290771},{"id":"https://openalex.org/C27158222","wikidata":"https://www.wikidata.org/wiki/Q5532422","display_name":"Generalizability theory","level":2,"score":0.6876000165939331},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.46639999747276306},{"id":"https://openalex.org/C86251818","wikidata":"https://www.wikidata.org/wiki/Q816754","display_name":"Benchmarking","level":2,"score":0.4462999999523163},{"id":"https://openalex.org/C93692415","wikidata":"https://www.wikidata.org/wiki/Q1502030","display_name":"Thematic map","level":2,"score":0.40689998865127563},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.40630000829696655},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3986999988555908},{"id":"https://openalex.org/C115901376","wikidata":"https://www.wikidata.org/wiki/Q184199","display_name":"Automation","level":2,"score":0.3955000042915344},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.392300009727478},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.37790000438690186},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3668000102043152},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.36250001192092896},{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.3612000048160553},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.35519999265670776},{"id":"https://openalex.org/C74196892","wikidata":"https://www.wikidata.org/wiki/Q7781188","display_name":"Thematic analysis","level":3,"score":0.3490000069141388},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.3433000147342682},{"id":"https://openalex.org/C164226766","wikidata":"https://www.wikidata.org/wiki/Q7293202","display_name":"Rank (graph theory)","level":2,"score":0.3368000090122223},{"id":"https://openalex.org/C81669768","wikidata":"https://www.wikidata.org/wiki/Q2359161","display_name":"Precision and recall","level":2,"score":0.3359000086784363},{"id":"https://openalex.org/C170858558","wikidata":"https://www.wikidata.org/wiki/Q1394144","display_name":"Automatic summarization","level":2,"score":0.31450000405311584},{"id":"https://openalex.org/C108010975","wikidata":"https://www.wikidata.org/wiki/Q500094","display_name":"Pruning","level":2,"score":0.30250000953674316},{"id":"https://openalex.org/C89600930","wikidata":"https://www.wikidata.org/wiki/Q1423946","display_name":"Segmentation","level":2,"score":0.3009999990463257},{"id":"https://openalex.org/C159694833","wikidata":"https://www.wikidata.org/wiki/Q2321565","display_name":"Iterative method","level":2,"score":0.2919999957084656},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.28780001401901245},{"id":"https://openalex.org/C2779982483","wikidata":"https://www.wikidata.org/wiki/Q6094420","display_name":"Iterative refinement","level":2,"score":0.2854999899864197},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.2847999930381775},{"id":"https://openalex.org/C145644426","wikidata":"https://www.wikidata.org/wiki/Q169411","display_name":"Unified Modeling Language","level":3,"score":0.28369998931884766},{"id":"https://openalex.org/C2775860198","wikidata":"https://www.wikidata.org/wiki/Q7257872","display_name":"Public hospital","level":2,"score":0.26750001311302185},{"id":"https://openalex.org/C170130773","wikidata":"https://www.wikidata.org/wiki/Q216378","display_name":"Usability","level":2,"score":0.25360000133514404}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.08989","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.08989","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.08989","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.08989","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.46254536509513855,"id":"https://metadata.un.org/sdg/5","display_name":"Gender equality"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Thematic":[0],"analysis":[1],"(TA)":[2],"is":[3],"widely":[4],"used":[5],"in":[6,20,99],"health":[7],"research":[8],"to":[9,80],"extract":[10],"patterns":[11],"from":[12],"patient":[13],"interviews,":[14,60],"yet":[15],"manual":[16],"TA":[17,44],"faces":[18],"challenges":[19],"scalability":[21],"and":[22,36,63,102],"reproducibility.":[23],"LLM-based":[24],"automation":[25],"can":[26],"help,":[27],"but":[28],"existing":[29],"approaches":[30],"produce":[31],"codebooks":[32],"with":[33,50,92,118],"limited":[34],"generalizability":[35],"lack":[37],"analytic":[38],"auditability.":[39],"We":[40],"present":[41],"an":[42],"automated":[43],"framework":[45,67],"combining":[46],"iterative":[47],"codebook":[48],"refinement":[49,84],"full":[51],"provenance":[52],"tracking.":[53],"Evaluated":[54],"on":[55,74,89],"five":[56,77],"corpora":[57,112],"spanning":[58],"clinical":[59,111],"social":[61],"media,":[62],"public":[64],"transcripts,":[65],"the":[66,69],"achieves":[68],"highest":[70],"composite":[71],"quality":[72],"score":[73],"four":[75,90],"of":[76],"datasets":[78,91],"compared":[79],"six":[81],"baselines.":[82],"Iterative":[83],"yields":[85],"statistically":[86],"significant":[87],"improvements":[88],"large":[93],"effect":[94],"sizes,":[95],"driven":[96],"by":[97],"gains":[98],"code":[100],"reusability":[101],"distributional":[103],"consistency":[104],"while":[105],"preserving":[106],"descriptive":[107],"quality.":[108],"On":[109],"two":[110],"(pediatric":[113],"cardiology),":[114],"generated":[115],"themes":[116],"align":[117],"expert-annotated":[119],"themes.":[120]},"counts_by_year":[],"updated_date":"2026-05-05T08:41:31.759640","created_date":"2026-03-12T00:00:00"}
