{"id":"https://openalex.org/W7138397554","doi":"https://doi.org/10.48550/arxiv.2603.15276","title":"Dataset Diversity Metrics and Impact on Classification Models","display_name":"Dataset Diversity Metrics and Impact on Classification Models","publication_year":2026,"publication_date":"2026-03-16","ids":{"openalex":"https://openalex.org/W7138397554","doi":"https://doi.org/10.48550/arxiv.2603.15276"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.15276","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.15276","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.15276","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5092486744","display_name":"Th\u00e9o Sourget","orcid":"https://orcid.org/0009-0005-0220-9590"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Sourget, Th\u00e9o","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129681757","display_name":"Niclas Cla\u00dfen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cla\u00dfen, Niclas","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129702202","display_name":"Jack Junchi Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Jack Junchi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129672369","display_name":"Rob van der Goot","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"van der Goot, Rob","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5129658316","display_name":"Veronika Cheplygina","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cheplygina, Veronika","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5092486744"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11775","display_name":"COVID-19 diagnosis using AI","score":0.32010000944137573,"subfield":{"id":"https://openalex.org/subfields/2741","display_name":"Radiology, Nuclear Medicine and Imaging"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},"topics":[{"id":"https://openalex.org/T11775","display_name":"COVID-19 diagnosis using AI","score":0.32010000944137573,"subfield":{"id":"https://openalex.org/subfields/2741","display_name":"Radiology, Nuclear Medicine and Imaging"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},{"id":"https://openalex.org/T11636","display_name":"Artificial Intelligence in Healthcare and Education","score":0.14569999277591705,"subfield":{"id":"https://openalex.org/subfields/2718","display_name":"Health Informatics"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},{"id":"https://openalex.org/T12422","display_name":"Radiomics and Machine Learning in Medical Imaging","score":0.1412000060081482,"subfield":{"id":"https://openalex.org/subfields/2741","display_name":"Radiology, Nuclear Medicine and Imaging"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/metadata","display_name":"Metadata","score":0.7195000052452087},{"id":"https://openalex.org/keywords/diversity","display_name":"Diversity (politics)","score":0.6147000193595886},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.5164999961853027},{"id":"https://openalex.org/keywords/intuition","display_name":"Intuition","score":0.5001999735832214},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.43140000104904175},{"id":"https://openalex.org/keywords/source-code","display_name":"Source code","score":0.36480000615119934}],"concepts":[{"id":"https://openalex.org/C93518851","wikidata":"https://www.wikidata.org/wiki/Q180160","display_name":"Metadata","level":2,"score":0.7195000052452087},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6636999845504761},{"id":"https://openalex.org/C2781316041","wikidata":"https://www.wikidata.org/wiki/Q1230584","display_name":"Diversity (politics)","level":2,"score":0.6147000193595886},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.5164999961853027},{"id":"https://openalex.org/C132010649","wikidata":"https://www.wikidata.org/wiki/Q189222","display_name":"Intuition","level":2,"score":0.5001999735832214},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.48539999127388},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.46549999713897705},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.43140000104904175},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.41760000586509705},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.39340001344680786},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.36480000615119934},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.3061999976634979},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.2842000126838684},{"id":"https://openalex.org/C2983685735","wikidata":"https://www.wikidata.org/wiki/Q5227355","display_name":"Data source","level":2,"score":0.27000001072883606},{"id":"https://openalex.org/C94124525","wikidata":"https://www.wikidata.org/wiki/Q912550","display_name":"Categorization","level":2,"score":0.26089999079704285},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.2578999996185303},{"id":"https://openalex.org/C117220453","wikidata":"https://www.wikidata.org/wiki/Q5172842","display_name":"Correlation","level":2,"score":0.25690001249313354},{"id":"https://openalex.org/C58489278","wikidata":"https://www.wikidata.org/wiki/Q1172284","display_name":"Data set","level":2,"score":0.25040000677108765}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.15276","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.15276","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.15276","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.15276","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"The":[0,174],"diversity":[1,21,39,57,129,140,154],"of":[2,20,37,54,94,114,153,163],"training":[3,112,168],"datasets":[4],"is":[5,22,40,180],"usually":[6],"perceived":[7],"as":[8],"an":[9],"important":[10],"aspect":[11],"to":[12,166,171],"obtain":[13],"a":[14,66,74,95],"robust":[15],"model.":[16],"However,":[17,157],"the":[18,35,52,92,111,115,122,135,138,143,150,161,167],"definition":[19],"often":[23,41],"not":[24],"defined":[25],"or":[26,126],"differs":[27],"across":[28],"papers,":[29],"and":[30,62,72,107,124,137],"while":[31],"some":[32],"metrics":[33,58,84],"exist,":[34],"quantification":[36],"this":[38,48,178],"overlooked":[42],"when":[43],"developing":[44],"new":[45],"algorithms.":[46],"In":[47],"work,":[49],"we":[50,158],"study":[51,179],"behaviour":[53],"multiple":[55],"dataset":[56,68],"for":[59],"image,":[60],"text":[61],"metadata":[63,127],"using":[64],"MorphoMNIST,":[65],"toy":[67],"with":[69,86,91,104,134],"controlled":[70],"perturbations,":[71],"PadChest,":[73],"publicly":[75],"available":[76,181],"chest":[77],"X-ray":[78],"dataset.":[79],"We":[80,98,117],"evaluate":[81],"whether":[82,101],"these":[83],"correlate":[85,103],"each":[87],"other":[88],"but":[89,131],"also":[90,99],"intuition":[93],"clinical":[96,144],"expert.":[97],"assess":[100],"they":[102,109],"downstream-task":[105],"performance":[106],"how":[108],"impact":[110],"dynamic":[113],"models.":[116],"find":[118,159],"limited":[119],"correlations":[120,133],"between":[121],"AUC":[123],"image":[125],"reference-free":[128],"metrics,":[130],"higher":[132],"FID":[136],"semantic":[139],"metrics.":[141],"Finally,":[142],"expert":[145],"indicates":[146],"that":[147,160],"scanners":[148],"are":[149],"main":[151],"source":[152],"in":[155,177],"practice.":[156],"addition":[162],"another":[164],"scanner":[165],"set":[169],"leads":[170],"shortcut":[172],"learning.":[173],"code":[175],"used":[176],"at":[182],"https://github.com/TheoSourget/dataset_diversity_evaluation":[183]},"counts_by_year":[],"updated_date":"2026-03-18T06:31:55.123368","created_date":"2026-03-18T00:00:00"}
