{"id":"https://openalex.org/W7160523724","doi":"https://doi.org/10.48550/arxiv.2605.05104","title":"Building informative materials datasets beyond targeted objectives","display_name":"Building informative materials datasets beyond targeted objectives","publication_year":2026,"publication_date":"2026-05-06","ids":{"openalex":"https://openalex.org/W7160523724","doi":"https://doi.org/10.48550/arxiv.2605.05104"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.05104","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.05104","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.05104","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5110824192","display_name":"Rafael Casta\u00f1eda","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Casta\u00f1eda, Rafael Espinosa","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135589229","display_name":"Ashley Dale","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dale, Ashley","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5115595083","display_name":"Hongchen Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Hongchen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135607447","display_name":"Yonatan Kurniawan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kurniawan, Yonatan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135598218","display_name":"Hao Wan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wan, Hao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135630818","display_name":"Runze Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Runze","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5063448291","display_name":"Adji Bousso Dieng","orcid":"https://orcid.org/0000-0001-5687-3554"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dieng, Adji Bousso","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135571680","display_name":"Kangming Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Kangming","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5135571778","display_name":"Jason Hattrick-Simpers","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hattrick-Simpers, Jason","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11948","display_name":"Machine Learning in Materials Science","score":0.9965000152587891,"subfield":{"id":"https://openalex.org/subfields/2505","display_name":"Materials Chemistry"},"field":{"id":"https://openalex.org/fields/25","display_name":"Materials Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11948","display_name":"Machine Learning in Materials Science","score":0.9965000152587891,"subfield":{"id":"https://openalex.org/subfields/2505","display_name":"Materials Chemistry"},"field":{"id":"https://openalex.org/fields/25","display_name":"Materials Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11636","display_name":"Artificial Intelligence in Healthcare and Education","score":0.00039999998989515007,"subfield":{"id":"https://openalex.org/subfields/2718","display_name":"Health Informatics"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},{"id":"https://openalex.org/T11273","display_name":"Advanced Graph Neural Networks","score":0.00019999999494757503,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/reuse","display_name":"Reuse","score":0.6126999855041504},{"id":"https://openalex.org/keywords/selection","display_name":"Selection (genetic algorithm)","score":0.5644000172615051},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.510699987411499},{"id":"https://openalex.org/keywords/sampling","display_name":"Sampling (signal processing)","score":0.4185999929904938},{"id":"https://openalex.org/keywords/data-collection","display_name":"Data collection","score":0.39430001378059387}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6917999982833862},{"id":"https://openalex.org/C206588197","wikidata":"https://www.wikidata.org/wiki/Q846574","display_name":"Reuse","level":2,"score":0.6126999855041504},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.5644000172615051},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.5236999988555908},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.510699987411499},{"id":"https://openalex.org/C140779682","wikidata":"https://www.wikidata.org/wiki/Q210868","display_name":"Sampling (signal processing)","level":3,"score":0.4185999929904938},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.41040000319480896},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.4097000062465668},{"id":"https://openalex.org/C133462117","wikidata":"https://www.wikidata.org/wiki/Q4929239","display_name":"Data collection","level":2,"score":0.39430001378059387},{"id":"https://openalex.org/C24756922","wikidata":"https://www.wikidata.org/wiki/Q1757694","display_name":"Data quality","level":3,"score":0.3864000141620636},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.2847000062465668},{"id":"https://openalex.org/C2781316041","wikidata":"https://www.wikidata.org/wiki/Q1230584","display_name":"Diversity (politics)","level":2,"score":0.2669999897480011}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.05104","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.05104","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"Preprint"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.05104","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.05104","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Materials":[0],"science":[1],"data":[2,40],"collection":[3,41],"can":[4,104,129],"be":[5],"expensive,":[6],"making":[7],"the":[8,84,161,176],"reuse":[9],"and":[10,184,191,198],"long-term":[11],"utility":[12],"of":[13,27,37,66,83,120,147],"datasets":[14,45,178],"critical":[15],"important":[16],"for":[17,48,57,63,160,169],"future":[18,49,171],"discovery":[19,199],"campaigns.":[20,200],"In":[21,87],"practice,":[22],"researchers":[23],"prioritize":[24],"a":[25,35,55,174],"subset":[26,36],"properties":[28,65,103],"due":[29],"to":[30,79,108,111,122,133,138,149],"research":[31],"interests.":[32],"However,":[33],"ignoring":[34],"outcomes":[38],"in":[39,195],"campaigns":[42],"potentially":[43],"generate":[44],"poorly":[46],"suited":[47],"learning":[50],"tasks.":[51],"Here,":[52],"we":[53,92],"present":[54],"framework":[56,117,144],"dataset":[58,90,154],"construction":[59,155],"that":[60,94],"maximizes":[61],"informativeness":[62,159],"target":[64],"interest":[67],"while":[68,142],"preserving":[69],"performance":[70,100,128],"on":[71,101],"untargeted":[72,102],"ones.":[73],"Our":[74],"approach":[75],"uses":[76],"diversity-aware":[77,97],"selection":[78],"ensure":[80],"broad":[81],"coverage":[82,168],"materials":[85,167],"space.":[86],"noisy":[88],"experimental":[89],"construction,":[91],"find":[93],"without":[95,140],"our":[96,116,143],"framework,":[98],"prediction":[99],"degrade":[105,130],"by":[106,136],"up":[107,121,137,148],"40%":[109],"relative":[110],"random":[112,134],"sampling,":[113],"whereas":[114],"applying":[115],"yields":[118],"improvements":[119],"10%":[123],".":[124],"For":[125],"targeted":[126,162],"properties,":[127,163],"with":[131],"respect":[132],"sampling":[135],"12.5%":[139],"diversity,":[141],"achieves":[145],"gains":[146],"25%.":[150],"Incorporating":[151],"diversity":[152],"into":[153],"not":[156],"only":[157],"preserves":[158],"but":[164],"also":[165],"improves":[166],"potential":[170],"objectives.":[172],"As":[173],"result,":[175],"constructed":[177],"remain":[179],"broadly":[180],"informative":[181],"across":[182],"considered":[183],"unconsidered":[185],"outcomes,":[186],"ensuring":[187],"unbiased":[188],"quality":[189],"entries":[190],"mitigating":[192],"cold-start":[193],"limitations":[194],"subsequent":[196],"modeling":[197]},"counts_by_year":[],"updated_date":"2026-07-01T06:00:48.157686","created_date":"2026-05-08T00:00:00"}
