{"id":"https://openalex.org/W7128552104","doi":"https://doi.org/10.48550/arxiv.2602.08849","title":"Cutting Through the Noise: On-the-fly Outlier Detection for Robust Training of Machine Learning Interatomic Potentials","display_name":"Cutting Through the Noise: On-the-fly Outlier Detection for Robust Training of Machine Learning Interatomic Potentials","publication_year":2026,"publication_date":"2026-02-09","ids":{"openalex":"https://openalex.org/W7128552104","doi":"https://doi.org/10.48550/arxiv.2602.08849"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.08849","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5125485319","display_name":"Terry C. W. Lam","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lam, Terry C. W.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5125578506","display_name":"Niamh O'Neill","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"O'Neill, Niamh","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5026599330","display_name":"Christoph Schran","orcid":"https://orcid.org/0000-0003-4595-5073"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Schran, Christoph","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5122801315","display_name":"Lars L. Schaaf","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Schaaf, Lars L.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11948","display_name":"Machine Learning in Materials Science","score":0.9944000244140625,"subfield":{"id":"https://openalex.org/subfields/2505","display_name":"Materials Chemistry"},"field":{"id":"https://openalex.org/fields/25","display_name":"Materials Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11948","display_name":"Machine Learning in Materials Science","score":0.9944000244140625,"subfield":{"id":"https://openalex.org/subfields/2505","display_name":"Materials Chemistry"},"field":{"id":"https://openalex.org/fields/25","display_name":"Materials Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10211","display_name":"Computational Drug Discovery Methods","score":0.0017000000225380063,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10002","display_name":"Advanced Chemical Physics Studies","score":0.0003000000142492354,"subfield":{"id":"https://openalex.org/subfields/3107","display_name":"Atomic and Molecular Physics, and Optics"},"field":{"id":"https://openalex.org/fields/31","display_name":"Physics and Astronomy"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/overfitting","display_name":"Overfitting","score":0.6836000084877014},{"id":"https://openalex.org/keywords/outlier","display_name":"Outlier","score":0.5954999923706055},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.4625999927520752},{"id":"https://openalex.org/keywords/extrapolation","display_name":"Extrapolation","score":0.4375},{"id":"https://openalex.org/keywords/noise","display_name":"Noise (video)","score":0.4244999885559082},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.3984000086784363},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.3763999938964844},{"id":"https://openalex.org/keywords/kernel","display_name":"Kernel (algebra)","score":0.35830000042915344}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7279999852180481},{"id":"https://openalex.org/C22019652","wikidata":"https://www.wikidata.org/wiki/Q331309","display_name":"Overfitting","level":3,"score":0.6836000084877014},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6674000024795532},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.6363000273704529},{"id":"https://openalex.org/C79337645","wikidata":"https://www.wikidata.org/wiki/Q779824","display_name":"Outlier","level":2,"score":0.5954999923706055},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.4625999927520752},{"id":"https://openalex.org/C132459708","wikidata":"https://www.wikidata.org/wiki/Q744069","display_name":"Extrapolation","level":2,"score":0.4375},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.4244999885559082},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.3984000086784363},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.3978999853134155},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.3763999938964844},{"id":"https://openalex.org/C74193536","wikidata":"https://www.wikidata.org/wiki/Q574844","display_name":"Kernel (algebra)","level":2,"score":0.35830000042915344},{"id":"https://openalex.org/C60478076","wikidata":"https://www.wikidata.org/wiki/Q3036835","display_name":"Reference data","level":2,"score":0.3546999990940094},{"id":"https://openalex.org/C739882","wikidata":"https://www.wikidata.org/wiki/Q3560506","display_name":"Anomaly detection","level":2,"score":0.3443000018596649},{"id":"https://openalex.org/C101738243","wikidata":"https://www.wikidata.org/wiki/Q786435","display_name":"Autoencoder","level":3,"score":0.32919999957084656},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.3149999976158142},{"id":"https://openalex.org/C132094186","wikidata":"https://www.wikidata.org/wiki/Q641585","display_name":"Clutter","level":3,"score":0.3138999938964844},{"id":"https://openalex.org/C136389625","wikidata":"https://www.wikidata.org/wiki/Q334384","display_name":"Supervised learning","level":3,"score":0.31279999017715454},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.3125999867916107},{"id":"https://openalex.org/C186370098","wikidata":"https://www.wikidata.org/wiki/Q442787","display_name":"Energy (signal processing)","level":2,"score":0.30570000410079956},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.30169999599456787},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.29760000109672546},{"id":"https://openalex.org/C74883015","wikidata":"https://www.wikidata.org/wiki/Q290467","display_name":"Autoregressive\u2013moving-average model","level":3,"score":0.2515999972820282}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.08849","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.08849","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.08849","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"Preprint"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.08849","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"score":0.6195310950279236,"display_name":"Affordable and clean energy","id":"https://metadata.un.org/sdg/7"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"The":[0,116],"accuracy":[1],"of":[2,39,108,161],"machine":[3],"learning":[4],"interatomic":[5],"potentials":[6],"suffers":[7],"from":[8,17,129],"reference":[9,75,131],"data":[10],"that":[11,67,99],"contains":[12],"numerical":[13],"noise.":[14],"Often":[15],"originating":[16],"unconverged":[18,130],"or":[19,36,46],"inconsistent":[20],"electronic-structure":[21],"calculations,":[22],"this":[23,87,100],"noise":[24],"is":[25,119],"challenging":[26],"to":[27,54,56],"identify.":[28],"Existing":[29],"mitigation":[30],"strategies":[31],"such":[32],"as":[33],"manual":[34],"filtering":[35],"iterative":[37,109],"refinement":[38,110],"outliers,":[40],"require":[41],"either":[42],"substantial":[43],"expert":[44],"effort":[45],"multiple":[47],"expensive":[48],"retraining":[49],"cycles,":[50],"making":[51],"them":[52],"difficult":[53],"scale":[55],"large":[57],"datasets.":[58],"Here,":[59],"we":[60,137],"introduce":[61],"an":[62,83],"on-the-fly":[63],"outlier":[64],"detection":[65],"scheme":[66],"automatically":[68],"down-weights":[69],"noisy":[70],"samples,":[71],"without":[72],"requiring":[73],"additional":[74],"calculations.":[76],"By":[77],"tracking":[78],"the":[79,106,150],"loss":[80],"distribution":[81],"via":[82],"exponential":[84],"moving":[85],"average,":[86],"unsupervised":[88],"method":[89],"identifies":[90],"outliers":[91],"throughout":[92],"a":[93,143,159,166],"single":[94],"training":[95,142,171],"run.":[96],"We":[97],"show":[98],"approach":[101],"prevents":[102],"overfitting":[103],"and":[104],"matches":[105],"performance":[107],"baselines":[111],"with":[112],"significantly":[113],"reduced":[114],"overhead.":[115],"method's":[117],"effectiveness":[118],"demonstrated":[120],"by":[121,141,158],"recovering":[122],"accurate":[123],"physical":[124],"observables":[125],"for":[126,146,170],"liquid":[127],"water":[128],"data,":[132],"including":[133],"diffusion":[134],"coefficients.":[135],"Furthermore,":[136],"validate":[138],"its":[139],"scalability":[140],"foundation":[144],"model":[145],"organic":[147],"chemistry":[148],"on":[149,174],"SPICE":[151],"dataset,":[152],"where":[153],"it":[154],"reduces":[155],"energy":[156],"errors":[157],"factor":[160],"three.":[162],"This":[163],"framework":[164],"provides":[165],"simple,":[167],"automated":[168],"solution":[169],"robust":[172],"models":[173],"imperfect":[175],"datasets":[176],"across":[177],"dataset":[178],"sizes.":[179]},"counts_by_year":[],"updated_date":"2026-07-01T06:00:48.157686","created_date":"2026-02-11T00:00:00"}
