{"id":"https://openalex.org/W7083699160","doi":"https://doi.org/10.48550/arxiv.2509.21647","title":"Automated Machine Learning Pipeline: Large Language Models-Assisted Automated Dataset Generation for Training Machine-Learned Interatomic Potentials","display_name":"Automated Machine Learning Pipeline: Large Language Models-Assisted Automated Dataset Generation for Training Machine-Learned Interatomic Potentials","publication_year":2025,"publication_date":"2025-09-25","ids":{"openalex":"https://openalex.org/W7083699160","doi":"https://doi.org/10.48550/arxiv.2509.21647"},"language":"en","primary_location":{"id":"doi:10.48550/arxiv.2509.21647","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2509.21647","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2509.21647","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Lahouari, Adam","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Lahouari, Adam","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Rogal, Jutta","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rogal, Jutta","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Tuckerman, Mark E.","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tuckerman, Mark E.","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":true,"primary_topic":{"id":"https://openalex.org/T11948","display_name":"Machine Learning in Materials Science","score":0.9939000010490417,"subfield":{"id":"https://openalex.org/subfields/2505","display_name":"Materials Chemistry"},"field":{"id":"https://openalex.org/fields/25","display_name":"Materials Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11948","display_name":"Machine Learning in Materials Science","score":0.9939000010490417,"subfield":{"id":"https://openalex.org/subfields/2505","display_name":"Materials Chemistry"},"field":{"id":"https://openalex.org/fields/25","display_name":"Materials Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11804","display_name":"Quantum many-body systems","score":0.002199999988079071,"subfield":{"id":"https://openalex.org/subfields/3107","display_name":"Atomic and Molecular Physics, and Optics"},"field":{"id":"https://openalex.org/fields/31","display_name":"Physics and Astronomy"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10002","display_name":"Advanced Chemical Physics Studies","score":0.00039999998989515007,"subfield":{"id":"https://openalex.org/subfields/3107","display_name":"Atomic and Molecular Physics, and Optics"},"field":{"id":"https://openalex.org/fields/31","display_name":"Physics and Astronomy"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.5318999886512756},{"id":"https://openalex.org/keywords/preprocessor","display_name":"Preprocessor","score":0.5253999829292297},{"id":"https://openalex.org/keywords/suite","display_name":"Suite","score":0.5175999999046326},{"id":"https://openalex.org/keywords/workflow","display_name":"Workflow","score":0.4805000126361847},{"id":"https://openalex.org/keywords/stability","display_name":"Stability (learning theory)","score":0.45089998841285706},{"id":"https://openalex.org/keywords/range","display_name":"Range (aeronautics)","score":0.44429999589920044},{"id":"https://openalex.org/keywords/molecular-dynamics","display_name":"Molecular dynamics","score":0.4104999899864197},{"id":"https://openalex.org/keywords/interatomic-potential","display_name":"Interatomic potential","score":0.40310001373291016}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6862999796867371},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6419000029563904},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.5975000262260437},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.5318999886512756},{"id":"https://openalex.org/C34736171","wikidata":"https://www.wikidata.org/wiki/Q918333","display_name":"Preprocessor","level":2,"score":0.5253999829292297},{"id":"https://openalex.org/C79581498","wikidata":"https://www.wikidata.org/wiki/Q1367530","display_name":"Suite","level":2,"score":0.5175999999046326},{"id":"https://openalex.org/C177212765","wikidata":"https://www.wikidata.org/wiki/Q627335","display_name":"Workflow","level":2,"score":0.4805000126361847},{"id":"https://openalex.org/C112972136","wikidata":"https://www.wikidata.org/wiki/Q7595718","display_name":"Stability (learning theory)","level":2,"score":0.45089998841285706},{"id":"https://openalex.org/C204323151","wikidata":"https://www.wikidata.org/wiki/Q905424","display_name":"Range (aeronautics)","level":2,"score":0.44429999589920044},{"id":"https://openalex.org/C59593255","wikidata":"https://www.wikidata.org/wiki/Q901663","display_name":"Molecular dynamics","level":2,"score":0.4104999899864197},{"id":"https://openalex.org/C2776372370","wikidata":"https://www.wikidata.org/wiki/Q3399989","display_name":"Interatomic potential","level":3,"score":0.40310001373291016},{"id":"https://openalex.org/C111219384","wikidata":"https://www.wikidata.org/wiki/Q6954384","display_name":"NIST","level":2,"score":0.38440001010894775},{"id":"https://openalex.org/C45942800","wikidata":"https://www.wikidata.org/wiki/Q245652","display_name":"Ensemble learning","level":2,"score":0.3813999891281128},{"id":"https://openalex.org/C28556851","wikidata":"https://www.wikidata.org/wiki/Q1077753","display_name":"Canonical ensemble","level":3,"score":0.36230000853538513},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.33629998564720154},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.30640000104904175},{"id":"https://openalex.org/C2778241615","wikidata":"https://www.wikidata.org/wiki/Q83303","display_name":"Fortran","level":2,"score":0.3050999939441681},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.301800012588501},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.301800012588501},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.28839999437332153},{"id":"https://openalex.org/C190390380","wikidata":"https://www.wikidata.org/wiki/Q62505","display_name":"Physics engine","level":2,"score":0.2858999967575073},{"id":"https://openalex.org/C58312451","wikidata":"https://www.wikidata.org/wiki/Q4817200","display_name":"Atom (system on chip)","level":2,"score":0.25619998574256897},{"id":"https://openalex.org/C203005215","wikidata":"https://www.wikidata.org/wiki/Q79798","display_name":"Machine translation","level":2,"score":0.2554999887943268}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2509.21647","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2509.21647","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2509.21647","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2509.21647","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Industry, innovation and infrastructure","score":0.42960262298583984,"id":"https://metadata.un.org/sdg/9"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Machine":[0,55],"learning":[1],"interatomic":[2],"potentials":[3],"(MLIPs)":[4],"have":[5],"become":[6],"powerful":[7],"tools":[8],"to":[9,67,74],"extend":[10],"molecular":[11,97,149],"simulations":[12,151],"beyond":[13],"the":[14,61,104,153],"limits":[15],"of":[16,96,117,124],"quantum":[17],"methods,":[18],"offering":[19],"near-quantum":[20],"accuracy":[21,144],"at":[22],"much":[23],"lower":[24],"computational":[25],"cost.":[26],"Yet,":[27],"developing":[28],"reliable":[29],"MLIPs":[30],"remains":[31],"difficult":[32],"because":[33],"it":[34],"requires":[35],"generating":[36],"high-quality":[37],"datasets,":[38],"preprocessing":[39],"atomic":[40],"structures,":[41],"and":[42,45,82,107,129,145,155],"carefully":[43],"training":[44],"validating":[46],"models.":[47],"In":[48],"this":[49],"work,":[50],"we":[51],"introduce":[52],"an":[53],"Automated":[54],"Learning":[56],"Pipeline":[57],"(AMLP)":[58],"that":[59],"unifies":[60],"entire":[62],"workflow":[63],"from":[64],"dataset":[65],"creation":[66],"model":[68],"validation.":[69],"AMLP":[70],"employs":[71],"large-language-model":[72],"agents":[73],"assist":[75],"with":[76,113,142],"electronic-structure":[77],"code":[78],"selection,":[79],"input":[80],"preparation,":[81],"output":[83],"conversion,":[84],"while":[85],"its":[86],"analysis":[87],"suite":[88],"(AMLP-Analysis),":[89],"based":[90],"on":[91,103,109],"ASE":[92],"supports":[93],"a":[94,114,118],"range":[95],"simulations.":[98],"The":[99,136],"pipeline":[100],"is":[101],"built":[102],"MACE":[105],"architecture":[106],"validated":[108],"acridine":[110],"polymorphs,":[111],"where,":[112],"straightforward":[115],"fine-tuning":[116],"foundation":[119],"model,":[120],"mean":[121],"absolute":[122],"errors":[123],"~1.7":[125],"meV/atom":[126],"in":[127,132,152],"energies":[128],"~7.0":[130],"meV/\u00c5":[131],"forces":[133],"are":[134],"achieved.":[135],"fitted":[137],"MLIP":[138],"reproduces":[139],"DFT":[140],"geometries":[141],"sub-\u00c5":[143],"demonstrates":[146],"stability":[147],"during":[148],"dynamics":[150],"microcanonical":[154],"canonical":[156],"ensembles.":[157]},"counts_by_year":[],"updated_date":"2025-12-23T23:11:35.936235","created_date":"2025-10-10T00:00:00"}
