{"id":"https://openalex.org/W4415338080","doi":"https://doi.org/10.48550/arxiv.2509.25346","title":"SynthPert: Enhancing LLM Biological Reasoning via Synthetic Reasoning Traces for Cellular Perturbation Prediction","display_name":"SynthPert: Enhancing LLM Biological Reasoning via Synthetic Reasoning Traces for Cellular Perturbation Prediction","publication_year":2025,"publication_date":"2025-09-29","ids":{"openalex":"https://openalex.org/W4415338080","doi":"https://doi.org/10.48550/arxiv.2509.25346"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2509.25346","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2509.25346","pdf_url":"https://arxiv.org/pdf/2509.25346","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2509.25346","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5016040787","display_name":"Lawrence Phillips","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Phillips, Lawrence","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5052544681","display_name":"Marc Boubnovski Martell","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Martell, Marc Boubnovski","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101554186","display_name":"Aditya Misra","orcid":"https://orcid.org/0000-0003-2389-1275"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Misra, Aditya","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5119204116","display_name":"Josefa Lia Stoisser","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Stoisser, Josefa Lia","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5032827197","display_name":"Cesar A. Prada\u2010Medina","orcid":"https://orcid.org/0000-0003-1939-2297"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Prada-Medina, Cesar A.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5083377509","display_name":"Rory Donovan-Maiye","orcid":"https://orcid.org/0000-0001-8080-9771"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Donovan-Maiye, Rory","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5062738925","display_name":"Kaspar M\u00e4rtens","orcid":"https://orcid.org/0000-0002-7631-727X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"M\u00e4rtens, Kaspar","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5016040787"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11710","display_name":"Biomedical Text Mining and Ontologies","score":0.972100019454956,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},"topics":[{"id":"https://openalex.org/T11710","display_name":"Biomedical Text Mining and Ontologies","score":0.972100019454956,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T12254","display_name":"Machine Learning in Bioinformatics","score":0.9218000173568726,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/synthetic-data","display_name":"Synthetic data","score":0.5332000255584717},{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.5224999785423279},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.41990000009536743},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.3637000024318695},{"id":"https://openalex.org/keywords/model-based-reasoning","display_name":"Model-based reasoning","score":0.33730000257492065},{"id":"https://openalex.org/keywords/biological-cell","display_name":"Biological cell","score":0.32269999384880066}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6901999711990356},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6877999901771545},{"id":"https://openalex.org/C160920958","wikidata":"https://www.wikidata.org/wiki/Q7662746","display_name":"Synthetic data","level":2,"score":0.5332000255584717},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.5224999785423279},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.5044999718666077},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.41990000009536743},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.3637000024318695},{"id":"https://openalex.org/C37335422","wikidata":"https://www.wikidata.org/wiki/Q6888134","display_name":"Model-based reasoning","level":3,"score":0.33730000257492065},{"id":"https://openalex.org/C2983013280","wikidata":"https://www.wikidata.org/wiki/Q7868","display_name":"Biological cell","level":2,"score":0.32269999384880066},{"id":"https://openalex.org/C2992918937","wikidata":"https://www.wikidata.org/wiki/Q7239","display_name":"Biological organism","level":3,"score":0.3111000061035156},{"id":"https://openalex.org/C20162079","wikidata":"https://www.wikidata.org/wiki/Q1151406","display_name":"Case-based reasoning","level":2,"score":0.27489998936653137},{"id":"https://openalex.org/C166998942","wikidata":"https://www.wikidata.org/wiki/Q16969238","display_name":"Interactor","level":2,"score":0.2718000113964081},{"id":"https://openalex.org/C204030448","wikidata":"https://www.wikidata.org/wiki/Q101017","display_name":"Distillation","level":2,"score":0.2689000070095062},{"id":"https://openalex.org/C89288958","wikidata":"https://www.wikidata.org/wiki/Q7301504","display_name":"Reasoning system","level":2,"score":0.2685000002384186},{"id":"https://openalex.org/C159032336","wikidata":"https://www.wikidata.org/wiki/Q2488768","display_name":"Non-monotonic logic","level":2,"score":0.2632000148296356}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2509.25346","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2509.25346","pdf_url":"https://arxiv.org/pdf/2509.25346","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2509.25346","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2509.25346","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2509.25346","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2509.25346","pdf_url":"https://arxiv.org/pdf/2509.25346","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Predicting":[0],"cellular":[1],"responses":[2],"to":[3,34,40,45],"genetic":[4],"perturbations":[5],"represents":[6],"a":[7,52],"fundamental":[8],"challenge":[9],"in":[10,42,154],"systems":[11],"biology,":[12],"critical":[13],"for":[14,29,150],"advancing":[15],"therapeutic":[16],"discovery":[17],"and":[18,128],"virtual":[19],"cell":[20],"modeling.":[21],"While":[22],"large":[23],"language":[24],"models":[25],"(LLMs)":[26],"show":[27],"promise":[28],"biological":[30,109],"reasoning,":[31],"their":[32],"application":[33],"perturbation":[35],"prediction":[36],"remains":[37],"underexplored":[38],"due":[39],"challenges":[41],"adapting":[43],"them":[44],"structured":[46],"experimental":[47],"data.":[48,96,140],"We":[49],"present":[50],"SynthPert,":[51],"novel":[53],"method":[54],"that":[55,76,92],"enhances":[56],"LLM":[57],"performance":[58,83],"through":[59],"supervised":[60],"fine-tuning":[61],"on":[62,124],"synthetic":[63,147],"reasoning":[64,105,148,153],"traces":[65,106],"generated":[66,93],"by":[67],"frontier":[68,90],"models.":[69],"Using":[70],"the":[71,86,89,94,144],"PerturbQA":[72],"benchmark,":[73],"we":[74],"demonstrate":[75],"our":[77],"approach":[78,117],"not":[79],"only":[80,135],"achieves":[81],"state-of-the-art":[82],"but":[84],"surpasses":[85],"capabilities":[87],"of":[88,137,146],"model":[91],"training":[95,139],"Our":[97],"results":[98],"reveal":[99],"three":[100],"key":[101],"insights:":[102],"(1)":[103],"Synthetic":[104],"effectively":[107],"distill":[108],"knowledge":[110],"even":[111],"when":[112],"partially":[113],"inaccurate,":[114],"(2)":[115],"This":[116,141],"enables":[118],"cross-cell-type":[119],"generalization":[120],"with":[121],"87%":[122],"accuracy":[123],"unseen":[125],"RPE1":[126],"cells,":[127],"(3)":[129],"Performance":[130],"gains":[131],"persist":[132],"despite":[133],"using":[134],"2%":[136],"quality-filtered":[138],"work":[142],"shows":[143],"effectiveness":[145],"distillation":[149],"enhancing":[151],"domain-specific":[152],"LLMs.":[155]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-10-19T00:00:00"}
