{"id":"https://openalex.org/W4404261099","doi":"https://doi.org/10.48550/arxiv.2410.16682","title":"Methods of improving LLM training stability","display_name":"Methods of improving LLM training stability","publication_year":2024,"publication_date":"2024-10-22","ids":{"openalex":"https://openalex.org/W4404261099","doi":"https://doi.org/10.48550/arxiv.2410.16682"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2410.16682","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2410.16682","pdf_url":"https://arxiv.org/pdf/2410.16682","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2410.16682","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5070398306","display_name":"Oleg Rybakov\u200e","orcid":"https://orcid.org/0000-0003-4805-3083"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Rybakov, Oleg","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5114601860","display_name":"Mike Chrzanowski","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chrzanowski, Mike","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5114601861","display_name":"Peter Dykas","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dykas, Peter","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5036271098","display_name":"Jinze Xue","orcid":"https://orcid.org/0000-0002-4582-8034"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xue, Jinze","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5114601862","display_name":"Ben Lanir","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lanir, Ben","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5070398306"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T14470","display_name":"Advanced Data Processing Techniques","score":0.836899995803833,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T14470","display_name":"Advanced Data Processing Techniques","score":0.836899995803833,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13045","display_name":"Industrial Engineering and Technologies","score":0.8158000111579895,"subfield":{"id":"https://openalex.org/subfields/2210","display_name":"Mechanical Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13891","display_name":"Engineering Diagnostics and Reliability","score":0.7148000001907349,"subfield":{"id":"https://openalex.org/subfields/2211","display_name":"Mechanics of Materials"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.7462943196296692},{"id":"https://openalex.org/keywords/stability","display_name":"Stability (learning theory)","score":0.6398355960845947},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.37770047783851624},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.1924026906490326},{"id":"https://openalex.org/keywords/geography","display_name":"Geography","score":0.11275455355644226},{"id":"https://openalex.org/keywords/meteorology","display_name":"Meteorology","score":0.05469498038291931}],"concepts":[{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.7462943196296692},{"id":"https://openalex.org/C112972136","wikidata":"https://www.wikidata.org/wiki/Q7595718","display_name":"Stability (learning theory)","level":2,"score":0.6398355960845947},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.37770047783851624},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.1924026906490326},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.11275455355644226},{"id":"https://openalex.org/C153294291","wikidata":"https://www.wikidata.org/wiki/Q25261","display_name":"Meteorology","level":1,"score":0.05469498038291931}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2410.16682","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2410.16682","pdf_url":"https://arxiv.org/pdf/2410.16682","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2410.16682","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2410.16682","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2410.16682","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2410.16682","pdf_url":"https://arxiv.org/pdf/2410.16682","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4404261099.pdf","grobid_xml":"https://content.openalex.org/works/W4404261099.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W230091440","https://openalex.org/W2390279801","https://openalex.org/W2233261550","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2810751659"],"abstract_inverted_index":{"Training":[0],"stability":[1],"of":[2,39,42,48,57,68,75,93,121],"large":[3],"language":[4,22],"models(LLMs)":[5],"is":[6,45],"an":[7,191],"important":[8],"research":[9],"topic.":[10],"Reproducing":[11],"training":[12,43,102],"instabilities":[13],"can":[14,98,179],"be":[15],"costly,":[16],"so":[17],"we":[18,109,178,200],"use":[19],"a":[20,86],"small":[21],"model":[23,106,186],"with":[24,29,85,100,167,173],"830M":[25],"parameters":[26],"and":[27,61,104,114,145],"experiment":[28],"higher":[30],"learning":[31,88,181],"rates":[32],"to":[33,36,128,190,211],"force":[34],"models":[35],"diverge.":[37],"One":[38],"the":[40,46,55,58,66,69,79,90,105,118,122,154,174,212],"sources":[41],"instability":[44],"growth":[47,120],"logits":[49,70],"in":[50,78,188,209],"attention":[51],"layers.":[52],"We":[53,82,170],"extend":[54],"focus":[56],"previous":[59],"work":[60],"look":[62],"not":[63,136],"only":[64,137],"at":[65,72],"magnitude":[67],"but":[71,141],"all":[73,94,206],"outputs":[74,97],"linear":[76,95],"layers":[77,116,140,147],"Transformer":[80],"block.":[81],"observe":[83,110,201],"that":[84,111,172],"high":[87],"rate":[89,182],"L2":[91],"norm":[92],"layer":[96,134,151,156,164,196],"grow":[99],"each":[101],"step":[103],"diverges.":[107],"Specifically":[108],"QKV,":[112],"Proj":[113,144],"FC2":[115,146],"have":[117],"largest":[119],"output":[123],"magnitude.":[124],"This":[125],"prompts":[126],"us":[127],"explore":[129],"several":[130],"options:":[131],"1)":[132],"apply":[133,150,162],"normalization":[135,152,165,197],"after":[138,143,153],"QK":[139,163,195],"also":[142],"too;":[148],"2)":[149],"QKV":[155],"(and":[157],"remove":[158],"pre":[159],"normalization).":[160],"3)":[161],"together":[166],"softmax":[168],"capping.":[169],"show":[171],"last":[175],"two":[176],"methods":[177,208],"increase":[180],"by":[183],"1.5x":[184],"(without":[185],"divergence)":[187],"comparison":[189,210],"approach":[192],"based":[193],"on":[194],"only.":[198],"Also":[199],"significant":[202],"perplexity":[203],"improvements":[204],"for":[205],"three":[207],"baseline":[213],"model.":[214]},"counts_by_year":[],"updated_date":"2026-03-08T08:50:53.379069","created_date":"2025-10-10T00:00:00"}
