{"id":"https://openalex.org/W4407571370","doi":"https://doi.org/10.48550/arxiv.2502.08924","title":"Escaping Collapse: The Strength of Weak Data for Large Language Model Training","display_name":"Escaping Collapse: The Strength of Weak Data for Large Language Model Training","publication_year":2025,"publication_date":"2025-02-13","ids":{"openalex":"https://openalex.org/W4407571370","doi":"https://doi.org/10.48550/arxiv.2502.08924"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2502.08924","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2502.08924","pdf_url":"https://arxiv.org/pdf/2502.08924","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2502.08924","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101006637","display_name":"Kareem Amin","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Amin, Kareem","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5116269881","display_name":"Sara Babakniya","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Babakniya, Sara","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5004876675","display_name":"Alex Bie","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bie, Alex","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102757104","display_name":"Weiwei Kong","orcid":"https://orcid.org/0000-0002-5013-3739"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kong, Weiwei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103121691","display_name":"Umar Syed","orcid":"https://orcid.org/0009-0007-3797-3614"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Syed, Umar","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5070795618","display_name":"Sergei Vassilvitskii","orcid":"https://orcid.org/0000-0003-0235-1624"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Vassilvitskii, Sergei","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5101006637"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9876000285148621,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9876000285148621,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9843000173568726,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.573456346988678},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.3882145881652832},{"id":"https://openalex.org/keywords/physics","display_name":"Physics","score":0.14636000990867615},{"id":"https://openalex.org/keywords/meteorology","display_name":"Meteorology","score":0.06574815511703491}],"concepts":[{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.573456346988678},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.3882145881652832},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.14636000990867615},{"id":"https://openalex.org/C153294291","wikidata":"https://www.wikidata.org/wiki/Q25261","display_name":"Meteorology","level":1,"score":0.06574815511703491}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2502.08924","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2502.08924","pdf_url":"https://arxiv.org/pdf/2502.08924","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2502.08924","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2502.08924","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2502.08924","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2502.08924","pdf_url":"https://arxiv.org/pdf/2502.08924","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4407571370.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W230091440","https://openalex.org/W2390279801","https://openalex.org/W2233261550","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2810751659"],"abstract_inverted_index":{"Synthetically-generated":[0],"data":[1,15],"plays":[2],"an":[3,92],"increasingly":[4],"larger":[5],"role":[6],"in":[7,63,149],"training":[8,42,106],"large":[9],"language":[10],"models.":[11],"However,":[12],"while":[13],"synthetic":[14,109],"has":[16],"been":[17],"found":[18],"to":[19,35,56,65,90,165],"be":[20],"useful,":[21],"studies":[22],"have":[23],"also":[24,123],"shown":[25],"that":[26,67,83,132,138,154],"without":[27],"proper":[28],"curation":[29,60],"it":[30],"can":[31],"cause":[32],"LLM":[33,68],"performance":[34,69],"plateau,":[36],"or":[37],"even":[38],"\"collapse\",":[39],"after":[40],"many":[41,101],"iterations.":[43],"In":[44],"this":[45,49],"paper,":[46],"we":[47,98],"formalize":[48],"question":[50],"and":[51,111,122,136],"develop":[52],"a":[53,78,85],"theoretical":[54],"framework":[55],"investigate":[57],"how":[58],"much":[59,150],"is":[61,74],"needed":[62],"order":[64],"ensure":[66],"continually":[70],"improves.":[71],"Our":[72],"analysis":[73,114],"inspired":[75],"by":[76],"boosting,":[77],"classic":[79],"machine":[80],"learning":[81,88],"technique":[82],"leverages":[84],"very":[86],"weak":[87,161],"algorithm":[89],"produce":[91],"arbitrarily":[93],"good":[94],"classifier.":[95],"The":[96],"approach":[97],"analyze":[99],"subsumes":[100],"recently":[102],"proposed":[103],"methods":[104],"for":[105,126],"LLMs":[107],"on":[108,117,143],"data,":[110],"thus":[112],"our":[113,134],"sheds":[115],"light":[116],"why":[118],"they":[119],"are":[120],"successful,":[121],"suggests":[124],"opportunities":[125],"future":[127],"improvement.":[128],"We":[129],"present":[130],"experiments":[131],"validate":[133],"theory,":[135],"show":[137],"dynamically":[139],"focusing":[140],"labeling":[141],"resources":[142],"the":[144,151,157,160],"most":[145],"challenging":[146],"examples":[147],"--":[148,163],"same":[152],"way":[153],"boosting":[155],"focuses":[156],"efforts":[158],"of":[159],"learner":[162],"leads":[164],"improved":[166],"performance.":[167]},"counts_by_year":[],"updated_date":"2026-03-11T14:59:36.786465","created_date":"2025-10-10T00:00:00"}
