{"id":"https://openalex.org/W4403572656","doi":"https://doi.org/10.48550/arxiv.2410.09982","title":"Self-Data Distillation for Recovering Quality in Pruned Large Language Models","display_name":"Self-Data Distillation for Recovering Quality in Pruned Large Language Models","publication_year":2024,"publication_date":"2024-10-13","ids":{"openalex":"https://openalex.org/W4403572656","doi":"https://doi.org/10.48550/arxiv.2410.09982"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2410.09982","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2410.09982","pdf_url":"https://arxiv.org/pdf/2410.09982","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2410.09982","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5114335807","display_name":"Vithursan Thangarasa","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Thangarasa, Vithursan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001820563","display_name":"G. Venkatesh","orcid":"https://orcid.org/0000-0003-3347-7262"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Venkatesh, Ganesh","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Lasby, Mike","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lasby, Mike","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5034937688","display_name":"Nihal Sinnadurai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sinnadurai, Nish","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5001174692","display_name":"Sean Lie","orcid":"https://orcid.org/0000-0001-6074-4640"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lie, Sean","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5114335807"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.800000011920929,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.800000011920929,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.7989000082015991,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/distillation","display_name":"Distillation","score":0.7327655553817749},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.6281338930130005},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.47785302996635437},{"id":"https://openalex.org/keywords/process-engineering","display_name":"Process engineering","score":0.33283179998397827},{"id":"https://openalex.org/keywords/chemistry","display_name":"Chemistry","score":0.2664456367492676},{"id":"https://openalex.org/keywords/chromatography","display_name":"Chromatography","score":0.2314348816871643},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.09449112415313721},{"id":"https://openalex.org/keywords/physics","display_name":"Physics","score":0.05217057466506958}],"concepts":[{"id":"https://openalex.org/C204030448","wikidata":"https://www.wikidata.org/wiki/Q101017","display_name":"Distillation","level":2,"score":0.7327655553817749},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.6281338930130005},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.47785302996635437},{"id":"https://openalex.org/C21880701","wikidata":"https://www.wikidata.org/wiki/Q2144042","display_name":"Process engineering","level":1,"score":0.33283179998397827},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.2664456367492676},{"id":"https://openalex.org/C43617362","wikidata":"https://www.wikidata.org/wiki/Q170050","display_name":"Chromatography","level":1,"score":0.2314348816871643},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.09449112415313721},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.05217057466506958},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2410.09982","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2410.09982","pdf_url":"https://arxiv.org/pdf/2410.09982","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2410.09982","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2410.09982","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2410.09982","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2410.09982","pdf_url":"https://arxiv.org/pdf/2410.09982","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":false},"content_urls":{"pdf":"https://content.openalex.org/works/W4403572656.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W3085764877","https://openalex.org/W2514414740","https://openalex.org/W2377414158","https://openalex.org/W3199615306","https://openalex.org/W77207468","https://openalex.org/W3212781313","https://openalex.org/W4307725381","https://openalex.org/W124863575","https://openalex.org/W3203147184","https://openalex.org/W2037691954"],"abstract_inverted_index":{"Large":[0],"language":[1,9],"models":[2,21,217,229],"have":[3],"driven":[4],"significant":[5,57],"progress":[6],"in":[7,56,61,230,241],"natural":[8],"processing,":[10],"but":[11,76],"their":[12],"deployment":[13],"requires":[14],"substantial":[15],"compute":[16],"and":[17,97,135],"memory":[18],"resources.":[19],"As":[20],"scale,":[22],"compression":[23],"techniques":[24],"become":[25],"essential":[26,100],"for":[27,48],"balancing":[28],"model":[29,125,186,219],"quality":[30,58,223],"with":[31,142,205],"computational":[32],"efficiency.":[33],"Structured":[34],"pruning,":[35],"which":[36],"removes":[37],"less":[38],"critical":[39],"components":[40],"of":[41,197],"the":[42,85,92,103,122,143,165,185,198],"model,":[43],"is":[44,73,99],"a":[45,128],"promising":[46],"strategy":[47],"reducing":[49,184,208],"complexity.":[50],"However,":[51],"one-shot":[52],"pruning":[53,96,172],"often":[54],"results":[55],"degradation,":[59],"particularly":[60],"tasks":[62],"requiring":[63],"multi-step":[64],"reasoning.":[65],"To":[66],"recover":[67],"lost":[68],"quality,":[69],"supervised":[70],"fine-tuning":[71,114],"(SFT)":[72],"commonly":[74],"applied,":[75],"it":[77],"can":[78],"lead":[79],"to":[80,101,115,126,162,181,190,203],"catastrophic":[81,137],"forgetting":[82,138],"by":[83,139,160,211],"shifting":[84],"model's":[86,105,145,200],"learned":[87],"data":[88],"distribution.":[89],"Therefore,":[90],"addressing":[91],"degradation":[93],"from":[94,188],"both":[95],"SFT":[98],"preserve":[102],"original":[104,199],"quality.":[106],"In":[107],"this":[108],"work,":[109],"we":[110,148],"utilize":[111],"self-data":[112,151,215],"distilled":[113,129,216],"address":[116],"these":[117,227],"challenges.":[118],"Our":[119],"approach":[120],"leverages":[121],"original,":[123],"unpruned":[124],"generate":[127],"dataset":[130],"that":[131,150],"preserves":[132],"semantic":[133],"richness":[134],"mitigates":[136],"maintaining":[140],"alignment":[141],"base":[144],"knowledge.":[146],"Empirically,":[147],"demonstrate":[149],"distillation":[152],"consistently":[153],"outperforms":[154],"standard":[155],"SFT,":[156,206],"improving":[157,238],"average":[158],"accuracy":[159,201],"up":[161],"8%":[163],"on":[164,176],"HuggingFace":[166],"OpenLLM":[167],"Leaderboard":[168],"v1.":[169],"Specifically,":[170],"when":[171],"six":[173],"decoder":[174],"blocks":[175],"Llama3.1-8B":[177],"Instruct":[178],"(i.e.,":[179],"32":[180],"26":[182],"layers,":[183],"size":[187],"8.03B":[189],"6.72B":[191],"parameters),":[192],"our":[193],"method":[194],"retains":[195],"91.2%":[196],"compared":[202],"81.7%":[204],"while":[207],"real-world":[209],"FLOPs":[210],"16.3%.":[212],"Furthermore,":[213],"combining":[214],"through":[218],"merging":[220],"yields":[221],"enhanced":[222],"retention.":[224],"Additionally,":[225],"leveraging":[226],"pruned":[228],"speculative":[231],"decoding":[232],"increases":[233],"token":[234],"acceptance":[235],"rates,":[236],"thereby":[237],"inference":[239],"efficiency":[240],"applied":[242],"settings.":[243]},"counts_by_year":[],"updated_date":"2026-03-25T23:56:10.502304","created_date":"2025-10-10T00:00:00"}
