{"id":"https://openalex.org/W4406072150","doi":"https://doi.org/10.48550/arxiv.2407.14679","title":"Compact Language Models via Pruning and Knowledge Distillation","display_name":"Compact Language Models via Pruning and Knowledge Distillation","publication_year":2024,"publication_date":"2024-07-19","ids":{"openalex":"https://openalex.org/W4406072150","doi":"https://doi.org/10.48550/arxiv.2407.14679"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2407.14679","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2407.14679","pdf_url":"https://arxiv.org/pdf/2407.14679","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2407.14679","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5099241641","display_name":"Saurav Muralidharan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Muralidharan, Saurav","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5084832562","display_name":"Sharath Turuvekere Sreenivas","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sreenivas, Sharath Turuvekere","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5009725385","display_name":"Raviraj Joshi","orcid":"https://orcid.org/0000-0003-1892-1812"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Joshi, Raviraj","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5107404727","display_name":"Marcin Chochowski","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chochowski, Marcin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5031170568","display_name":"Mostofa Patwary","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Patwary, Mostofa","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5072436307","display_name":"Mohammad Shoeybi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shoeybi, Mohammad","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5066242985","display_name":"Bryan Catanzaro","orcid":"https://orcid.org/0000-0003-0034-7728"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Catanzaro, Bryan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5056503617","display_name":"Jan Kautz","orcid":"https://orcid.org/0000-0002-8830-429X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kautz, Jan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5066945976","display_name":"Pavlo Molchanov","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Molchanov, Pavlo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":10,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.7242000102996826,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.7242000102996826,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/pruning","display_name":"Pruning","score":0.695847749710083},{"id":"https://openalex.org/keywords/distillation","display_name":"Distillation","score":0.677491307258606},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5117530822753906},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.48205599188804626},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.47822827100753784},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.33844494819641113},{"id":"https://openalex.org/keywords/chemistry","display_name":"Chemistry","score":0.2495383322238922},{"id":"https://openalex.org/keywords/chromatography","display_name":"Chromatography","score":0.16281360387802124},{"id":"https://openalex.org/keywords/botany","display_name":"Botany","score":0.12471258640289307},{"id":"https://openalex.org/keywords/biology","display_name":"Biology","score":0.1157296895980835}],"concepts":[{"id":"https://openalex.org/C108010975","wikidata":"https://www.wikidata.org/wiki/Q500094","display_name":"Pruning","level":2,"score":0.695847749710083},{"id":"https://openalex.org/C204030448","wikidata":"https://www.wikidata.org/wiki/Q101017","display_name":"Distillation","level":2,"score":0.677491307258606},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5117530822753906},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.48205599188804626},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.47822827100753784},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.33844494819641113},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.2495383322238922},{"id":"https://openalex.org/C43617362","wikidata":"https://www.wikidata.org/wiki/Q170050","display_name":"Chromatography","level":1,"score":0.16281360387802124},{"id":"https://openalex.org/C59822182","wikidata":"https://www.wikidata.org/wiki/Q441","display_name":"Botany","level":1,"score":0.12471258640289307},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.1157296895980835}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2407.14679","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2407.14679","pdf_url":"https://arxiv.org/pdf/2407.14679","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2407.14679","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2407.14679","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2407.14679","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2407.14679","pdf_url":"https://arxiv.org/pdf/2407.14679","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":false},"content_urls":{"pdf":"https://content.openalex.org/works/W4406072150.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W1979597421","https://openalex.org/W2007980826","https://openalex.org/W2061531152","https://openalex.org/W3002753104","https://openalex.org/W2077600819","https://openalex.org/W2142036596","https://openalex.org/W2072657027","https://openalex.org/W2962838298","https://openalex.org/W3204019825"],"abstract_inverted_index":{"Large":[0],"language":[1,142],"models":[2,137,149,192,212],"(LLMs)":[3],"targeting":[4],"different":[5],"deployment":[6],"scales":[7],"and":[8,33,64,76,106,131,147,189,219,222],"sizes":[9],"are":[10],"currently":[11],"produced":[12],"by":[13,126],"training":[14,44,164,170,182,204],"each":[15,98],"variant":[16],"from":[17,150,171,205,227],"scratch;":[18,172],"this":[19,24,56,117,173],"is":[20],"extremely":[21],"compute-intensive.":[22],"In":[23],"paper,":[25],"we":[26,58,83],"investigate":[27],"if":[28],"pruning":[29,78,95],"an":[30,151],"existing":[31],"LLM":[32],"then":[34],"re-training":[35],"it":[36],"with":[37,79,238],"a":[38,48,60,90,127,139,196],"fraction":[39],"(&lt;3%)":[40],"of":[41,62,94,124,129,141,179],"the":[42,121,183,228],"original":[43],"data":[45],"can":[46],"be":[47],"suitable":[49],"alternative":[50],"to":[51,101,119,135,161,169,195,203,209],"repeated,":[52],"full":[53,184],"retraining.":[54],"To":[55],"end,":[57],"develop":[59],"set":[61],"practical":[63],"effective":[65],"compression":[66,225],"best":[67,87],"practices":[68,88],"for":[69,97,109,181],"LLMs":[70,125],"that":[71],"combine":[72,102],"depth,":[73],"width,":[74],"attention":[75],"MLP":[77],"knowledge":[80],"distillation-based":[81],"retraining;":[82],"arrive":[84],"at":[85,111],"these":[86],"through":[89],"detailed":[91],"empirical":[92],"exploration":[93],"strategies":[96],"axis,":[99],"methods":[100],"axes,":[103],"distillation":[104],"strategies,":[105],"search":[107],"techniques":[108,226],"arriving":[110],"optimal":[112],"compressed":[113],"architectures.":[114],"We":[115,230],"use":[116],"guide":[118],"compress":[120],"Nemotron-4":[122],"family":[123,186],"factor":[128],"2-4x,":[130],"compare":[132],"their":[133],"performance":[134],"similarly-sized":[136],"on":[138,236,246],"variety":[140],"modeling":[143],"tasks.":[144],"Deriving":[145],"8B":[146],"4B":[148],"already":[152],"pretrained":[153],"15B":[154],"model":[155,167,185,234],"using":[156],"our":[157],"approach":[158],"requires":[159],"up":[160,194],"40x":[162],"fewer":[163],"tokens":[165],"per":[166],"compared":[168,202],"results":[174],"in":[175,199],"compute":[176],"cost":[177],"savings":[178],"1.8x":[180],"(15B,":[187],"8B,":[188,221],"4B).":[190],"Minitron":[191,233],"exhibit":[193],"16%":[197],"improvement":[198],"MMLU":[200],"scores":[201],"scratch,":[206],"perform":[207],"comparably":[208],"other":[210],"community":[211],"such":[213],"as":[214],"Mistral":[215],"7B,":[216],"Gemma":[217],"7B":[218],"Llama-3":[220],"outperform":[223],"state-of-the-art":[224],"literature.":[229],"have":[231],"open-sourced":[232],"weights":[235],"Huggingface,":[237],"corresponding":[239],"supplementary":[240],"material":[241],"including":[242],"example":[243],"code":[244],"available":[245],"GitHub.":[247]},"counts_by_year":[{"year":2025,"cited_by_count":9},{"year":2024,"cited_by_count":1}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
