{"id":"https://openalex.org/W7131387397","doi":"https://doi.org/10.48550/arxiv.2602.20816","title":"Don't Ignore the Tail: Decoupling top-K Probabilities for Efficient Language Model Distillation","display_name":"Don't Ignore the Tail: Decoupling top-K Probabilities for Efficient Language Model Distillation","publication_year":2026,"publication_date":"2026-02-24","ids":{"openalex":"https://openalex.org/W7131387397","doi":"https://doi.org/10.48550/arxiv.2602.20816"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.20816","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5003842642","display_name":"Sayantan Dasgupta","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Dasgupta, Sayantan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120251487","display_name":"Trevor Cohn","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cohn, Trevor","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5126831170","display_name":"Timothy Baldwin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Baldwin, Timothy","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5003842642"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.1859000027179718,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.1859000027179718,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.11569999903440475,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.07209999859333038,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/distillation","display_name":"Distillation","score":0.8216000199317932},{"id":"https://openalex.org/keywords/divergence","display_name":"Divergence (linguistics)","score":0.715499997138977},{"id":"https://openalex.org/keywords/decoupling","display_name":"Decoupling (probability)","score":0.6207000017166138},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.5026999711990356},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.4794999957084656},{"id":"https://openalex.org/keywords/multiple-effect-distillation","display_name":"Multiple-effect distillation","score":0.3199000060558319}],"concepts":[{"id":"https://openalex.org/C204030448","wikidata":"https://www.wikidata.org/wiki/Q101017","display_name":"Distillation","level":2,"score":0.8216000199317932},{"id":"https://openalex.org/C207390915","wikidata":"https://www.wikidata.org/wiki/Q1230525","display_name":"Divergence (linguistics)","level":2,"score":0.715499997138977},{"id":"https://openalex.org/C205606062","wikidata":"https://www.wikidata.org/wiki/Q5249645","display_name":"Decoupling (probability)","level":2,"score":0.6207000017166138},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.593500018119812},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.5026999711990356},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.4794999957084656},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3212999999523163},{"id":"https://openalex.org/C81637046","wikidata":"https://www.wikidata.org/wiki/Q3030722","display_name":"Multiple-effect distillation","level":4,"score":0.3199000060558319},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.319599986076355},{"id":"https://openalex.org/C2164484","wikidata":"https://www.wikidata.org/wiki/Q5170150","display_name":"Core (optical fiber)","level":2,"score":0.3084999918937683},{"id":"https://openalex.org/C126255220","wikidata":"https://www.wikidata.org/wiki/Q141495","display_name":"Mathematical optimization","level":1,"score":0.29190000891685486},{"id":"https://openalex.org/C48677424","wikidata":"https://www.wikidata.org/wiki/Q6888088","display_name":"Mode (computer interface)","level":2,"score":0.29010000824928284},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.28049999475479126},{"id":"https://openalex.org/C167928553","wikidata":"https://www.wikidata.org/wiki/Q1376021","display_name":"Estimation theory","level":2,"score":0.27070000767707825},{"id":"https://openalex.org/C171752962","wikidata":"https://www.wikidata.org/wiki/Q255166","display_name":"Kullback\u2013Leibler divergence","level":2,"score":0.26330000162124634}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.20816","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.20816","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.20816","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.20816","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"The":[0],"core":[1],"learning":[2],"signal":[3],"used":[4],"in":[5,119],"language":[6],"model":[7],"distillation":[8,114,124,133],"is":[9,135],"the":[10,16,29,33,37,42,52,63,66,79,84,91,94,100,103,106,132,150],"standard":[11],"Kullback-Leibler":[12],"(KL)":[13],"divergence":[14,23,60],"between":[15],"student":[17],"and":[18,122,137],"teacher":[19,67,95],"distributions.":[20],"Traditional":[21],"KL":[22,85],"tends":[24],"to":[25],"be":[26,139],"dominated":[27],"by":[28],"next":[30],"tokens":[31],"with":[32,141],"highest":[34],"probabilities,":[35],"i.e.,":[36],"teacher's":[38],"modes,":[39],"thereby":[40],"diminishing":[41],"influence":[43],"of":[44,51,65,74,93,102,105,125],"less":[45],"probable":[46],"yet":[47],"potentially":[48],"informative":[49],"components":[50],"output":[53],"distribution.":[54,107],"We":[55],"propose":[56],"a":[57,142],"new":[58],"tail-aware":[59],"that":[61,73,111],"decouples":[62],"contribution":[64,101],"model's":[68],"top-K":[69],"predicted":[70],"probabilities":[71],"from":[72],"lower-probability":[75],"predictions,":[76],"while":[77],"maintaining":[78],"same":[80],"computational":[81],"profile":[82],"as":[83],"Divergence.":[86],"Our":[87],"decoupled":[88],"approach":[89],"reduces":[90],"impact":[92],"modes":[96],"and,":[97],"consequently,":[98],"increases":[99],"tail":[104],"Experimental":[108],"results":[109],"demonstrate":[110],"our":[112],"modified":[113],"method":[115],"yields":[116],"competitive":[117],"performance":[118],"both":[120],"pre-training":[121],"supervised":[123],"decoder":[126],"models":[127],"across":[128],"various":[129],"datasets.":[130],"Furthermore,":[131],"process":[134],"efficient":[136],"can":[138],"performed":[140],"modest":[143],"academic":[144],"budget":[145],"for":[146,152],"large":[147],"datasets,":[148],"eliminating":[149],"need":[151],"industry-scale":[153],"computing.":[154]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-02-26T00:00:00"}
