{"id":"https://openalex.org/W4401864276","doi":"https://doi.org/10.1145/3637528.3671851","title":"Knowledge Distillation with Perturbed Loss: From a Vanilla Teacher to a Proxy Teacher","display_name":"Knowledge Distillation with Perturbed Loss: From a Vanilla Teacher to a Proxy Teacher","publication_year":2024,"publication_date":"2024-08-24","ids":{"openalex":"https://openalex.org/W4401864276","doi":"https://doi.org/10.1145/3637528.3671851"},"language":"en","primary_location":{"id":"doi:10.1145/3637528.3671851","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3637528.3671851","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3637528.3671851","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 30th ACM SIGKDD Conference on Knowledge Discovery and Data Mining","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3637528.3671851","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5089455757","display_name":"Rongzhi Zhang","orcid":"https://orcid.org/0000-0002-7136-7913"},"institutions":[{"id":"https://openalex.org/I130701444","display_name":"Georgia Institute of Technology","ror":"https://ror.org/01zkghx44","country_code":"US","type":"education","lineage":["https://openalex.org/I130701444"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Rongzhi Zhang","raw_affiliation_strings":["Georgia Institute of Technology, Atlanta, GA, USA"],"affiliations":[{"raw_affiliation_string":"Georgia Institute of Technology, Atlanta, GA, USA","institution_ids":["https://openalex.org/I130701444"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5041327449","display_name":"Jiaming Shen","orcid":"https://orcid.org/0000-0002-0467-4956"},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jiaming Shen","raw_affiliation_strings":["Google, New York City, NY, USA"],"affiliations":[{"raw_affiliation_string":"Google, New York City, NY, USA","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5067433407","display_name":"Tianqi Liu","orcid":"https://orcid.org/0000-0003-4497-3317"},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Tianqi Liu","raw_affiliation_strings":["Google, New York City, NY, USA"],"affiliations":[{"raw_affiliation_string":"Google, New York City, NY, USA","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5053476114","display_name":"Jialu Liu","orcid":"https://orcid.org/0000-0002-8721-8656"},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jialu Liu","raw_affiliation_strings":["Google, New York City, NY, USA"],"affiliations":[{"raw_affiliation_string":"Google, New York City, NY, USA","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5032248436","display_name":"Michael Bendersky","orcid":"https://orcid.org/0000-0002-2941-6240"},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Michael Bendersky","raw_affiliation_strings":["Google, Mountain View, CA, USA","Google, New York City, NY, USA"],"affiliations":[{"raw_affiliation_string":"Google, Mountain View, CA, USA","institution_ids":["https://openalex.org/I1291425158"]},{"raw_affiliation_string":"Google, New York City, NY, USA","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5037200145","display_name":"Marc Najork","orcid":"https://orcid.org/0000-0003-1423-0854"},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Marc Najork","raw_affiliation_strings":["Google, Mountain View, CA, USA","Google, New York City, NY, USA"],"affiliations":[{"raw_affiliation_string":"Google, Mountain View, CA, USA","institution_ids":["https://openalex.org/I1291425158"]},{"raw_affiliation_string":"Google, New York City, NY, USA","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100460272","display_name":"Chao Zhang","orcid":"https://orcid.org/0000-0003-3009-598X"},"institutions":[{"id":"https://openalex.org/I130701444","display_name":"Georgia Institute of Technology","ror":"https://ror.org/01zkghx44","country_code":"US","type":"education","lineage":["https://openalex.org/I130701444"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Chao Zhang","raw_affiliation_strings":["Georgia Institute of Technology, Atlanta, GA, USA"],"affiliations":[{"raw_affiliation_string":"Georgia Institute of Technology, Atlanta, GA, USA","institution_ids":["https://openalex.org/I130701444"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5089455757"],"corresponding_institution_ids":["https://openalex.org/I130701444"],"apc_list":null,"apc_paid":null,"fwci":1.0425,"has_fulltext":false,"cited_by_count":3,"citation_normalized_percentile":{"value":0.80661486,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"4278","last_page":"4289"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12676","display_name":"Machine Learning and ELM","score":0.9965000152587891,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9961000084877014,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/proxy","display_name":"Proxy (statistics)","score":0.6295493245124817},{"id":"https://openalex.org/keywords/closeness","display_name":"Closeness","score":0.614752471446991},{"id":"https://openalex.org/keywords/generalizability-theory","display_name":"Generalizability theory","score":0.5492071509361267},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.4965398907661438},{"id":"https://openalex.org/keywords/divergence","display_name":"Divergence (linguistics)","score":0.45012664794921875},{"id":"https://openalex.org/keywords/distillation","display_name":"Distillation","score":0.4259020686149597},{"id":"https://openalex.org/keywords/distribution","display_name":"Distribution (mathematics)","score":0.4195692241191864},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.32793331146240234},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.25579458475112915},{"id":"https://openalex.org/keywords/statistics","display_name":"Statistics","score":0.16556423902511597},{"id":"https://openalex.org/keywords/mathematical-analysis","display_name":"Mathematical analysis","score":0.09478774666786194}],"concepts":[{"id":"https://openalex.org/C2780148112","wikidata":"https://www.wikidata.org/wiki/Q1432581","display_name":"Proxy (statistics)","level":2,"score":0.6295493245124817},{"id":"https://openalex.org/C2779545769","wikidata":"https://www.wikidata.org/wiki/Q5135364","display_name":"Closeness","level":2,"score":0.614752471446991},{"id":"https://openalex.org/C27158222","wikidata":"https://www.wikidata.org/wiki/Q5532422","display_name":"Generalizability theory","level":2,"score":0.5492071509361267},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.4965398907661438},{"id":"https://openalex.org/C207390915","wikidata":"https://www.wikidata.org/wiki/Q1230525","display_name":"Divergence (linguistics)","level":2,"score":0.45012664794921875},{"id":"https://openalex.org/C204030448","wikidata":"https://www.wikidata.org/wiki/Q101017","display_name":"Distillation","level":2,"score":0.4259020686149597},{"id":"https://openalex.org/C110121322","wikidata":"https://www.wikidata.org/wiki/Q865811","display_name":"Distribution (mathematics)","level":2,"score":0.4195692241191864},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.32793331146240234},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.25579458475112915},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.16556423902511597},{"id":"https://openalex.org/C134306372","wikidata":"https://www.wikidata.org/wiki/Q7754","display_name":"Mathematical analysis","level":1,"score":0.09478774666786194},{"id":"https://openalex.org/C178790620","wikidata":"https://www.wikidata.org/wiki/Q11351","display_name":"Organic chemistry","level":1,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3637528.3671851","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3637528.3671851","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3637528.3671851","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 30th ACM SIGKDD Conference on Knowledge Discovery and Data Mining","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3637528.3671851","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3637528.3671851","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3637528.3671851","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 30th ACM SIGKDD Conference on Knowledge Discovery and Data Mining","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":false},"content_urls":{"pdf":"https://content.openalex.org/works/W4401864276.pdf"},"referenced_works_count":15,"referenced_works":["https://openalex.org/W2183341477","https://openalex.org/W2294370754","https://openalex.org/W2963351448","https://openalex.org/W2981852735","https://openalex.org/W3034756453","https://openalex.org/W3094460838","https://openalex.org/W3152607317","https://openalex.org/W3174427901","https://openalex.org/W3177196641","https://openalex.org/W3212341733","https://openalex.org/W4226426325","https://openalex.org/W4285202066","https://openalex.org/W4287121309","https://openalex.org/W4288089799","https://openalex.org/W6796782416"],"related_works":["https://openalex.org/W2118717649","https://openalex.org/W2413243053","https://openalex.org/W410723623","https://openalex.org/W2015341305","https://openalex.org/W2035068594","https://openalex.org/W4225593417","https://openalex.org/W2573498121","https://openalex.org/W3022298670","https://openalex.org/W3160494304","https://openalex.org/W3006162251"],"abstract_inverted_index":{"Knowledge":[0],"distillation":[1,93,102],"is":[2,51],"a":[3,10,15,48,56,90,106,127,131,163],"popular":[4],"technique":[5],"to":[6,14,23,73,82,134,156],"transfer":[7],"knowledge":[8,92],"from":[9],"large":[11],"teacher":[12,26,78,125,129],"model":[13,151],"small":[16],"student":[17,21,72,150],"model.":[18],"Typically,":[19],"the":[20,25,29,37,59,64,71,76,99,112,123,135,141,149,158,174],"learns":[22],"imitate":[24,75],"by":[27,96],"minimizing":[28],"KL":[30],"divergence":[31],"of":[32,176,180],"its":[33],"output":[34,39,61,79],"distribution":[35,62,80,132],"with":[36,130,178],"teacher's":[38,60],"distribution.":[40,68,138],"In":[41],"this":[42,86,116,145],"work,":[43],"we":[44,88],"argue":[45],"that":[46],"such":[47],"learning":[49],"objective":[50,94],"sub-optimal":[52],"because":[53],"there":[54],"exists":[55],"discrepancy":[57],"between":[58,144],"and":[63,109,148],"ground":[65,136],"truth":[66,137],"label":[67],"Therefore,":[69],"forcing":[70],"blindly":[74],"unreliable":[77],"leads":[81],"inferior":[83],"performance.":[84],"To":[85],"end,":[87],"propose":[89],"novel":[91],"PTLoss":[95,177],"first":[97],"representing":[98],"vanilla":[100],"KL-based":[101],"loss":[103,120],"function":[104],"via":[105],"Maclaurin":[107],"series":[108],"then":[110],"perturbing":[111],"leading-order":[113],"terms":[114],"in":[115,162],"series.":[117],"This":[118],"perturbed":[119],"implicitly":[121],"transforms":[122],"original":[124],"into":[126],"proxy":[128],"closer":[133],"We":[139],"establish":[140],"theoretical":[142],"connection":[143],"\"distribution":[146],"closeness''":[147],"generalizability,":[152],"which":[153],"enables":[154],"us":[155],"select":[157],"PTLoss's":[159],"perturbation":[160],"coefficients":[161],"principled":[164],"way.":[165],"Extensive":[166],"experiments":[167],"on":[168],"six":[169],"public":[170],"benchmark":[171],"datasets":[172],"demonstrate":[173],"effectiveness":[175],"teachers":[179],"different":[181],"scales.":[182]},"counts_by_year":[{"year":2025,"cited_by_count":3}],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-10T00:00:00"}
