{"id":"https://openalex.org/W3202666514","doi":"https://doi.org/10.1109/isocc53507.2021.9613933","title":"Layer-wise Pruning of Transformer Attention Heads for Efficient Language Modeling","display_name":"Layer-wise Pruning of Transformer Attention Heads for Efficient Language Modeling","publication_year":2021,"publication_date":"2021-10-06","ids":{"openalex":"https://openalex.org/W3202666514","doi":"https://doi.org/10.1109/isocc53507.2021.9613933","mag":"3202666514"},"language":"en","primary_location":{"id":"doi:10.1109/isocc53507.2021.9613933","is_oa":false,"landing_page_url":"https://doi.org/10.1109/isocc53507.2021.9613933","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 18th International SoC Design Conference (ISOCC)","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["arxiv","crossref","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2110.03252","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5064051041","display_name":"Kyuhong Shim","orcid":"https://orcid.org/0000-0002-0123-3100"},"institutions":[{"id":"https://openalex.org/I139264467","display_name":"Seoul National University","ror":"https://ror.org/04h9pn542","country_code":"KR","type":"education","lineage":["https://openalex.org/I139264467"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Kyuhong Shim","raw_affiliation_strings":["Department of Electrical and Computer Engineering, Seoul National University, Seoul, South Korea","Seoul National University, Seoul, South Korea"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Electrical and Computer Engineering, Seoul National University, Seoul, South Korea","institution_ids":["https://openalex.org/I139264467"]},{"raw_affiliation_string":"Seoul National University, Seoul, South Korea","institution_ids":["https://openalex.org/I139264467"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113822797","display_name":"Iksoo Choi","orcid":null},"institutions":[{"id":"https://openalex.org/I10654025","display_name":"SK Group (United States)","ror":"https://ror.org/00qajw440","country_code":"US","type":"company","lineage":["https://openalex.org/I10654025","https://openalex.org/I134353371"]},{"id":"https://openalex.org/I139264467","display_name":"Seoul National University","ror":"https://ror.org/04h9pn542","country_code":"KR","type":"education","lineage":["https://openalex.org/I139264467"]}],"countries":["KR","US"],"is_corresponding":false,"raw_author_name":"Iksoo Choi","raw_affiliation_strings":["Department of Electrical and Computer Engineering, Seoul National University, Seoul, South Korea","SK Group (United States), San Jose, United States"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Electrical and Computer Engineering, Seoul National University, Seoul, South Korea","institution_ids":["https://openalex.org/I139264467"]},{"raw_affiliation_string":"SK Group (United States), San Jose, United States","institution_ids":["https://openalex.org/I10654025"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113491293","display_name":"Wonyong Sung","orcid":null},"institutions":[{"id":"https://openalex.org/I139264467","display_name":"Seoul National University","ror":"https://ror.org/04h9pn542","country_code":"KR","type":"education","lineage":["https://openalex.org/I139264467"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Wonyong Sung","raw_affiliation_strings":["Department of Electrical and Computer Engineering, Seoul National University, Seoul, South Korea","Seoul National University, Seoul, South Korea"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Electrical and Computer Engineering, Seoul National University, Seoul, South Korea","institution_ids":["https://openalex.org/I139264467"]},{"raw_affiliation_string":"Seoul National University, Seoul, South Korea","institution_ids":["https://openalex.org/I139264467"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5078440061","display_name":"Jungwook Choi","orcid":"https://orcid.org/0000-0002-3075-8694"},"institutions":[{"id":"https://openalex.org/I4575257","display_name":"Hanyang University","ror":"https://ror.org/046865y68","country_code":"KR","type":"education","lineage":["https://openalex.org/I4575257"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Jungwook Choi","raw_affiliation_strings":["Department of Electrical Engineering, Hanyang University, Seoul, South Korea","Hanyang University, Seoul, South Korea"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Electrical Engineering, Hanyang University, Seoul, South Korea","institution_ids":["https://openalex.org/I4575257"]},{"raw_affiliation_string":"Hanyang University, Seoul, South Korea","institution_ids":["https://openalex.org/I4575257"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.1362,"has_fulltext":true,"cited_by_count":1,"citation_normalized_percentile":{"value":0.5494914,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":94},"biblio":{"volume":null,"issue":null,"first_page":"357","last_page":"358"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9988999962806702,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9976999759674072,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/perplexity","display_name":"Perplexity","score":0.9069507718086243},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.8071925640106201},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.7684438228607178},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7633979916572571},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.6851527690887451},{"id":"https://openalex.org/keywords/pruning","display_name":"Pruning","score":0.5954560041427612},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.43660303950309753},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4323994815349579},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.3518337309360504},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.26249921321868896},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.07265442609786987}],"concepts":[{"id":"https://openalex.org/C100279451","wikidata":"https://www.wikidata.org/wiki/Q372193","display_name":"Perplexity","level":3,"score":0.9069507718086243},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.8071925640106201},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.7684438228607178},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7633979916572571},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.6851527690887451},{"id":"https://openalex.org/C108010975","wikidata":"https://www.wikidata.org/wiki/Q500094","display_name":"Pruning","level":2,"score":0.5954560041427612},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.43660303950309753},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4323994815349579},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3518337309360504},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.26249921321868896},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.07265442609786987},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.0},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C13280743","wikidata":"https://www.wikidata.org/wiki/Q131089","display_name":"Geodesy","level":1,"score":0.0},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.0},{"id":"https://openalex.org/C6557445","wikidata":"https://www.wikidata.org/wiki/Q173113","display_name":"Agronomy","level":1,"score":0.0}],"mesh":[],"locations_count":4,"locations":[{"id":"doi:10.1109/isocc53507.2021.9613933","is_oa":false,"landing_page_url":"https://doi.org/10.1109/isocc53507.2021.9613933","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 18th International SoC Design Conference (ISOCC)","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2110.03252","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2110.03252","pdf_url":"https://arxiv.org/pdf/2110.03252","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"mag:3202666514","is_oa":true,"landing_page_url":"https://arxiv.org/pdf/2110.03252.pdf","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"arXiv (Cornell University)","raw_type":null},{"id":"doi:10.48550/arxiv.2110.03252","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2110.03252","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"Preprint"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2110.03252","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2110.03252","pdf_url":"https://arxiv.org/pdf/2110.03252","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.7799999713897705}],"awards":[],"funders":[{"id":"https://openalex.org/F4320320671","display_name":"National Research Foundation","ror":"https://ror.org/05s0g1g46"}],"has_content":{"pdf":true,"grobid_xml":false},"content_urls":{"pdf":"https://content.openalex.org/works/W3202666514.pdf"},"referenced_works_count":24,"referenced_works":["https://openalex.org/W2525332836","https://openalex.org/W2946567085","https://openalex.org/W2946794439","https://openalex.org/W2955227499","https://openalex.org/W2963341956","https://openalex.org/W2963403868","https://openalex.org/W2963828549","https://openalex.org/W2964110616","https://openalex.org/W2970565456","https://openalex.org/W2972324944","https://openalex.org/W2994721156","https://openalex.org/W3023489204","https://openalex.org/W3030163527","https://openalex.org/W3098985395","https://openalex.org/W3113057009","https://openalex.org/W3159727696","https://openalex.org/W3166574921","https://openalex.org/W3173787059","https://openalex.org/W3197737164","https://openalex.org/W6727099177","https://openalex.org/W6762287338","https://openalex.org/W6762945437","https://openalex.org/W6765264507","https://openalex.org/W6768742326"],"related_works":["https://openalex.org/W3211326523","https://openalex.org/W3170035135","https://openalex.org/W3098411449","https://openalex.org/W2808568129","https://openalex.org/W2950837708","https://openalex.org/W3150079761","https://openalex.org/W3122378989","https://openalex.org/W3197737164","https://openalex.org/W3196256595","https://openalex.org/W2899067310","https://openalex.org/W3139123974","https://openalex.org/W3206247281","https://openalex.org/W3049454714","https://openalex.org/W3159287413","https://openalex.org/W3128873951","https://openalex.org/W3186405819","https://openalex.org/W3173285071","https://openalex.org/W3013843428","https://openalex.org/W3097185144","https://openalex.org/W3008851394"],"abstract_inverted_index":{"Recently,":[0],"the":[1,44,68,73,84],"necessity":[2],"of":[3,75,86,105],"multiple":[4],"attention":[5,35,39,58,97],"heads":[6,17,36],"in":[7,37,67],"transformer":[8],"architecture":[9],"has":[10],"been":[11],"questioned":[12],"[1].":[13],"Removing":[14],"less":[15],"important":[16],"from":[18],"a":[19,23,101,108],"large":[20],"network":[21],"is":[22],"promising":[24],"strategy":[25],"to":[26,72],"reduce":[27,43],"computation":[28,69],"cost":[29,85],"and":[30],"parameters.":[31],"However,":[32],"pruning":[33,60,87,99],"out":[34],"multihead":[38],"does":[40],"not":[41,51],"evenly":[42],"overall":[45],"load,":[46],"because":[47],"feedforward":[48],"modules":[49],"are":[50,70],"affected.":[52],"In":[53],"this":[54],"study,":[55],"we":[56,90],"apply":[57],"head":[59,98],"on":[61],"All-attention":[62],"[2]":[63],"transformer,":[64],"where":[65],"savings":[66],"proportional":[71],"number":[74,104],"pruned":[76],"heads.":[77],"This":[78],"improved":[79],"computing":[80],"efficiency":[81],"comes":[82],"at":[83],"sensitivity,":[88],"which":[89],"stabilize":[91],"with":[92,107],"three":[93],"training":[94],"techniques.":[95],"Our":[96],"enables":[100],"considerably":[102],"fewer":[103],"parameters":[106],"comparable":[109],"perplexity":[110],"for":[111],"transformer-based":[112],"language":[113],"modeling.":[114]},"counts_by_year":[{"year":2024,"cited_by_count":1}],"updated_date":"2026-07-03T08:13:44.112507","created_date":"2025-10-10T00:00:00"}
