{"id":"https://openalex.org/W4404263292","doi":"https://doi.org/10.3390/make6040126","title":"Diversifying Multi-Head Attention in the Transformer Model","display_name":"Diversifying Multi-Head Attention in the Transformer Model","publication_year":2024,"publication_date":"2024-11-12","ids":{"openalex":"https://openalex.org/W4404263292","doi":"https://doi.org/10.3390/make6040126"},"language":"en","primary_location":{"id":"doi:10.3390/make6040126","is_oa":true,"landing_page_url":"http://dx.doi.org/10.3390/make6040126","pdf_url":"https://www.mdpi.com/2504-4990/6/4/126/pdf?version=1731416888","source":{"id":"https://openalex.org/S4210213891","display_name":"Machine Learning and Knowledge Extraction","issn_l":"2504-4990","issn":["2504-4990"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310310987","host_organization_name":"Multidisciplinary Digital Publishing Institute","host_organization_lineage":["https://openalex.org/P4310310987"],"host_organization_lineage_names":["Multidisciplinary Digital Publishing Institute"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Machine Learning and Knowledge Extraction","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","doaj"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://www.mdpi.com/2504-4990/6/4/126/pdf?version=1731416888","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5034325768","display_name":"Nicholas Ampazis","orcid":"https://orcid.org/0000-0002-7238-4731"},"institutions":[{"id":"https://openalex.org/I98805295","display_name":"University of the Aegean","ror":"https://ror.org/03zsp3p94","country_code":"GR","type":"education","lineage":["https://openalex.org/I98805295"]}],"countries":["GR"],"is_corresponding":true,"raw_author_name":"Nicholas Ampazis","raw_affiliation_strings":["Department of Financial and Management Engineering, University of the Aegean, 82100 Chios, Greece"],"affiliations":[{"raw_affiliation_string":"Department of Financial and Management Engineering, University of the Aegean, 82100 Chios, Greece","institution_ids":["https://openalex.org/I98805295"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5088735164","display_name":"Flora Sakketou","orcid":"https://orcid.org/0000-0002-6737-1661"},"institutions":[{"id":"https://openalex.org/I98805295","display_name":"University of the Aegean","ror":"https://ror.org/03zsp3p94","country_code":"GR","type":"education","lineage":["https://openalex.org/I98805295"]}],"countries":["GR"],"is_corresponding":false,"raw_author_name":"Flora Sakketou","raw_affiliation_strings":["Department of Financial and Management Engineering, University of the Aegean, 82100 Chios, Greece"],"affiliations":[{"raw_affiliation_string":"Department of Financial and Management Engineering, University of the Aegean, 82100 Chios, Greece","institution_ids":["https://openalex.org/I98805295"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5034325768"],"corresponding_institution_ids":["https://openalex.org/I98805295"],"apc_list":{"value":1400,"currency":"CHF","value_usd":1515},"apc_paid":{"value":1400,"currency":"CHF","value_usd":1515},"fwci":1.4489,"has_fulltext":false,"cited_by_count":4,"citation_normalized_percentile":{"value":0.85421478,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":95,"max":100},"biblio":{"volume":"6","issue":"4","first_page":"2618","last_page":"2638"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10320","display_name":"Neural Networks and Applications","score":0.9921000003814697,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10320","display_name":"Neural Networks and Applications","score":0.9921000003814697,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9919999837875366,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11902","display_name":"Intelligent Tutoring Systems and Adaptive Learning","score":0.9714999794960022,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.528137743473053},{"id":"https://openalex.org/keywords/head","display_name":"Head (geology)","score":0.43986624479293823},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.4397839903831482},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.18729883432388306},{"id":"https://openalex.org/keywords/geology","display_name":"Geology","score":0.13066712021827698},{"id":"https://openalex.org/keywords/electrical-engineering","display_name":"Electrical engineering","score":0.12997448444366455},{"id":"https://openalex.org/keywords/voltage","display_name":"Voltage","score":0.06231537461280823}],"concepts":[{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.528137743473053},{"id":"https://openalex.org/C2780312720","wikidata":"https://www.wikidata.org/wiki/Q5689100","display_name":"Head (geology)","level":2,"score":0.43986624479293823},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.4397839903831482},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.18729883432388306},{"id":"https://openalex.org/C127313418","wikidata":"https://www.wikidata.org/wiki/Q1069","display_name":"Geology","level":0,"score":0.13066712021827698},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.12997448444366455},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.06231537461280823},{"id":"https://openalex.org/C114793014","wikidata":"https://www.wikidata.org/wiki/Q52109","display_name":"Geomorphology","level":1,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.3390/make6040126","is_oa":true,"landing_page_url":"http://dx.doi.org/10.3390/make6040126","pdf_url":"https://www.mdpi.com/2504-4990/6/4/126/pdf?version=1731416888","source":{"id":"https://openalex.org/S4210213891","display_name":"Machine Learning and Knowledge Extraction","issn_l":"2504-4990","issn":["2504-4990"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310310987","host_organization_name":"Multidisciplinary Digital Publishing Institute","host_organization_lineage":["https://openalex.org/P4310310987"],"host_organization_lineage_names":["Multidisciplinary Digital Publishing Institute"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Machine Learning and Knowledge Extraction","raw_type":"journal-article"},{"id":"pmh:oai:doaj.org/article:f4253eb2b4f448f88c2837138e97004d","is_oa":true,"landing_page_url":"https://doaj.org/article/f4253eb2b4f448f88c2837138e97004d","pdf_url":null,"source":{"id":"https://openalex.org/S112646816","display_name":"SHILAP Revista de lepidopterolog\u00eda","issn_l":"0300-5267","issn":["0300-5267","2340-4078"],"is_oa":true,"is_in_doaj":true,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Machine Learning and Knowledge Extraction, Vol 6, Iss 4, Pp 2618-2638 (2024)","raw_type":"article"},{"id":"pmh:oai:mdpi.com:/2504-4990/6/4/126/","is_oa":true,"landing_page_url":"https://doi.org/10.3390/make6040126","pdf_url":null,"source":{"id":"https://openalex.org/S4306400947","display_name":"MDPI (MDPI AG)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I4210097602","host_organization_name":"Multidisciplinary Digital Publishing Institute (Switzerland)","host_organization_lineage":["https://openalex.org/I4210097602"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Machine Learning and Knowledge Extraction","raw_type":"Text"}],"best_oa_location":{"id":"doi:10.3390/make6040126","is_oa":true,"landing_page_url":"http://dx.doi.org/10.3390/make6040126","pdf_url":"https://www.mdpi.com/2504-4990/6/4/126/pdf?version=1731416888","source":{"id":"https://openalex.org/S4210213891","display_name":"Machine Learning and Knowledge Extraction","issn_l":"2504-4990","issn":["2504-4990"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310310987","host_organization_name":"Multidisciplinary Digital Publishing Institute","host_organization_lineage":["https://openalex.org/P4310310987"],"host_organization_lineage_names":["Multidisciplinary Digital Publishing Institute"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Machine Learning and Knowledge Extraction","raw_type":"journal-article"},"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.6000000238418579}],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":false},"content_urls":{"pdf":"https://content.openalex.org/works/W4404263292.pdf"},"referenced_works_count":16,"referenced_works":["https://openalex.org/W5731987","https://openalex.org/W2053746978","https://openalex.org/W2131329059","https://openalex.org/W2150634210","https://openalex.org/W2150884987","https://openalex.org/W2427527485","https://openalex.org/W2487770199","https://openalex.org/W2888482885","https://openalex.org/W2912351236","https://openalex.org/W2946794439","https://openalex.org/W2963909453","https://openalex.org/W3034296505","https://openalex.org/W3106504817","https://openalex.org/W4283832803","https://openalex.org/W4285225959","https://openalex.org/W6954847198"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052"],"abstract_inverted_index":{"Recent":[0],"studies":[1],"have":[2],"shown":[3],"that,":[4],"due":[5],"to":[6,47,75,102,114],"redundancy,":[7],"some":[8],"heads":[9,53,63,117],"of":[10,21,61,95,105,147],"the":[11,19,22,42,51,55,62,76,83,88,96,103,106,108,116,145],"Transformer":[12,43,77,98,149],"model":[13],"can":[14,111],"be":[15,112],"pruned":[16],"without":[17],"diminishing":[18],"efficiency":[20],"model.":[23,99],"In":[24,100],"this":[25],"paper,":[26],"we":[27],"propose":[28],"a":[29,67],"constrained":[30],"optimization":[31],"algorithm":[32,89],"based":[33],"on":[34,123],"Hebbian":[35],"learning,":[36],"which":[37],"trains":[38],"specific":[39],"layers":[40],"in":[41,45,54,90],"architecture":[44,78],"order":[46],"enforce":[48],"diversification":[49,60,104],"between":[50],"different":[52,92],"multi-head":[56],"attention":[57],"module.":[58],"The":[59],"is":[64,73,80],"achieved":[65],"through":[66],"single-layer":[68],"feed-forward":[69],"neural":[70],"network":[71],"that":[72,118,139],"added":[74],"and":[79,134],"trained":[81],"with":[82],"proposed":[84,109,141],"algorithm.":[85],"We":[86],"utilize":[87],"three":[91],"architectural":[93],"variations":[94],"baseline":[97,148],"addition":[101],"heads,":[107],"methodology":[110],"used":[113],"prune":[115],"capture":[119],"redundant":[120],"information.":[121],"Experiments":[122],"diverse":[124],"NLP":[125],"tasks,":[126],"including":[127],"machine":[128],"translation,":[129],"text":[130],"summarization,":[131],"question":[132],"answering":[133],"large":[135],"language":[136],"modeling,":[137],"show":[138],"our":[140],"approach":[142],"consistently":[143],"improves":[144],"performance":[146],"models.":[150]},"counts_by_year":[{"year":2026,"cited_by_count":2},{"year":2025,"cited_by_count":2}],"updated_date":"2026-03-01T08:55:55.761014","created_date":"2025-10-10T00:00:00"}
