{"id":"https://openalex.org/W7134118118","doi":"https://doi.org/10.1016/j.neunet.2026.108800","title":"EmbBERT: Attention under 2\u202fMB memory","display_name":"EmbBERT: Attention under 2\u202fMB memory","publication_year":2026,"publication_date":"2026-03-06","ids":{"openalex":"https://openalex.org/W7134118118","doi":"https://doi.org/10.1016/j.neunet.2026.108800","pmid":"https://pubmed.ncbi.nlm.nih.gov/41819621"},"language":"en","primary_location":{"id":"doi:10.1016/j.neunet.2026.108800","is_oa":true,"landing_page_url":"https://doi.org/10.1016/j.neunet.2026.108800","pdf_url":null,"source":{"id":"https://openalex.org/S123019304","display_name":"Neural Networks","issn_l":"0893-6080","issn":["0893-6080","1879-2782"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320990","host_organization_name":"Elsevier BV","host_organization_lineage":["https://openalex.org/P4310320990"],"host_organization_lineage_names":["Elsevier BV"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Neural Networks","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","pubmed"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://doi.org/10.1016/j.neunet.2026.108800","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5038766711","display_name":"Riccardo Bravin","orcid":"https://orcid.org/0000-0002-5453-1988"},"institutions":[{"id":"https://openalex.org/I93860229","display_name":"Politecnico di Milano","ror":"https://ror.org/01nffqt88","country_code":"IT","type":"education","lineage":["https://openalex.org/I93860229"]}],"countries":["IT"],"is_corresponding":false,"raw_author_name":"Riccardo Bravin","raw_affiliation_strings":["Department of Electronics, Information and Bioengineering, Politecnico di Milano, Via Ponzio 34/5, Milano, 20133, Italy. Electronic address: riccardo.bravin@polimi.it"],"affiliations":[{"raw_affiliation_string":"Department of Electronics, Information and Bioengineering, Politecnico di Milano, Via Ponzio 34/5, Milano, 20133, Italy. Electronic address: riccardo.bravin@polimi.it","institution_ids":["https://openalex.org/I93860229"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5045681032","display_name":"Massimo Pavan","orcid":"https://orcid.org/0000-0002-5964-5685"},"institutions":[{"id":"https://openalex.org/I93860229","display_name":"Politecnico di Milano","ror":"https://ror.org/01nffqt88","country_code":"IT","type":"education","lineage":["https://openalex.org/I93860229"]}],"countries":["IT"],"is_corresponding":false,"raw_author_name":"Massimo Pavan","raw_affiliation_strings":["Department of Electronics, Information and Bioengineering, Politecnico di Milano, Via Ponzio 34/5, Milano, 20133, Italy. Electronic address: massimo.pavan@polimi.it"],"affiliations":[{"raw_affiliation_string":"Department of Electronics, Information and Bioengineering, Politecnico di Milano, Via Ponzio 34/5, Milano, 20133, Italy. Electronic address: massimo.pavan@polimi.it","institution_ids":["https://openalex.org/I93860229"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5107079574","display_name":"Hazem Hesham Yousef Shalby","orcid":null},"institutions":[{"id":"https://openalex.org/I93860229","display_name":"Politecnico di Milano","ror":"https://ror.org/01nffqt88","country_code":"IT","type":"education","lineage":["https://openalex.org/I93860229"]}],"countries":["IT"],"is_corresponding":false,"raw_author_name":"Hazem Hesham Yousef Shalby","raw_affiliation_strings":["Department of Electronics, Information and Bioengineering, Politecnico di Milano, Via Ponzio 34/5, Milano, 20133, Italy. Electronic address: hazemhesham.shalby@polimi.it"],"affiliations":[{"raw_affiliation_string":"Department of Electronics, Information and Bioengineering, Politecnico di Milano, Via Ponzio 34/5, Milano, 20133, Italy. Electronic address: hazemhesham.shalby@polimi.it","institution_ids":["https://openalex.org/I93860229"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5035460406","display_name":"Fabrizio Pittorino","orcid":"https://orcid.org/0000-0002-1919-6141"},"institutions":[{"id":"https://openalex.org/I93860229","display_name":"Politecnico di Milano","ror":"https://ror.org/01nffqt88","country_code":"IT","type":"education","lineage":["https://openalex.org/I93860229"]}],"countries":["IT"],"is_corresponding":true,"raw_author_name":"Fabrizio Pittorino","raw_affiliation_strings":["Department of Electronics, Information and Bioengineering, Politecnico di Milano, Via Ponzio 34/5, Milano, 20133, Italy. Electronic address: fabrizio.pittorino@polimi.it"],"affiliations":[{"raw_affiliation_string":"Department of Electronics, Information and Bioengineering, Politecnico di Milano, Via Ponzio 34/5, Milano, 20133, Italy. Electronic address: fabrizio.pittorino@polimi.it","institution_ids":["https://openalex.org/I93860229"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5035547226","display_name":"Manuel Roveri","orcid":"https://orcid.org/0000-0001-7828-7687"},"institutions":[{"id":"https://openalex.org/I93860229","display_name":"Politecnico di Milano","ror":"https://ror.org/01nffqt88","country_code":"IT","type":"education","lineage":["https://openalex.org/I93860229"]}],"countries":["IT"],"is_corresponding":false,"raw_author_name":"Manuel Roveri","raw_affiliation_strings":["Department of Electronics, Information and Bioengineering, Politecnico di Milano, Via Ponzio 34/5, Milano, 20133, Italy. Electronic address: manuel.roveri@polimi.it"],"affiliations":[{"raw_affiliation_string":"Department of Electronics, Information and Bioengineering, Politecnico di Milano, Via Ponzio 34/5, Milano, 20133, Italy. Electronic address: manuel.roveri@polimi.it","institution_ids":["https://openalex.org/I93860229"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5035460406"],"corresponding_institution_ids":["https://openalex.org/I93860229"],"apc_list":{"value":3350,"currency":"USD","value_usd":3350},"apc_paid":{"value":3350,"currency":"USD","value_usd":3350},"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.50969251,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"200","issue":null,"first_page":"108800","last_page":"108800"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.07109999656677246,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.07109999656677246,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10465","display_name":"Neurobiology of Language and Bilingualism","score":0.052400000393390656,"subfield":{"id":"https://openalex.org/subfields/2805","display_name":"Cognitive Neuroscience"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T10094","display_name":"Epilepsy research and treatment","score":0.048900000751018524,"subfield":{"id":"https://openalex.org/subfields/2738","display_name":"Psychiatry and Mental health"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.6753000020980835},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.515999972820282},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.4523000121116638},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.45170000195503235},{"id":"https://openalex.org/keywords/architecture","display_name":"Architecture","score":0.4397999942302704},{"id":"https://openalex.org/keywords/embedding","display_name":"Embedding","score":0.438400000333786},{"id":"https://openalex.org/keywords/software-deployment","display_name":"Software deployment","score":0.4092999994754791},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.396699994802475},{"id":"https://openalex.org/keywords/suite","display_name":"Suite","score":0.3614000082015991}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8464999794960022},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.6753000020980835},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.515999972820282},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.5115000009536743},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.4523000121116638},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.45170000195503235},{"id":"https://openalex.org/C123657996","wikidata":"https://www.wikidata.org/wiki/Q12271","display_name":"Architecture","level":2,"score":0.4397999942302704},{"id":"https://openalex.org/C41608201","wikidata":"https://www.wikidata.org/wiki/Q980509","display_name":"Embedding","level":2,"score":0.438400000333786},{"id":"https://openalex.org/C105339364","wikidata":"https://www.wikidata.org/wiki/Q2297740","display_name":"Software deployment","level":2,"score":0.4092999994754791},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.40209999680519104},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.396699994802475},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.39070001244544983},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.36570000648498535},{"id":"https://openalex.org/C79581498","wikidata":"https://www.wikidata.org/wiki/Q1367530","display_name":"Suite","level":2,"score":0.3614000082015991},{"id":"https://openalex.org/C107598950","wikidata":"https://www.wikidata.org/wiki/Q259864","display_name":"Microarchitecture","level":2,"score":0.3467000126838684},{"id":"https://openalex.org/C2779602883","wikidata":"https://www.wikidata.org/wiki/Q15544750","display_name":"Memory architecture","level":2,"score":0.34369999170303345},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.32199999690055847},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.3190999925136566},{"id":"https://openalex.org/C2778738651","wikidata":"https://www.wikidata.org/wiki/Q16546687","display_name":"Novelty","level":2,"score":0.2985999882221222},{"id":"https://openalex.org/C151319957","wikidata":"https://www.wikidata.org/wiki/Q752739","display_name":"Asynchronous communication","level":2,"score":0.28790000081062317},{"id":"https://openalex.org/C12186640","wikidata":"https://www.wikidata.org/wiki/Q6815743","display_name":"Memory model","level":3,"score":0.2833999991416931},{"id":"https://openalex.org/C2781357197","wikidata":"https://www.wikidata.org/wiki/Q5757597","display_name":"High memory","level":2,"score":0.28220000863075256},{"id":"https://openalex.org/C136085584","wikidata":"https://www.wikidata.org/wiki/Q910289","display_name":"Overlay","level":2,"score":0.27459999918937683},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.2745000123977661},{"id":"https://openalex.org/C57863822","wikidata":"https://www.wikidata.org/wiki/Q905488","display_name":"Flat memory model","level":4,"score":0.27219998836517334},{"id":"https://openalex.org/C9390403","wikidata":"https://www.wikidata.org/wiki/Q3966","display_name":"Computer hardware","level":1,"score":0.26750001311302185},{"id":"https://openalex.org/C179518139","wikidata":"https://www.wikidata.org/wiki/Q5140297","display_name":"Coding (social sciences)","level":2,"score":0.2639000117778778},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.2603999972343445},{"id":"https://openalex.org/C2778915421","wikidata":"https://www.wikidata.org/wiki/Q3643177","display_name":"Performance improvement","level":2,"score":0.2542000114917755},{"id":"https://openalex.org/C177950962","wikidata":"https://www.wikidata.org/wiki/Q10997658","display_name":"Non-volatile memory","level":2,"score":0.2538999915122986},{"id":"https://openalex.org/C82687282","wikidata":"https://www.wikidata.org/wiki/Q66221","display_name":"Auxiliary memory","level":2,"score":0.250900000333786}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1016/j.neunet.2026.108800","is_oa":true,"landing_page_url":"https://doi.org/10.1016/j.neunet.2026.108800","pdf_url":null,"source":{"id":"https://openalex.org/S123019304","display_name":"Neural Networks","issn_l":"0893-6080","issn":["0893-6080","1879-2782"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320990","host_organization_name":"Elsevier BV","host_organization_lineage":["https://openalex.org/P4310320990"],"host_organization_lineage_names":["Elsevier BV"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Neural Networks","raw_type":"journal-article"},{"id":"pmid:41819621","is_oa":false,"landing_page_url":"https://pubmed.ncbi.nlm.nih.gov/41819621","pdf_url":null,"source":{"id":"https://openalex.org/S4306525036","display_name":"PubMed","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1299303238","host_organization_name":"National Institutes of Health","host_organization_lineage":["https://openalex.org/I1299303238"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Neural networks : the official journal of the International Neural Network Society","raw_type":null},{"id":"pmh:oai:re.public.polimi.it:11311/1309735","is_oa":false,"landing_page_url":"https://hdl.handle.net/11311/1309735","pdf_url":null,"source":{"id":"https://openalex.org/S4306400312","display_name":"Virtual Community of Pathological Anatomy (University of Castilla La Mancha)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I79189158","host_organization_name":"University of Castilla-La Mancha","host_organization_lineage":["https://openalex.org/I79189158"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"info:eu-repo/semantics/article"}],"best_oa_location":{"id":"doi:10.1016/j.neunet.2026.108800","is_oa":true,"landing_page_url":"https://doi.org/10.1016/j.neunet.2026.108800","pdf_url":null,"source":{"id":"https://openalex.org/S123019304","display_name":"Neural Networks","issn_l":"0893-6080","issn":["0893-6080","1879-2782"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320990","host_organization_name":"Elsevier BV","host_organization_lineage":["https://openalex.org/P4310320990"],"host_organization_lineage_names":["Elsevier BV"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Neural Networks","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":9,"referenced_works":["https://openalex.org/W2963163009","https://openalex.org/W3034457371","https://openalex.org/W3137762660","https://openalex.org/W3155594815","https://openalex.org/W4297895153","https://openalex.org/W4386472814","https://openalex.org/W4387609230","https://openalex.org/W4389519173","https://openalex.org/W4400811591"],"related_works":[],"abstract_inverted_index":{"Transformer":[0],"architectures":[1,105],"based":[2],"on":[3,31,142],"the":[4,96,127,143,148,179,195,198,202,214,221],"attention":[5,81],"mechanism":[6,82],"have":[7],"revolutionized":[8],"natural":[9],"language":[10,60],"processing":[11],"(NLP),":[12],"driving":[13],"major":[14],"breakthroughs":[15],"across":[16,201],"virtually":[17],"every":[18],"NLP":[19],"task.":[20],"However,":[21],"their":[22],"substantial":[23],"memory":[24,43,90,138,188],"and":[25,37,78,121,147,164,171,194,220,227],"computational":[26],"requirements":[27],"still":[28],"hinder":[29],"deployment":[30],"ultra-constrained":[32],"devices":[33],"such":[34],"as":[35],"wearables":[36],"Internet-of-Things":[38],"(IoT)":[39],"units,":[40],"where":[41],"available":[42],"is":[44],"limited":[45],"to":[46,126,158,182,190,204,232],"just":[47,191],"a":[48,58,71,135],"few":[49],"megabytes.":[50],"To":[51],"address":[52],"this":[53,93],"challenge,":[54],"we":[55,99,177,208],"introduce":[56],"EmbBERT,":[57],"tiny":[59],"model":[61,69],"(TLM)":[62],"architecturally":[63],"designed":[64],"for":[65,95],"extreme":[66,97],"efficiency.":[67],"The":[68],"integrates":[70],"compact":[72],"embedding":[73],"layer,":[74],"streamlined":[75],"feed-forward":[76],"blocks,":[77],"an":[79,210],"efficient":[80],"that":[83,101,133,152,159],"together":[84],"enable":[85],"optimal":[86],"performance":[87,124],"under":[88,109],"strict":[89],"budgets.":[91],"Through":[92],"redesign":[94],"edge,":[98],"demonstrate":[100,178],"highly":[102],"simplified":[103],"transformer":[104],"remain":[106],"remarkably":[107],"effective":[108],"tight":[110],"resource":[111],"constraints.":[112],"EmbBERT":[113,153,199],"requires":[114],"only":[115],"2":[116],"MB":[117],"of":[118,129,160,169,173,197,217],"total":[119],"memory,":[120],"achieves":[122,154],"accuracy":[123],"comparable":[125,157],"ones":[128],"state-of-the-art":[130],"(SotA)":[131],"models":[132],"require":[134],"10":[136],"\u00d7":[137],"budget.":[139],"Extensive":[140],"experiments":[141],"curated":[144],"TinyNLP":[145],"benchmark":[146],"GLUE":[149],"suite":[150],"confirm":[151],"competitive":[155],"accuracy,":[156],"larger":[161],"SotA":[162],"models,":[163],"consistently":[165],"outperforms":[166],"downsized":[167],"versions":[168],"BERT":[170],"MAMBA":[172],"similar":[174],"size.":[175],"Furthermore,":[176],"model's":[180],"resilience":[181],"8-bit":[183],"quantization,":[184],"which":[185],"further":[186],"reduces":[187],"usage":[189],"781":[192],"kB,":[193],"scalability":[196],"architecture":[200],"sub-megabyte":[203],"tens-of-megabytes":[205],"range.":[206],"Finally,":[207],"perform":[209],"ablation":[211],"study":[212],"demonstrating":[213],"positive":[215],"contributions":[216],"all":[218],"components":[219],"pre-training":[222],"procedure.":[223],"All":[224],"code,":[225],"scripts,":[226],"checkpoints":[228],"are":[229],"publicly":[230],"released":[231],"ensure":[233],"reproducibility:":[234],"https://github.com/RiccardoBravin/tiny-LLM.":[235]},"counts_by_year":[],"updated_date":"2026-03-26T15:22:09.906841","created_date":"2026-02-14T00:00:00"}
