{"id":"https://openalex.org/W4416341617","doi":"https://doi.org/10.1109/socc66126.2025.11235364","title":"SIMAX: a SIMD-Based Many-Core Accelerator for Matrix-Vector Multiplication for Transformers","display_name":"SIMAX: a SIMD-Based Many-Core Accelerator for Matrix-Vector Multiplication for Transformers","publication_year":2025,"publication_date":"2025-09-29","ids":{"openalex":"https://openalex.org/W4416341617","doi":"https://doi.org/10.1109/socc66126.2025.11235364"},"language":null,"primary_location":{"id":"doi:10.1109/socc66126.2025.11235364","is_oa":false,"landing_page_url":"https://doi.org/10.1109/socc66126.2025.11235364","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE 38th International System-on-Chip Conference (SOCC)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101999025","display_name":"Yue Yang","orcid":"https://orcid.org/0009-0007-9648-073X"},"institutions":[{"id":"https://openalex.org/I67328108","display_name":"California State University, Fresno","ror":"https://ror.org/03enmdz06","country_code":"US","type":"education","lineage":["https://openalex.org/I67328108"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yue Yang","raw_affiliation_strings":["California State University,Electrical and Computer Engineering,Fresno Fresno,CA,USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"California State University,Electrical and Computer Engineering,Fresno Fresno,CA,USA","institution_ids":["https://openalex.org/I67328108"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5084569537","display_name":"John Herrera","orcid":null},"institutions":[{"id":"https://openalex.org/I67328108","display_name":"California State University, Fresno","ror":"https://ror.org/03enmdz06","country_code":"US","type":"education","lineage":["https://openalex.org/I67328108"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"John Herrera","raw_affiliation_strings":["California State University,Electrical and Computer Engineering,Fresno Fresno,CA,USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"California State University,Electrical and Computer Engineering,Fresno Fresno,CA,USA","institution_ids":["https://openalex.org/I67328108"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5046279911","display_name":"Aaron Stillmaker","orcid":"https://orcid.org/0000-0002-7925-6177"},"institutions":[{"id":"https://openalex.org/I67328108","display_name":"California State University, Fresno","ror":"https://ror.org/03enmdz06","country_code":"US","type":"education","lineage":["https://openalex.org/I67328108"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Aaron Stillmaker","raw_affiliation_strings":["California State University,Electrical and Computer Engineering,Fresno Fresno,CA,USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"California State University,Electrical and Computer Engineering,Fresno Fresno,CA,USA","institution_ids":["https://openalex.org/I67328108"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.34184037,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"6"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.40869998931884766,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.40869998931884766,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10363","display_name":"Low-power high-performance VLSI design","score":0.2667999863624573,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11697","display_name":"Numerical Methods and Algorithms","score":0.05079999938607216,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/bottleneck","display_name":"Bottleneck","score":0.7139000296592712},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.6068999767303467},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.460099995136261},{"id":"https://openalex.org/keywords/memory-footprint","display_name":"Memory footprint","score":0.43779999017715454},{"id":"https://openalex.org/keywords/multiplication","display_name":"Multiplication (music)","score":0.4311000108718872},{"id":"https://openalex.org/keywords/footprint","display_name":"Footprint","score":0.41510000824928284},{"id":"https://openalex.org/keywords/overhead","display_name":"Overhead (engineering)","score":0.3628999888896942}],"concepts":[{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.7139000296592712},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.6068999767303467},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5882999897003174},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.460099995136261},{"id":"https://openalex.org/C74912251","wikidata":"https://www.wikidata.org/wiki/Q6815727","display_name":"Memory footprint","level":2,"score":0.43779999017715454},{"id":"https://openalex.org/C2780595030","wikidata":"https://www.wikidata.org/wiki/Q3860309","display_name":"Multiplication (music)","level":2,"score":0.4311000108718872},{"id":"https://openalex.org/C132943942","wikidata":"https://www.wikidata.org/wiki/Q2562511","display_name":"Footprint","level":2,"score":0.41510000824928284},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.3628999888896942},{"id":"https://openalex.org/C9390403","wikidata":"https://www.wikidata.org/wiki/Q3966","display_name":"Computer hardware","level":1,"score":0.3580999970436096},{"id":"https://openalex.org/C13164978","wikidata":"https://www.wikidata.org/wiki/Q600158","display_name":"Hardware acceleration","level":3,"score":0.32010000944137573},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.3174000084400177},{"id":"https://openalex.org/C2742236","wikidata":"https://www.wikidata.org/wiki/Q924713","display_name":"Efficient energy use","level":2,"score":0.31520000100135803},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.28790000081062317},{"id":"https://openalex.org/C186370098","wikidata":"https://www.wikidata.org/wiki/Q442787","display_name":"Energy (signal processing)","level":2,"score":0.28630000352859497},{"id":"https://openalex.org/C162307627","wikidata":"https://www.wikidata.org/wiki/Q204833","display_name":"Enhanced Data Rates for GSM Evolution","level":2,"score":0.2825999855995178},{"id":"https://openalex.org/C131017901","wikidata":"https://www.wikidata.org/wiki/Q170451","display_name":"Logic gate","level":2,"score":0.28189998865127563},{"id":"https://openalex.org/C24326235","wikidata":"https://www.wikidata.org/wiki/Q126095","display_name":"Electronic engineering","level":1,"score":0.274399995803833},{"id":"https://openalex.org/C177774035","wikidata":"https://www.wikidata.org/wiki/Q1246948","display_name":"Granularity","level":2,"score":0.2628999948501587},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.25679999589920044}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/socc66126.2025.11235364","is_oa":false,"landing_page_url":"https://doi.org/10.1109/socc66126.2025.11235364","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE 38th International System-on-Chip Conference (SOCC)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":6,"referenced_works":["https://openalex.org/W2625457103","https://openalex.org/W2945146780","https://openalex.org/W3024621361","https://openalex.org/W3123961336","https://openalex.org/W3177265267","https://openalex.org/W4380874786"],"related_works":[],"abstract_inverted_index":{"Matrix-vector":[0],"multiplication":[1],"(MVM)":[2],"is":[3,19,73],"a":[4,35,42,99],"computational":[5],"bottleneck":[6],"for":[7,75],"transformer":[8],"inference":[9,77],"workloads":[10],"at":[11],"resource-restricted":[12],"edge":[13,76],"applications.":[14],"Efficient":[15],"MVM":[16,38],"accelerator":[17,39],"design":[18],"crucial":[20],"to":[21,93,127],"optimizing":[22],"the":[23,109],"footprint":[24],"and":[25,54,68,96,122,130],"energy":[26],"overhead":[27],"while":[28],"maintaining":[29],"accurate":[30],"functionality.":[31],"We":[32],"present":[33],"SIMAX,":[34],"scalable":[36],"INT8":[37],"based":[40],"on":[41],"configurable":[43],"2D":[44],"mesh":[45],"array":[46],"of":[47],"lightweight":[48],"processing":[49],"elements":[50],"with":[51,98],"saturating":[52],"accumulation":[53],"minimal":[55],"control.":[56],"By":[57],"prioritizing":[58],"density":[59],"over":[60,116],"peak":[61],"throughput,":[62],"SIMAX":[63,114],"enables":[64],"more":[65],"on-chip":[66],"memory":[67],"fewer":[69],"off-chip":[70],"transfers,":[71],"which":[72],"beneficial":[74],"workloads.":[78],"Synthesis":[79],"performance":[80],"values":[81],"from":[82,90],"Synopsys":[83],"DC":[84],"were":[85],"scaled":[86],"across":[87],"technology":[88],"nodes":[89],"65":[91],"nm":[92,95,111],"7":[94],"validated":[97],"4":[100,102],"\u00d7":[101],"post-layout":[103],"example":[104],"in":[105],"Cadence":[106],"Innovus":[107],"using":[108],"45":[110],"NanGate":[112],"PDK,":[113],"achieves":[115],"400\u00d7":[117],"speedup,":[118],"85\u00d7":[119],"higher":[120,124],"throughput-per-area,":[121],"6\u00d7":[123],"throughput-per-watt":[125],"compared":[126],"representative":[128],"CPUs":[129],"state-of-the-art":[131],"accelerators.":[132]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-11-17T00:00:00"}
