{"id":"https://openalex.org/W7116313960","doi":"https://doi.org/10.1145/3754598.3754612","title":"Architecture-Aware Models of AI Engines for High-Performance Matrix Matrix Multiplication","display_name":"Architecture-Aware Models of AI Engines for High-Performance Matrix Matrix Multiplication","publication_year":2025,"publication_date":"2025-09-08","ids":{"openalex":"https://openalex.org/W7116313960","doi":"https://doi.org/10.1145/3754598.3754612"},"language":null,"primary_location":{"id":"doi:10.1145/3754598.3754612","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3754598.3754612","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 54th International Conference on Parallel Processing","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3754598.3754612","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5015932389","display_name":"Elliott Binder","orcid":"https://orcid.org/0000-0003-3588-5606"},"institutions":[{"id":"https://openalex.org/I74973139","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33","country_code":"US","type":"education","lineage":["https://openalex.org/I74973139"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Elliott D. Binder","raw_affiliation_strings":["Carnegie Mellon University, Pittsburgh, USA"],"raw_orcid":"https://orcid.org/0000-0003-3588-5606","affiliations":[{"raw_affiliation_string":"Carnegie Mellon University, Pittsburgh, USA","institution_ids":["https://openalex.org/I74973139"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120911423","display_name":"Jeffrey Low","orcid":null},"institutions":[{"id":"https://openalex.org/I74973139","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33","country_code":"US","type":"education","lineage":["https://openalex.org/I74973139"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jeffrey Low","raw_affiliation_strings":["Carnegie Mellon University, Pittsburgh, USA"],"raw_orcid":"https://orcid.org/0009-0006-0872-9172","affiliations":[{"raw_affiliation_string":"Carnegie Mellon University, Pittsburgh, USA","institution_ids":["https://openalex.org/I74973139"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5019607600","display_name":"Tze Meng Low","orcid":"https://orcid.org/0000-0002-5179-8249"},"institutions":[{"id":"https://openalex.org/I74973139","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33","country_code":"US","type":"education","lineage":["https://openalex.org/I74973139"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Tze Meng Low","raw_affiliation_strings":["Carnegie Mellon University, Pittsburgh, USA"],"raw_orcid":"https://orcid.org/0000-0002-5179-8249","affiliations":[{"raw_affiliation_string":"Carnegie Mellon University, Pittsburgh, USA","institution_ids":["https://openalex.org/I74973139"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5015932389"],"corresponding_institution_ids":["https://openalex.org/I74973139"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.64251952,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"531","last_page":"540"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.7608000040054321,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.7608000040054321,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10829","display_name":"Interconnection Networks and Systems","score":0.049400001764297485,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12808","display_name":"Ferroelectric and Negative Capacitance Devices","score":0.03739999979734421,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/kernel","display_name":"Kernel (algebra)","score":0.621999979019165},{"id":"https://openalex.org/keywords/matrix-multiplication","display_name":"Matrix multiplication","score":0.5730999708175659},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.5694000124931335},{"id":"https://openalex.org/keywords/multiplication","display_name":"Multiplication (music)","score":0.421999990940094},{"id":"https://openalex.org/keywords/data-structure","display_name":"Data structure","score":0.3869999945163727},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.37290000915527344},{"id":"https://openalex.org/keywords/matrix","display_name":"Matrix (chemical analysis)","score":0.3652999997138977},{"id":"https://openalex.org/keywords/kernel-method","display_name":"Kernel method","score":0.3621000051498413}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7294999957084656},{"id":"https://openalex.org/C74193536","wikidata":"https://www.wikidata.org/wiki/Q574844","display_name":"Kernel (algebra)","level":2,"score":0.621999979019165},{"id":"https://openalex.org/C17349429","wikidata":"https://www.wikidata.org/wiki/Q1049914","display_name":"Matrix multiplication","level":3,"score":0.5730999708175659},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.5694000124931335},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.5498999953269958},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.44940000772476196},{"id":"https://openalex.org/C2780595030","wikidata":"https://www.wikidata.org/wiki/Q3860309","display_name":"Multiplication (music)","level":2,"score":0.421999990940094},{"id":"https://openalex.org/C459310","wikidata":"https://www.wikidata.org/wiki/Q117801","display_name":"Computational science","level":1,"score":0.4036000072956085},{"id":"https://openalex.org/C162319229","wikidata":"https://www.wikidata.org/wiki/Q175263","display_name":"Data structure","level":2,"score":0.3869999945163727},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.37290000915527344},{"id":"https://openalex.org/C106487976","wikidata":"https://www.wikidata.org/wiki/Q685816","display_name":"Matrix (chemical analysis)","level":2,"score":0.3652999997138977},{"id":"https://openalex.org/C122280245","wikidata":"https://www.wikidata.org/wiki/Q620622","display_name":"Kernel method","level":3,"score":0.3621000051498413},{"id":"https://openalex.org/C80469333","wikidata":"https://www.wikidata.org/wiki/Q189088","display_name":"Von Neumann architecture","level":2,"score":0.3474999964237213},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.3422999978065491},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.3407999873161316},{"id":"https://openalex.org/C9390403","wikidata":"https://www.wikidata.org/wiki/Q3966","display_name":"Computer hardware","level":1,"score":0.3253999948501587},{"id":"https://openalex.org/C3826847","wikidata":"https://www.wikidata.org/wiki/Q188768","display_name":"FLOPS","level":2,"score":0.30169999599456787},{"id":"https://openalex.org/C123657996","wikidata":"https://www.wikidata.org/wiki/Q12271","display_name":"Architecture","level":2,"score":0.28600001335144043},{"id":"https://openalex.org/C153247305","wikidata":"https://www.wikidata.org/wiki/Q835713","display_name":"Memory address","level":3,"score":0.2727999985218048},{"id":"https://openalex.org/C176649486","wikidata":"https://www.wikidata.org/wiki/Q2308807","display_name":"Memory management","level":3,"score":0.26930001378059387},{"id":"https://openalex.org/C82687282","wikidata":"https://www.wikidata.org/wiki/Q66221","display_name":"Auxiliary memory","level":2,"score":0.2558000087738037},{"id":"https://openalex.org/C47487241","wikidata":"https://www.wikidata.org/wiki/Q5227230","display_name":"Data access","level":2,"score":0.2524999976158142},{"id":"https://openalex.org/C186967261","wikidata":"https://www.wikidata.org/wiki/Q5082128","display_name":"Mobile device","level":2,"score":0.2515000104904175}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3754598.3754612","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3754598.3754612","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 54th International Conference on Parallel Processing","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3754598.3754612","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3754598.3754612","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 54th International Conference on Parallel Processing","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G340893225","display_name":null,"funder_award_id":"DE-SC0025645","funder_id":"https://openalex.org/F4320306084","funder_display_name":"U.S. Department of Energy"},{"id":"https://openalex.org/G6063729074","display_name":null,"funder_award_id":"HR0011-24-9-0517","funder_id":"https://openalex.org/F4320337531","funder_display_name":"Defense Sciences Office, DARPA"}],"funders":[{"id":"https://openalex.org/F4320306084","display_name":"U.S. Department of Energy","ror":"https://ror.org/01bj3aw27"},{"id":"https://openalex.org/F4320337531","display_name":"Defense Sciences Office, DARPA","ror":"https://ror.org/0447fe631"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":18,"referenced_works":["https://openalex.org/W2043275593","https://openalex.org/W2149381887","https://openalex.org/W2252007067","https://openalex.org/W2412152731","https://openalex.org/W2516525699","https://openalex.org/W2565436413","https://openalex.org/W2737067755","https://openalex.org/W3084619881","https://openalex.org/W3152511518","https://openalex.org/W4230289604","https://openalex.org/W4285503968","https://openalex.org/W4319870545","https://openalex.org/W4391455288","https://openalex.org/W4400489024","https://openalex.org/W4401331008","https://openalex.org/W4407953492","https://openalex.org/W4407953978","https://openalex.org/W4412610626"],"related_works":[],"abstract_inverted_index":{"The":[0],"AI":[1],"Engine":[2],"(AIE)":[3],"architecture,":[4],"available":[5],"in":[6,123,141,153],"systems":[7],"from":[8],"mobile":[9],"SoCs":[10],"to":[11,15,35,68,86,130],"server-class":[12],"FPGAs,":[13],"aims":[14],"efficiently":[16],"execute":[17],"AI/ML":[18],"tasks":[19],"through":[20],"a":[21],"two-dimensional":[22],"array":[23],"of":[24,90,132,147],"compute":[25,43,133,148],"tiles.":[26],"Previous":[27],"work":[28,78],"on":[29,46],"AIEs":[30],"has":[31,49],"explored":[32],"different":[33],"approaches":[34,67],"mapping":[36],"computation":[37],"across":[38],"spatial":[39],"arrays,":[40],"but":[41],"the":[42,52,55,74,88,109,136],"kernel":[44,137],"running":[45],"each":[47],"tile":[48],"not":[50],"been":[51],"focus.":[53],"Additionally,":[54],"AIE-ML":[56,118],"architecture":[57],"introduces":[58],"memory":[59,101,143],"tiles":[60],"and":[61,70,104,144],"omits":[62],"programmable":[63],"logic,":[64],"requiring":[65],"new":[66,96],"staging":[69],"moving":[71],"data":[72,125,139,151],"throughout":[73],"array.":[75],"In":[76],"this":[77],"we":[79],"update":[80],"analytical":[81],"models":[82,115],"developed":[83],"for":[84,120,135],"CPUs":[85],"produce":[87],"design":[89],"high":[91],"performance":[92,128],"kernels":[93,119],"while":[94],"introducing":[95],"model":[97],"considerations":[98],"such":[99],"as":[100,106],"structure,":[102],"throughput,":[103],"latency":[105],"required":[107],"by":[108,116],"AIE":[110],"hardware.":[111],"We":[112],"evaluate":[113],"our":[114],"developing":[117],"matrix":[121],"multiplication":[122],"low-precision":[124],"types":[126],"showing":[127],"up":[129],"95%":[131],"peak":[134,149],"when":[138,150],"resides":[140,152],"local":[142],"above":[145],"90%":[146],"main":[154],"memory.":[155]},"counts_by_year":[],"updated_date":"2026-04-28T14:05:53.105641","created_date":"2025-12-21T00:00:00"}
