{"id":"https://openalex.org/W4386774053","doi":"https://doi.org/10.1145/3624569","title":"PARALiA: A Performance Aware Runtime for Auto-tuning Linear Algebra on Heterogeneous Systems","display_name":"PARALiA: A Performance Aware Runtime for Auto-tuning Linear Algebra on Heterogeneous Systems","publication_year":2023,"publication_date":"2023-09-15","ids":{"openalex":"https://openalex.org/W4386774053","doi":"https://doi.org/10.1145/3624569"},"language":"en","primary_location":{"id":"doi:10.1145/3624569","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3624569","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3624569","source":{"id":"https://openalex.org/S26056741","display_name":"ACM Transactions on Architecture and Code Optimization","issn_l":"1544-3566","issn":["1544-3566","1544-3973"],"is_oa":true,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Architecture and Code Optimization","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3624569","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5090049842","display_name":"Petros Anastasiadis","orcid":"https://orcid.org/0000-0001-7821-3610"},"institutions":[{"id":"https://openalex.org/I174458059","display_name":"National Technical University of Athens","ror":"https://ror.org/03cx6bg69","country_code":"GR","type":"education","lineage":["https://openalex.org/I174458059"]}],"countries":["GR"],"is_corresponding":true,"raw_author_name":"Petros Anastasiadis","raw_affiliation_strings":["Cslab, National Technical University of Athens, Greece","Computing Systems Laboratory, National Technical University of Athens, Greece"],"raw_orcid":"https://orcid.org/0000-0001-7821-3610","affiliations":[{"raw_affiliation_string":"Cslab, National Technical University of Athens, Greece","institution_ids":["https://openalex.org/I174458059"]},{"raw_affiliation_string":"Computing Systems Laboratory, National Technical University of Athens, Greece","institution_ids":["https://openalex.org/I174458059"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5057776164","display_name":"\u039d\u03b9\u03ba\u03ad\u03bb\u03b1 \u03a0\u03b1\u03c0\u03b1\u03b4\u03bf\u03c0\u03bf\u03cd\u03bb\u03bf\u03c5","orcid":"https://orcid.org/0000-0003-2141-5654"},"institutions":[{"id":"https://openalex.org/I66862912","display_name":"Chalmers University of Technology","ror":"https://ror.org/040wg7k59","country_code":"SE","type":"education","lineage":["https://openalex.org/I66862912"]}],"countries":["SE"],"is_corresponding":false,"raw_author_name":"Nikela Papadopoulou","raw_affiliation_strings":["Computer Science and Engineering, Chalmers University of Technology, Sweden"],"raw_orcid":"https://orcid.org/0000-0003-2141-5654","affiliations":[{"raw_affiliation_string":"Computer Science and Engineering, Chalmers University of Technology, Sweden","institution_ids":["https://openalex.org/I66862912"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5023870696","display_name":"Georgios Goumas","orcid":"https://orcid.org/0000-0001-7811-4831"},"institutions":[{"id":"https://openalex.org/I174458059","display_name":"National Technical University of Athens","ror":"https://ror.org/03cx6bg69","country_code":"GR","type":"education","lineage":["https://openalex.org/I174458059"]}],"countries":["GR"],"is_corresponding":false,"raw_author_name":"Georgios Goumas","raw_affiliation_strings":["Cslab, National Technical University of Athens, Greece","Computing Systems Laboratory, National Technical University of Athens, Greece"],"raw_orcid":"https://orcid.org/0000-0001-7811-4831","affiliations":[{"raw_affiliation_string":"Cslab, National Technical University of Athens, Greece","institution_ids":["https://openalex.org/I174458059"]},{"raw_affiliation_string":"Computing Systems Laboratory, National Technical University of Athens, Greece","institution_ids":["https://openalex.org/I174458059"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5023526161","display_name":"Nectarios Koziris","orcid":"https://orcid.org/0000-0002-4890-8427"},"institutions":[{"id":"https://openalex.org/I174458059","display_name":"National Technical University of Athens","ror":"https://ror.org/03cx6bg69","country_code":"GR","type":"education","lineage":["https://openalex.org/I174458059"]}],"countries":["GR"],"is_corresponding":false,"raw_author_name":"Nectarios Koziris","raw_affiliation_strings":["Cslab, National Technical University of Athens, Greece","Computing Systems Laboratory, National Technical University of Athens, Greece"],"raw_orcid":"https://orcid.org/0000-0002-4890-8427","affiliations":[{"raw_affiliation_string":"Cslab, National Technical University of Athens, Greece","institution_ids":["https://openalex.org/I174458059"]},{"raw_affiliation_string":"Computing Systems Laboratory, National Technical University of Athens, Greece","institution_ids":["https://openalex.org/I174458059"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5026202614","display_name":"Dennis Hoppe","orcid":"https://orcid.org/0000-0001-8976-8603"},"institutions":[{"id":"https://openalex.org/I100066346","display_name":"University of Stuttgart","ror":"https://ror.org/04vnq7t77","country_code":"DE","type":"education","lineage":["https://openalex.org/I100066346"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Dennis Hoppe","raw_affiliation_strings":["HLRS, University of Stuttgart, Germany"],"raw_orcid":"https://orcid.org/0000-0001-8976-8603","affiliations":[{"raw_affiliation_string":"HLRS, University of Stuttgart, Germany","institution_ids":["https://openalex.org/I100066346"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100727367","display_name":"Li Zhong","orcid":"https://orcid.org/0000-0002-6473-1633"},"institutions":[{"id":"https://openalex.org/I100066346","display_name":"University of Stuttgart","ror":"https://ror.org/04vnq7t77","country_code":"DE","type":"education","lineage":["https://openalex.org/I100066346"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Li Zhong","raw_affiliation_strings":["HLRS, University of Stuttgart, Germany"],"raw_orcid":"https://orcid.org/0000-0002-6473-1633","affiliations":[{"raw_affiliation_string":"HLRS, University of Stuttgart, Germany","institution_ids":["https://openalex.org/I100066346"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5090049842"],"corresponding_institution_ids":["https://openalex.org/I174458059"],"apc_list":null,"apc_paid":null,"fwci":0.8812,"has_fulltext":true,"cited_by_count":3,"citation_normalized_percentile":{"value":0.71367652,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":96},"biblio":{"volume":"20","issue":"4","first_page":"1","last_page":"25"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.9991000294685364,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9976000189781189,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.9023904204368591},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.7491135597229004},{"id":"https://openalex.org/keywords/testbed","display_name":"Testbed","score":0.6354484558105469},{"id":"https://openalex.org/keywords/performance-improvement","display_name":"Performance improvement","score":0.5613467693328857},{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.5547486543655396},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.549182116985321},{"id":"https://openalex.org/keywords/rendering","display_name":"Rendering (computer graphics)","score":0.5335550308227539},{"id":"https://openalex.org/keywords/linear-algebra","display_name":"Linear algebra","score":0.529403805732727},{"id":"https://openalex.org/keywords/workload","display_name":"Workload","score":0.5170857906341553},{"id":"https://openalex.org/keywords/supercomputer","display_name":"Supercomputer","score":0.5142728686332703},{"id":"https://openalex.org/keywords/adaptability","display_name":"Adaptability","score":0.5036496520042419},{"id":"https://openalex.org/keywords/symmetric-multiprocessor-system","display_name":"Symmetric multiprocessor system","score":0.49370822310447693},{"id":"https://openalex.org/keywords/performance-tuning","display_name":"Performance tuning","score":0.4643927216529846},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.4585440456867218},{"id":"https://openalex.org/keywords/bottleneck","display_name":"Bottleneck","score":0.4272613525390625},{"id":"https://openalex.org/keywords/runtime-system","display_name":"Runtime system","score":0.4255945086479187},{"id":"https://openalex.org/keywords/embedded-system","display_name":"Embedded system","score":0.20554184913635254},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.15825945138931274},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.1376248598098755}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.9023904204368591},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.7491135597229004},{"id":"https://openalex.org/C31395832","wikidata":"https://www.wikidata.org/wiki/Q1318674","display_name":"Testbed","level":2,"score":0.6354484558105469},{"id":"https://openalex.org/C2778915421","wikidata":"https://www.wikidata.org/wiki/Q3643177","display_name":"Performance improvement","level":2,"score":0.5613467693328857},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.5547486543655396},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.549182116985321},{"id":"https://openalex.org/C205711294","wikidata":"https://www.wikidata.org/wiki/Q176953","display_name":"Rendering (computer graphics)","level":2,"score":0.5335550308227539},{"id":"https://openalex.org/C139352143","wikidata":"https://www.wikidata.org/wiki/Q82571","display_name":"Linear algebra","level":2,"score":0.529403805732727},{"id":"https://openalex.org/C2778476105","wikidata":"https://www.wikidata.org/wiki/Q628539","display_name":"Workload","level":2,"score":0.5170857906341553},{"id":"https://openalex.org/C83283714","wikidata":"https://www.wikidata.org/wiki/Q121117","display_name":"Supercomputer","level":2,"score":0.5142728686332703},{"id":"https://openalex.org/C177606310","wikidata":"https://www.wikidata.org/wiki/Q5674297","display_name":"Adaptability","level":2,"score":0.5036496520042419},{"id":"https://openalex.org/C172430144","wikidata":"https://www.wikidata.org/wiki/Q17111997","display_name":"Symmetric multiprocessor system","level":2,"score":0.49370822310447693},{"id":"https://openalex.org/C2777138346","wikidata":"https://www.wikidata.org/wiki/Q1714153","display_name":"Performance tuning","level":2,"score":0.4643927216529846},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.4585440456867218},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.4272613525390625},{"id":"https://openalex.org/C2780870223","wikidata":"https://www.wikidata.org/wiki/Q1004415","display_name":"Runtime system","level":2,"score":0.4255945086479187},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.20554184913635254},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.15825945138931274},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.1376248598098755},{"id":"https://openalex.org/C18903297","wikidata":"https://www.wikidata.org/wiki/Q7150","display_name":"Ecology","level":1,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C21547014","wikidata":"https://www.wikidata.org/wiki/Q1423657","display_name":"Operations management","level":1,"score":0.0},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.0},{"id":"https://openalex.org/C2524010","wikidata":"https://www.wikidata.org/wiki/Q8087","display_name":"Geometry","level":1,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1145/3624569","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3624569","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3624569","source":{"id":"https://openalex.org/S26056741","display_name":"ACM Transactions on Architecture and Code Optimization","issn_l":"1544-3566","issn":["1544-3566","1544-3973"],"is_oa":true,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Architecture and Code Optimization","raw_type":"journal-article"},{"id":"pmh:oai:eprints.gla.ac.uk:320534","is_oa":true,"landing_page_url":null,"pdf_url":"https://eprints.gla.ac.uk/320534/1/320534.pdf","source":{"id":"https://openalex.org/S4210235606","display_name":"ENLIGHTEN (Jurnal Bimbingan dan Konseling Islam)","issn_l":"2622-8912","issn":["2622-8912","2622-8920"],"is_oa":true,"is_in_doaj":true,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":"cc-by-sa","license_id":"https://openalex.org/licenses/cc-by-sa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"Articles"},{"id":"pmh:oai:research.chalmers.se:539406","is_oa":true,"landing_page_url":"https://research.chalmers.se/en/publication/539406","pdf_url":"https://research.chalmers.se/publication/539406/file/539406_Fulltext.pdf","source":{"id":"https://openalex.org/S4306402469","display_name":"Chalmers Research (Chalmers University of Technology)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I66862912","host_organization_name":"Chalmers University of Technology","host_organization_lineage":["https://openalex.org/I66862912"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":""}],"best_oa_location":{"id":"doi:10.1145/3624569","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3624569","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3624569","source":{"id":"https://openalex.org/S26056741","display_name":"ACM Transactions on Architecture and Code Optimization","issn_l":"1544-3566","issn":["1544-3566","1544-3973"],"is_oa":true,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Architecture and Code Optimization","raw_type":"journal-article"},"sustainable_development_goals":[{"display_name":"Affordable and clean energy","id":"https://metadata.un.org/sdg/7","score":0.8799999952316284}],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4386774053.pdf","grobid_xml":"https://content.openalex.org/works/W4386774053.grobid-xml"},"referenced_works_count":22,"referenced_works":["https://openalex.org/W1541729506","https://openalex.org/W1598984030","https://openalex.org/W1964031104","https://openalex.org/W1965942711","https://openalex.org/W1976139981","https://openalex.org/W1984296775","https://openalex.org/W1993508117","https://openalex.org/W2000882268","https://openalex.org/W2100415730","https://openalex.org/W2127589680","https://openalex.org/W2162322364","https://openalex.org/W2169851036","https://openalex.org/W2252007067","https://openalex.org/W2341343020","https://openalex.org/W2516525699","https://openalex.org/W2767028519","https://openalex.org/W2930869794","https://openalex.org/W2952750617","https://openalex.org/W3000071864","https://openalex.org/W3025720970","https://openalex.org/W3159281830","https://openalex.org/W4246869989"],"related_works":["https://openalex.org/W2946050103","https://openalex.org/W2762992606","https://openalex.org/W2785820621","https://openalex.org/W2502873745","https://openalex.org/W153259801","https://openalex.org/W2072208258","https://openalex.org/W1483675448","https://openalex.org/W2169492033","https://openalex.org/W3186309308","https://openalex.org/W4235049746"],"abstract_inverted_index":{"Dense":[0],"linear":[1],"algebra":[2],"operations":[3,29],"appear":[4],"very":[5,85],"frequently":[6,31],"in":[7,65,92,181,205],"high-performance":[8],"computing":[9],"(HPC)":[10],"applications,":[11],"rendering":[12],"their":[13],"performance":[14,94,141,192],"crucial":[15],"to":[16,41,143,170,219],"achieve":[17],"optimal":[18],"scalability.":[19],"As":[20],"many":[21],"modern":[22],"HPC":[23,183],"clusters":[24],"contain":[25],"multi-GPU":[26,46,77],"nodes,":[27],"BLAS":[28,78,156],"are":[30,48],"offloaded":[32],"on":[33],"GPUs,":[34,188],"necessitating":[35],"the":[36,75,118,190,203,213],"use":[37],"of":[38,120,193,215],"optimized":[39,166],"libraries":[40],"ensure":[42],"good":[43],"performance.":[44],"Unfortunately,":[45],"systems":[47],"accompanied":[49],"by":[50,195,200],"two":[51],"significant":[52],"optimization":[53],"challenges:":[54],"data":[55,89,172],"transfer":[56],"bottlenecks":[57],"as":[58,60],"well":[59],"problem":[61,87],"splitting":[62],"and":[63,88,174,197,208,211],"scheduling":[64],"multiple":[66],"workers":[67],"(GPUs)":[68],"with":[69,164,185],"distinct":[70],"memories.":[71],"We":[72,149,178],"demonstrate":[73],"that":[74],"current":[76,115],"methods":[79],"for":[80,96,126],"tackling":[81],"these":[82,133],"challenges":[83],"target":[84],"specific":[86],"characteristics,":[90],"resulting":[91],"serious":[93],"degradation":[95],"any":[97],"slightly":[98],"deviating":[99],"workload.":[100],"Additionally,":[101],"an":[102,154,165,182],"even":[103],"more":[104],"critical":[105],"decision":[106],"is":[107],"omitted":[108],"because":[109],"it":[110],"cannot":[111],"be":[112,124],"addressed":[113],"using":[114,140],"scheduler-based":[116],"approaches:":[117],"determination":[119],"which":[121],"devices":[122],"should":[123],"used":[125],"a":[127,137,206],"certain":[128],"routine":[129],"invocation.":[130],"To":[131],"address":[132],"issues":[134],"we":[135],"propose":[136],"model-based":[138],"approach:":[139],"estimation":[142],"provide":[144],"problem-specific":[145],"autotuning":[146,152,163],"during":[147],"runtime.":[148],"integrate":[150],"this":[151],"into":[153],"end-to-end":[155],"framework":[157,161],"named":[158],"PARALiA.":[159],"This":[160],"couples":[162],"task":[167],"scheduler,":[168],"leading":[169],"near-optimal":[171],"distribution":[173],"performance-aware":[175,217],"resource":[176],"utilization.":[177],"evaluate":[179],"PARALiA":[180],"testbed":[184],"8":[186],"NVIDIA-V100":[187],"improving":[189],"average":[191],"GEMM":[194],"1.7\u00d7":[196],"energy":[198],"efficiency":[199],"2.5\u00d7":[201],"over":[202],"state-of-the-art":[204],"large":[207],"diverse":[209],"dataset":[210],"demonstrating":[212],"adaptability":[214],"our":[216],"approach":[218],"future":[220],"heterogeneous":[221],"systems.":[222]},"counts_by_year":[{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":1}],"updated_date":"2026-05-21T06:26:12.895304","created_date":"2025-10-10T00:00:00"}
