{"id":"https://openalex.org/W1972783048","doi":"https://doi.org/10.1145/2688500.2688521","title":"On optimizing machine learning workloads via kernel fusion","display_name":"On optimizing machine learning workloads via kernel fusion","publication_year":2015,"publication_date":"2015-01-24","ids":{"openalex":"https://openalex.org/W1972783048","doi":"https://doi.org/10.1145/2688500.2688521","mag":"1972783048"},"language":"en","primary_location":{"id":"doi:10.1145/2688500.2688521","is_oa":false,"landing_page_url":"https://doi.org/10.1145/2688500.2688521","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 20th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5033048139","display_name":"Arash Ashari","orcid":null},"institutions":[{"id":"https://openalex.org/I52357470","display_name":"The Ohio State University","ror":"https://ror.org/00rs6vg23","country_code":"US","type":"education","lineage":["https://openalex.org/I52357470"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Arash Ashari","raw_affiliation_strings":["Ohio State University, USA","Ohio State University - USA"],"affiliations":[{"raw_affiliation_string":"Ohio State University, USA","institution_ids":["https://openalex.org/I52357470"]},{"raw_affiliation_string":"Ohio State University - USA","institution_ids":["https://openalex.org/I52357470"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059512590","display_name":"Shirish Tatikonda","orcid":null},"institutions":[{"id":"https://openalex.org/I1341412227","display_name":"IBM (United States)","ror":"https://ror.org/05hh8d621","country_code":"US","type":"company","lineage":["https://openalex.org/I1341412227"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Shirish Tatikonda","raw_affiliation_strings":["IBM, USA"],"affiliations":[{"raw_affiliation_string":"IBM, USA","institution_ids":["https://openalex.org/I1341412227"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5053153176","display_name":"Matthias B\u00f6ehm","orcid":"https://orcid.org/0000-0003-1344-3663"},"institutions":[{"id":"https://openalex.org/I1341412227","display_name":"IBM (United States)","ror":"https://ror.org/05hh8d621","country_code":"US","type":"company","lineage":["https://openalex.org/I1341412227"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Matthias Boehm","raw_affiliation_strings":["IBM, USA"],"affiliations":[{"raw_affiliation_string":"IBM, USA","institution_ids":["https://openalex.org/I1341412227"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5036119025","display_name":"Berthold Reinwald","orcid":null},"institutions":[{"id":"https://openalex.org/I1341412227","display_name":"IBM (United States)","ror":"https://ror.org/05hh8d621","country_code":"US","type":"company","lineage":["https://openalex.org/I1341412227"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Berthold Reinwald","raw_affiliation_strings":["IBM, USA"],"affiliations":[{"raw_affiliation_string":"IBM, USA","institution_ids":["https://openalex.org/I1341412227"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5018925092","display_name":"Keith Campbell","orcid":"https://orcid.org/0000-0003-0974-0788"},"institutions":[{"id":"https://openalex.org/I1341412227","display_name":"IBM (United States)","ror":"https://ror.org/05hh8d621","country_code":"US","type":"company","lineage":["https://openalex.org/I1341412227"]},{"id":"https://openalex.org/I4210113654","display_name":"IBM (Canada)","ror":"https://ror.org/025sxka56","country_code":"CA","type":"company","lineage":["https://openalex.org/I1341412227","https://openalex.org/I4210113654"]}],"countries":["CA","US"],"is_corresponding":false,"raw_author_name":"Keith Campbell","raw_affiliation_strings":["IBM, Canada","IBM Canada#TAB#"],"affiliations":[{"raw_affiliation_string":"IBM, Canada","institution_ids":["https://openalex.org/I4210113654"]},{"raw_affiliation_string":"IBM Canada#TAB#","institution_ids":["https://openalex.org/I1341412227"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5075808779","display_name":"John Keenleyside","orcid":null},"institutions":[{"id":"https://openalex.org/I4210113654","display_name":"IBM (Canada)","ror":"https://ror.org/025sxka56","country_code":"CA","type":"company","lineage":["https://openalex.org/I1341412227","https://openalex.org/I4210113654"]},{"id":"https://openalex.org/I1341412227","display_name":"IBM (United States)","ror":"https://ror.org/05hh8d621","country_code":"US","type":"company","lineage":["https://openalex.org/I1341412227"]}],"countries":["CA","US"],"is_corresponding":false,"raw_author_name":"John Keenleyside","raw_affiliation_strings":["IBM, Canada","IBM Canada#TAB#"],"affiliations":[{"raw_affiliation_string":"IBM, Canada","institution_ids":["https://openalex.org/I4210113654"]},{"raw_affiliation_string":"IBM Canada#TAB#","institution_ids":["https://openalex.org/I1341412227"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5027517817","display_name":"P. Sadayappan","orcid":"https://orcid.org/0000-0002-4737-2034"},"institutions":[{"id":"https://openalex.org/I52357470","display_name":"The Ohio State University","ror":"https://ror.org/00rs6vg23","country_code":"US","type":"education","lineage":["https://openalex.org/I52357470"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"P. Sadayappan","raw_affiliation_strings":["Ohio State University, USA","Ohio State University - USA"],"affiliations":[{"raw_affiliation_string":"Ohio State University, USA","institution_ids":["https://openalex.org/I52357470"]},{"raw_affiliation_string":"Ohio State University - USA","institution_ids":["https://openalex.org/I52357470"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5033048139"],"corresponding_institution_ids":["https://openalex.org/I52357470"],"apc_list":null,"apc_paid":null,"fwci":5.4907,"has_fulltext":false,"cited_by_count":40,"citation_normalized_percentile":{"value":0.96107056,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":90,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"173","last_page":"182"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12303","display_name":"Tensor decomposition and applications","score":0.9972000122070312,"subfield":{"id":"https://openalex.org/subfields/2605","display_name":"Computational Mathematics"},"field":{"id":"https://openalex.org/fields/26","display_name":"Mathematics"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11612","display_name":"Stochastic Gradient Optimization Techniques","score":0.9968000054359436,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8114972114562988},{"id":"https://openalex.org/keywords/kernel","display_name":"Kernel (algebra)","score":0.7412787675857544},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.6796369552612305},{"id":"https://openalex.org/keywords/locality","display_name":"Locality","score":0.605779230594635},{"id":"https://openalex.org/keywords/matrix-multiplication","display_name":"Matrix multiplication","score":0.5457361340522766},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.5362040400505066},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.5204077959060669},{"id":"https://openalex.org/keywords/operator","display_name":"Operator (biology)","score":0.4369462728500366},{"id":"https://openalex.org/keywords/range","display_name":"Range (aeronautics)","score":0.41387268900871277},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.38116219639778137},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.10808798670768738}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8114972114562988},{"id":"https://openalex.org/C74193536","wikidata":"https://www.wikidata.org/wiki/Q574844","display_name":"Kernel (algebra)","level":2,"score":0.7412787675857544},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.6796369552612305},{"id":"https://openalex.org/C2779808786","wikidata":"https://www.wikidata.org/wiki/Q6664603","display_name":"Locality","level":2,"score":0.605779230594635},{"id":"https://openalex.org/C17349429","wikidata":"https://www.wikidata.org/wiki/Q1049914","display_name":"Matrix multiplication","level":3,"score":0.5457361340522766},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.5362040400505066},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.5204077959060669},{"id":"https://openalex.org/C17020691","wikidata":"https://www.wikidata.org/wiki/Q139677","display_name":"Operator (biology)","level":5,"score":0.4369462728500366},{"id":"https://openalex.org/C204323151","wikidata":"https://www.wikidata.org/wiki/Q905424","display_name":"Range (aeronautics)","level":2,"score":0.41387268900871277},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.38116219639778137},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.10808798670768738},{"id":"https://openalex.org/C192562407","wikidata":"https://www.wikidata.org/wiki/Q228736","display_name":"Materials science","level":0,"score":0.0},{"id":"https://openalex.org/C55493867","wikidata":"https://www.wikidata.org/wiki/Q7094","display_name":"Biochemistry","level":1,"score":0.0},{"id":"https://openalex.org/C159985019","wikidata":"https://www.wikidata.org/wiki/Q181790","display_name":"Composite material","level":1,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C84114770","wikidata":"https://www.wikidata.org/wiki/Q46344","display_name":"Quantum","level":2,"score":0.0},{"id":"https://openalex.org/C104317684","wikidata":"https://www.wikidata.org/wiki/Q7187","display_name":"Gene","level":2,"score":0.0},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.0},{"id":"https://openalex.org/C86339819","wikidata":"https://www.wikidata.org/wiki/Q407384","display_name":"Transcription factor","level":3,"score":0.0},{"id":"https://openalex.org/C158448853","wikidata":"https://www.wikidata.org/wiki/Q425218","display_name":"Repressor","level":4,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C114614502","wikidata":"https://www.wikidata.org/wiki/Q76592","display_name":"Combinatorics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/2688500.2688521","is_oa":false,"landing_page_url":"https://doi.org/10.1145/2688500.2688521","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 20th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Industry, innovation and infrastructure","score":0.49000000953674316,"id":"https://metadata.un.org/sdg/9"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":29,"referenced_works":["https://openalex.org/W191714665","https://openalex.org/W753012316","https://openalex.org/W1528905581","https://openalex.org/W1536816303","https://openalex.org/W1583837637","https://openalex.org/W1762731526","https://openalex.org/W1999702719","https://openalex.org/W2016279572","https://openalex.org/W2083958625","https://openalex.org/W2099102906","https://openalex.org/W2102458936","https://openalex.org/W2120432001","https://openalex.org/W2125621954","https://openalex.org/W2128853364","https://openalex.org/W2138621811","https://openalex.org/W2139410361","https://openalex.org/W2147898188","https://openalex.org/W2152175008","https://openalex.org/W2159350554","https://openalex.org/W2162390675","https://openalex.org/W2167865917","https://openalex.org/W2185864411","https://openalex.org/W2255383145","https://openalex.org/W3138798301","https://openalex.org/W4234561867","https://openalex.org/W6631784147","https://openalex.org/W6680561749","https://openalex.org/W6683722107","https://openalex.org/W6685324724"],"related_works":["https://openalex.org/W1556451512","https://openalex.org/W1555349535","https://openalex.org/W4234091740","https://openalex.org/W4213350282","https://openalex.org/W2230171082","https://openalex.org/W2583128298","https://openalex.org/W2022275305","https://openalex.org/W1604115909","https://openalex.org/W2571250724","https://openalex.org/W1515810787"],"abstract_inverted_index":{"Exploitation":[0],"of":[1,16,46,84,100,147,165,204,224],"parallel":[2],"architectures":[3],"has":[4],"become":[5],"critical":[6],"to":[7,123,133,151,183,199,209],"scalable":[8],"machine":[9],"learning":[10],"(ML).":[11],"Since":[12],"a":[13,28,82,120],"wide":[14],"range":[15],"ML":[17,48,69,93,234],"algorithms":[18],"employ":[19],"linear":[20,57],"algebraic":[21,58],"operators,":[22],"GPUs":[23,128],"with":[24,130],"BLAS":[25],"libraries":[26],"are":[27,37,64,88],"natural":[29],"choice":[30],"for":[31,55,81,187,201],"such":[32],"an":[33,171,232],"exploitation.":[34],"Two":[35],"approaches":[36],"commonly":[38,89],"pursued:":[39],"(i)":[40],"developing":[41,52,68,78],"specific":[42],"GPU":[43,53,181,215],"accelerated":[44,216],"implementations":[45],"complete":[47],"algorithms;":[49],"and":[50,114,137,162,179],"(ii)":[51],"kernels":[54,80,213],"primitive":[56,85],"operators":[59,86],"like":[60,160],"matrix-vector":[61],"multiplication,":[62],"which":[63],"then":[65],"used":[66],"in":[67,91,227],"algorithms.":[70,94],"This":[71,140],"paper":[72],"extends":[73],"the":[74,97,145,205,222,225],"latter":[75],"approach":[76,141,193,226],"by":[77,220],"fused":[79,121],"combination":[83],"that":[87,174],"found":[90],"popular":[92],"We":[95,118,168,218],"identify":[96],"generic":[98,206],"pattern":[99,207],"computation":[101,126],"(alpha":[102],"*":[103,106,108,112],"X^T":[104],"(v":[105],"(X":[107],"y))":[109],"+":[110],"beta":[111],"z)":[113],"its":[115],"various":[116],"instantiations.":[117],"develop":[119],"kernel":[122,188],"optimize":[124],"this":[125],"on":[127,231],"--":[129],"specialized":[131],"techniques":[132],"handle":[134],"both":[135],"sparse":[136],"dense":[138],"matrices.":[139],"not":[142],"only":[143],"reduces":[144],"cost":[146],"data":[148,177],"loads":[149],"due":[150],"improved":[152],"temporal":[153],"locality":[154],"but":[155],"also":[156,169],"enables":[157],"other":[158],"optimizations":[159],"coarsening":[161],"hierarchical":[163],"aggregation":[164],"partial":[166],"results.":[167],"present":[170],"analytical":[172],"model":[173],"considers":[175],"input":[176],"characteristics":[178],"available":[180],"resources":[182],"estimate":[184],"near-optimal":[185],"settings":[186],"launch":[189],"parameters.":[190],"The":[191],"proposed":[192],"provides":[194],"speedups":[195],"ranging":[196],"from":[197],"2":[198],"67":[200],"different":[202],"instances":[203],"compared":[208],"launching":[210],"multiple":[211],"operator-level":[212],"using":[214],"libraries.":[217],"conclude":[219],"demonstrating":[221],"effectiveness":[223],"improving":[228],"end-to-end":[229],"performance":[230],"entire":[233],"algorithm.":[235]},"counts_by_year":[{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":2},{"year":2022,"cited_by_count":5},{"year":2021,"cited_by_count":2},{"year":2020,"cited_by_count":6},{"year":2019,"cited_by_count":5},{"year":2018,"cited_by_count":5},{"year":2017,"cited_by_count":6},{"year":2016,"cited_by_count":5},{"year":2015,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
