{"id":"https://openalex.org/W2901549770","doi":"https://doi.org/10.1145/3330345.3331057","title":"Accelerating reduction and scan using tensor core units","display_name":"Accelerating reduction and scan using tensor core units","publication_year":2019,"publication_date":"2019-06-18","ids":{"openalex":"https://openalex.org/W2901549770","doi":"https://doi.org/10.1145/3330345.3331057","mag":"2901549770"},"language":"en","primary_location":{"id":"doi:10.1145/3330345.3331057","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3330345.3331057","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM International Conference on Supercomputing","raw_type":"proceedings-article"},"type":"article","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/1811.09736","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Abdul Dakkak","orcid":null},"institutions":[{"id":"https://openalex.org/I157725225","display_name":"University of Illinois Urbana-Champaign","ror":"https://ror.org/047426m28","country_code":"US","type":"education","lineage":["https://openalex.org/I157725225"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Abdul Dakkak","raw_affiliation_strings":["University of Illinois Urbana-Champaign"],"affiliations":[{"raw_affiliation_string":"University of Illinois Urbana-Champaign","institution_ids":["https://openalex.org/I157725225"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Cheng Li","orcid":null},"institutions":[{"id":"https://openalex.org/I157725225","display_name":"University of Illinois Urbana-Champaign","ror":"https://ror.org/047426m28","country_code":"US","type":"education","lineage":["https://openalex.org/I157725225"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Cheng Li","raw_affiliation_strings":["University of Illinois Urbana-Champaign"],"affiliations":[{"raw_affiliation_string":"University of Illinois Urbana-Champaign","institution_ids":["https://openalex.org/I157725225"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Jinjun Xiong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jinjun Xiong","raw_affiliation_strings":["IBM T. J. Watson Research Center"],"affiliations":[{"raw_affiliation_string":"IBM T. J. Watson Research Center","institution_ids":[]}]},{"author_position":"middle","author":{"id":null,"display_name":"Isaac Gelado","orcid":null},"institutions":[{"id":"https://openalex.org/I4210127875","display_name":"Nvidia (United States)","ror":"https://ror.org/03jdj4y14","country_code":"US","type":"company","lineage":["https://openalex.org/I4210127875"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Isaac Gelado","raw_affiliation_strings":["NVIDIA Corporation"],"affiliations":[{"raw_affiliation_string":"NVIDIA Corporation","institution_ids":["https://openalex.org/I4210127875"]}]},{"author_position":"last","author":{"id":null,"display_name":"Wen-mei Hwu","orcid":null},"institutions":[{"id":"https://openalex.org/I157725225","display_name":"University of Illinois Urbana-Champaign","ror":"https://ror.org/047426m28","country_code":"US","type":"education","lineage":["https://openalex.org/I157725225"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Wen-mei Hwu","raw_affiliation_strings":["University of Illinois Urbana-Champaign"],"affiliations":[{"raw_affiliation_string":"University of Illinois Urbana-Champaign","institution_ids":["https://openalex.org/I157725225"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I157725225"],"apc_list":null,"apc_paid":null,"fwci":4.9882,"has_fulltext":false,"cited_by_count":75,"citation_normalized_percentile":{"value":0.96909765,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":99,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"46","last_page":"57"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12303","display_name":"Tensor decomposition and applications","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/2605","display_name":"Computational Mathematics"},"field":{"id":"https://openalex.org/fields/26","display_name":"Mathematics"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12303","display_name":"Tensor decomposition and applications","score":0.9997000098228455,"subfield":{"id":"https://openalex.org/subfields/2605","display_name":"Computational Mathematics"},"field":{"id":"https://openalex.org/fields/26","display_name":"Mathematics"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10363","display_name":"Low-power high-performance VLSI design","score":0.9814000129699707,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/reduction","display_name":"Reduction (mathematics)","score":0.7785000205039978},{"id":"https://openalex.org/keywords/matrix-multiplication","display_name":"Matrix multiplication","score":0.6908000111579895},{"id":"https://openalex.org/keywords/multiplication","display_name":"Multiplication (music)","score":0.6452000141143799},{"id":"https://openalex.org/keywords/matrix","display_name":"Matrix (chemical analysis)","score":0.6108999848365784},{"id":"https://openalex.org/keywords/tensor","display_name":"Tensor (intrinsic definition)","score":0.607200026512146},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.5235000252723694},{"id":"https://openalex.org/keywords/class","display_name":"Class (philosophy)","score":0.39910000562667847},{"id":"https://openalex.org/keywords/power","display_name":"Power (physics)","score":0.3659999966621399}],"concepts":[{"id":"https://openalex.org/C111335779","wikidata":"https://www.wikidata.org/wiki/Q3454686","display_name":"Reduction (mathematics)","level":2,"score":0.7785000205039978},{"id":"https://openalex.org/C17349429","wikidata":"https://www.wikidata.org/wiki/Q1049914","display_name":"Matrix multiplication","level":3,"score":0.6908000111579895},{"id":"https://openalex.org/C2780595030","wikidata":"https://www.wikidata.org/wiki/Q3860309","display_name":"Multiplication (music)","level":2,"score":0.6452000141143799},{"id":"https://openalex.org/C106487976","wikidata":"https://www.wikidata.org/wiki/Q685816","display_name":"Matrix (chemical analysis)","level":2,"score":0.6108999848365784},{"id":"https://openalex.org/C155281189","wikidata":"https://www.wikidata.org/wiki/Q3518150","display_name":"Tensor (intrinsic definition)","level":2,"score":0.607200026512146},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5825999975204468},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.5235000252723694},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.5194000005722046},{"id":"https://openalex.org/C2777212361","wikidata":"https://www.wikidata.org/wiki/Q5127848","display_name":"Class (philosophy)","level":2,"score":0.39910000562667847},{"id":"https://openalex.org/C94375191","wikidata":"https://www.wikidata.org/wiki/Q11205","display_name":"Arithmetic","level":1,"score":0.39169999957084656},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.37619999051094055},{"id":"https://openalex.org/C163258240","wikidata":"https://www.wikidata.org/wiki/Q25342","display_name":"Power (physics)","level":2,"score":0.3659999966621399},{"id":"https://openalex.org/C2164484","wikidata":"https://www.wikidata.org/wiki/Q5170150","display_name":"Core (optical fiber)","level":2,"score":0.33739998936653137},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.33550000190734863},{"id":"https://openalex.org/C2780165032","wikidata":"https://www.wikidata.org/wiki/Q16869822","display_name":"Energy consumption","level":2,"score":0.33219999074935913},{"id":"https://openalex.org/C165443888","wikidata":"https://www.wikidata.org/wiki/Q1482183","display_name":"Transformation matrix","level":3,"score":0.3165000081062317},{"id":"https://openalex.org/C56858530","wikidata":"https://www.wikidata.org/wiki/Q15947151","display_name":"Reduction strategy","level":2,"score":0.31450000405311584},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.3027999997138977},{"id":"https://openalex.org/C39096654","wikidata":"https://www.wikidata.org/wiki/Q728507","display_name":"Strassen algorithm","level":4,"score":0.28200000524520874},{"id":"https://openalex.org/C186370098","wikidata":"https://www.wikidata.org/wiki/Q442787","display_name":"Energy (signal processing)","level":2,"score":0.2797999978065491},{"id":"https://openalex.org/C2780365336","wikidata":"https://www.wikidata.org/wiki/Q25047934","display_name":"Single-core","level":2,"score":0.2549000084400177},{"id":"https://openalex.org/C3017489831","wikidata":"https://www.wikidata.org/wiki/Q2393193","display_name":"Running time","level":2,"score":0.25429999828338623},{"id":"https://openalex.org/C2984118289","wikidata":"https://www.wikidata.org/wiki/Q29954","display_name":"Power consumption","level":3,"score":0.25290000438690186},{"id":"https://openalex.org/C459310","wikidata":"https://www.wikidata.org/wiki/Q117801","display_name":"Computational science","level":1,"score":0.25220000743865967}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1145/3330345.3331057","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3330345.3331057","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM International Conference on Supercomputing","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:1811.09736","is_oa":true,"landing_page_url":"http://arxiv.org/abs/1811.09736","pdf_url":"https://arxiv.org/pdf/1811.09736","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:1811.09736","is_oa":true,"landing_page_url":"http://arxiv.org/abs/1811.09736","pdf_url":"https://arxiv.org/pdf/1811.09736","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":24,"referenced_works":["https://openalex.org/W1452091326","https://openalex.org/W1973959314","https://openalex.org/W1988888548","https://openalex.org/W2017086619","https://openalex.org/W2019708326","https://openalex.org/W2024165284","https://openalex.org/W2033961328","https://openalex.org/W2087977509","https://openalex.org/W2141524575","https://openalex.org/W2199215797","https://openalex.org/W2469604800","https://openalex.org/W2494147643","https://openalex.org/W2508092574","https://openalex.org/W2554770544","https://openalex.org/W2594730095","https://openalex.org/W2604808181","https://openalex.org/W2752862883","https://openalex.org/W2765439756","https://openalex.org/W2780077279","https://openalex.org/W4211087647","https://openalex.org/W4236440911","https://openalex.org/W4248149126","https://openalex.org/W4250027548","https://openalex.org/W4256402776"],"related_works":[],"abstract_inverted_index":{"Driven":[0],"by":[1,195],"deep":[2,45,182],"learning,":[3],"there":[4],"has":[5],"been":[6],"a":[7],"surge":[8],"of":[9,26,85,108,121,148,157],"specialized":[10],"processors":[11],"for":[12,164,169,174,199,203],"matrix":[13,28,67,86],"multiplication,":[14],"referred":[15],"to":[16,41,102,104,118,161,197],"as":[17,65,111],"Tensor":[18],"Core":[19],"Units":[20],"(TCUs).":[21],"These":[22],"TCUs":[23,49,142],"are":[24,50,155],"capable":[25],"performing":[27],"multiplications":[29],"on":[30,69],"small":[31,70,175],"matrices":[32,71],"(usually":[33],"4":[34,36],"\u00d7":[35,39,163,168],"or":[37],"16":[38],"16)":[40],"accelerate":[42],"HPC":[43,180],"and":[44,52,81,89,114,130,136,143,166,181,201],"learning":[46,183],"workloads.":[47],"Although":[48],"prevalent":[51],"promise":[53],"increase":[54],"in":[55,83,124,179],"performance":[56],"and/or":[57],"energy":[58],"efficiency,":[59,129],"they":[60],"suffer":[61],"from":[62],"over":[63],"specialization":[64],"only":[66],"multiplication":[68,87],"is":[72,99,115],"supported.":[73],"In":[74],"this":[75,97,122,188],"paper":[76,98],"we":[77],"express":[78],"both":[79],"reduction":[80,135,165,200],"scan":[82,137],"terms":[84,125],"operations":[88,113],"map":[90],"them":[91],"onto":[92],"TCUs.":[93],"To":[94],"our":[95],"knowledge,":[96],"the":[100,106,116,134,192],"first":[101,117],"try":[103],"broaden":[105],"class":[107],"algorithms":[109,138],"expressible":[110],"TCU":[112],"show":[119],"benefits":[120],"mapping":[123],"of:":[126],"program":[127],"simplicity,":[128],"performance.":[131],"We":[132],"implemented":[133],"using":[139],"NVIDIA's":[140],"V100":[141],"achieved":[144],"89%":[145],"--":[146],"98%":[147],"peak":[149],"memory":[150],"copy":[151],"bandwidth.":[152],"Our":[153,185],"results":[154],"orders":[156],"magnitude":[158],"faster":[159],"(up":[160],"100":[162],"3":[167],"scan)":[170],"than":[171],"state-of-the-art":[172],"methods":[173],"segment":[176],"sizes":[177],"(common":[178],"applications).":[184],"implementation":[186],"achieves":[187],"speedup":[189],"while":[190],"decreasing":[191],"power":[193],"consumption":[194],"up":[196],"22%":[198],"16%":[202],"scan.":[204]},"counts_by_year":[{"year":2026,"cited_by_count":4},{"year":2025,"cited_by_count":12},{"year":2024,"cited_by_count":13},{"year":2023,"cited_by_count":12},{"year":2022,"cited_by_count":14},{"year":2021,"cited_by_count":10},{"year":2020,"cited_by_count":10}],"updated_date":"2026-03-20T23:20:44.827607","created_date":"2018-11-29T00:00:00"}
