{"id":"https://openalex.org/W3000382190","doi":"https://doi.org/10.1109/tpds.2020.3011893","title":"GPU Tensor Cores for Fast Arithmetic Reductions","display_name":"GPU Tensor Cores for Fast Arithmetic Reductions","publication_year":2020,"publication_date":"2020-07-24","ids":{"openalex":"https://openalex.org/W3000382190","doi":"https://doi.org/10.1109/tpds.2020.3011893","mag":"3000382190"},"language":"en","primary_location":{"id":"doi:10.1109/tpds.2020.3011893","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpds.2020.3011893","pdf_url":null,"source":{"id":"https://openalex.org/S97130795","display_name":"IEEE Transactions on Parallel and Distributed Systems","issn_l":"1045-9219","issn":["1045-9219","1558-2183","2161-9883"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Parallel and Distributed Systems","raw_type":"journal-article"},"type":"preprint","indexed_in":["arxiv","crossref","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2001.05585","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5088815725","display_name":"Crist\u00f3bal A. Navarro","orcid":"https://orcid.org/0000-0001-7090-9904"},"institutions":[{"id":"https://openalex.org/I16465266","display_name":"Austral University of Chile","ror":"https://ror.org/029ycp228","country_code":"CL","type":"education","lineage":["https://openalex.org/I16465266"]}],"countries":["CL"],"is_corresponding":false,"raw_author_name":"Cristobal A. Navarro","raw_affiliation_strings":["Institute of Informatics, Universidad Austral de Chile, Valdivia, Chile","Institute of Informatics, Universidad Austral de Chile, Valdivia, Los Ros, Chile"],"raw_orcid":"https://orcid.org/0000-0001-7090-9904","affiliations":[{"raw_affiliation_string":"Institute of Informatics, Universidad Austral de Chile, Valdivia, Chile","institution_ids":["https://openalex.org/I16465266"]},{"raw_affiliation_string":"Institute of Informatics, Universidad Austral de Chile, Valdivia, Los Ros, Chile","institution_ids":["https://openalex.org/I16465266"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5003936894","display_name":"Roberto Carrasco","orcid":"https://orcid.org/0000-0002-1081-9170"},"institutions":[{"id":"https://openalex.org/I16465266","display_name":"Austral University of Chile","ror":"https://ror.org/029ycp228","country_code":"CL","type":"education","lineage":["https://openalex.org/I16465266"]}],"countries":["CL"],"is_corresponding":false,"raw_author_name":"Roberto Carrasco","raw_affiliation_strings":["Institute of Informatics, Universidad Austral de Chile, Valdivia, Chile","Institute of Informatics, Universidad Austral de Chile, Valdivia, Los Ros, Chile"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Institute of Informatics, Universidad Austral de Chile, Valdivia, Chile","institution_ids":["https://openalex.org/I16465266"]},{"raw_affiliation_string":"Institute of Informatics, Universidad Austral de Chile, Valdivia, Los Ros, Chile","institution_ids":["https://openalex.org/I16465266"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5066947919","display_name":"Ricardo J. Barrientos","orcid":"https://orcid.org/0000-0001-5345-7061"},"institutions":[{"id":"https://openalex.org/I2799535320","display_name":"Catholic University of the Maule","ror":"https://ror.org/04vdpck27","country_code":"CL","type":"education","lineage":["https://openalex.org/I2799535320"]}],"countries":["CL"],"is_corresponding":false,"raw_author_name":"Ricardo J. Barrientos","raw_affiliation_strings":["Laboratory of Technological Research in Pattern Recognition (LITRP), Department of DCI, Faculty of Engineering Science, Universidad Cat\u00f3lica del Maule, San Miguel, Chile","Laboratory of Technological Research in Pattern Recognition (LITRP), Department of DCI, Faculty of Engineering Science, Universidad Cat\u00f3lica del Maule, San Miguel, Talca, Chile"],"raw_orcid":"https://orcid.org/0000-0001-5345-7061","affiliations":[{"raw_affiliation_string":"Laboratory of Technological Research in Pattern Recognition (LITRP), Department of DCI, Faculty of Engineering Science, Universidad Cat\u00f3lica del Maule, San Miguel, Chile","institution_ids":["https://openalex.org/I2799535320"]},{"raw_affiliation_string":"Laboratory of Technological Research in Pattern Recognition (LITRP), Department of DCI, Faculty of Engineering Science, Universidad Cat\u00f3lica del Maule, San Miguel, Talca, Chile","institution_ids":["https://openalex.org/I2799535320"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5072981539","display_name":"Javier A. Riquelme","orcid":"https://orcid.org/0000-0001-5389-2207"},"institutions":[{"id":"https://openalex.org/I2799535320","display_name":"Catholic University of the Maule","ror":"https://ror.org/04vdpck27","country_code":"CL","type":"education","lineage":["https://openalex.org/I2799535320"]}],"countries":["CL"],"is_corresponding":false,"raw_author_name":"Javier A. Riquelme","raw_affiliation_strings":["Laboratory of Technological Research in Pattern Recognition (LITRP), Department of DCI, Faculty of Engineering Science, Universidad Cat\u00f3lica del Maule, San Miguel, Chile","Laboratory of Technological Research in Pattern Recognition (LITRP), Department of DCI, Faculty of Engineering Science, Universidad Cat\u00f3lica del Maule, San Miguel, Talca, Chile"],"raw_orcid":"https://orcid.org/0000-0001-5389-2207","affiliations":[{"raw_affiliation_string":"Laboratory of Technological Research in Pattern Recognition (LITRP), Department of DCI, Faculty of Engineering Science, Universidad Cat\u00f3lica del Maule, San Miguel, Chile","institution_ids":["https://openalex.org/I2799535320"]},{"raw_affiliation_string":"Laboratory of Technological Research in Pattern Recognition (LITRP), Department of DCI, Faculty of Engineering Science, Universidad Cat\u00f3lica del Maule, San Miguel, Talca, Chile","institution_ids":["https://openalex.org/I2799535320"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5056292437","display_name":"Raimundo Vega","orcid":null},"institutions":[{"id":"https://openalex.org/I16465266","display_name":"Austral University of Chile","ror":"https://ror.org/029ycp228","country_code":"CL","type":"education","lineage":["https://openalex.org/I16465266"]}],"countries":["CL"],"is_corresponding":false,"raw_author_name":"Raimundo Vega","raw_affiliation_strings":["Institute of Informatics, Universidad Austral de Chile, Valdivia, Chile","Institute of Informatics, Universidad Austral de Chile, Valdivia, Los Ros, Chile"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Institute of Informatics, Universidad Austral de Chile, Valdivia, Chile","institution_ids":["https://openalex.org/I16465266"]},{"raw_affiliation_string":"Institute of Informatics, Universidad Austral de Chile, Valdivia, Los Ros, Chile","institution_ids":["https://openalex.org/I16465266"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.01774898,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"32","issue":"1","first_page":"72","last_page":"84"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9972000122070312,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12303","display_name":"Tensor decomposition and applications","score":0.9929999709129333,"subfield":{"id":"https://openalex.org/subfields/2605","display_name":"Computational Mathematics"},"field":{"id":"https://openalex.org/fields/26","display_name":"Mathematics"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/thread","display_name":"Thread (computing)","score":0.7767823934555054},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.642421305179596},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.6266593337059021},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6146664619445801},{"id":"https://openalex.org/keywords/floating-point","display_name":"Floating point","score":0.5378071665763855},{"id":"https://openalex.org/keywords/tensor","display_name":"Tensor (intrinsic definition)","score":0.4836789071559906},{"id":"https://openalex.org/keywords/reduction","display_name":"Reduction (mathematics)","score":0.4752333462238312},{"id":"https://openalex.org/keywords/block","display_name":"Block (permutation group theory)","score":0.4101046025753021},{"id":"https://openalex.org/keywords/arithmetic","display_name":"Arithmetic","score":0.3773896396160126},{"id":"https://openalex.org/keywords/computational-science","display_name":"Computational science","score":0.3743610680103302},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.3397347927093506},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.2944241464138031},{"id":"https://openalex.org/keywords/combinatorics","display_name":"Combinatorics","score":0.17410001158714294},{"id":"https://openalex.org/keywords/geometry","display_name":"Geometry","score":0.11394384503364563}],"concepts":[{"id":"https://openalex.org/C138101251","wikidata":"https://www.wikidata.org/wiki/Q213092","display_name":"Thread (computing)","level":2,"score":0.7767823934555054},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.642421305179596},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.6266593337059021},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6146664619445801},{"id":"https://openalex.org/C84211073","wikidata":"https://www.wikidata.org/wiki/Q117879","display_name":"Floating point","level":2,"score":0.5378071665763855},{"id":"https://openalex.org/C155281189","wikidata":"https://www.wikidata.org/wiki/Q3518150","display_name":"Tensor (intrinsic definition)","level":2,"score":0.4836789071559906},{"id":"https://openalex.org/C111335779","wikidata":"https://www.wikidata.org/wiki/Q3454686","display_name":"Reduction (mathematics)","level":2,"score":0.4752333462238312},{"id":"https://openalex.org/C2777210771","wikidata":"https://www.wikidata.org/wiki/Q4927124","display_name":"Block (permutation group theory)","level":2,"score":0.4101046025753021},{"id":"https://openalex.org/C94375191","wikidata":"https://www.wikidata.org/wiki/Q11205","display_name":"Arithmetic","level":1,"score":0.3773896396160126},{"id":"https://openalex.org/C459310","wikidata":"https://www.wikidata.org/wiki/Q117801","display_name":"Computational science","level":1,"score":0.3743610680103302},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.3397347927093506},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.2944241464138031},{"id":"https://openalex.org/C114614502","wikidata":"https://www.wikidata.org/wiki/Q76592","display_name":"Combinatorics","level":1,"score":0.17410001158714294},{"id":"https://openalex.org/C2524010","wikidata":"https://www.wikidata.org/wiki/Q8087","display_name":"Geometry","level":1,"score":0.11394384503364563},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0}],"mesh":[],"locations_count":4,"locations":[{"id":"doi:10.1109/tpds.2020.3011893","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpds.2020.3011893","pdf_url":null,"source":{"id":"https://openalex.org/S97130795","display_name":"IEEE Transactions on Parallel and Distributed Systems","issn_l":"1045-9219","issn":["1045-9219","1558-2183","2161-9883"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Parallel and Distributed Systems","raw_type":"journal-article"},{"id":"pmh:oai:arXiv.org:2001.05585","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2001.05585","pdf_url":"https://arxiv.org/pdf/2001.05585","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":null},{"id":"mag:3000382190","is_oa":true,"landing_page_url":"https://arxiv.org/pdf/2001.05585.pdf","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"arXiv (Cornell University)","raw_type":null},{"id":"doi:10.48550/arxiv.2001.05585","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2001.05585","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2001.05585","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2001.05585","pdf_url":"https://arxiv.org/pdf/2001.05585","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":null},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W3000382190.pdf","grobid_xml":"https://content.openalex.org/works/W3000382190.grobid-xml"},"referenced_works_count":44,"referenced_works":["https://openalex.org/W1847196819","https://openalex.org/W2006543911","https://openalex.org/W2028499920","https://openalex.org/W2036424138","https://openalex.org/W2050883661","https://openalex.org/W2076039939","https://openalex.org/W2076063813","https://openalex.org/W2129690141","https://openalex.org/W2130566259","https://openalex.org/W2133156997","https://openalex.org/W2137920044","https://openalex.org/W2138782497","https://openalex.org/W2145460710","https://openalex.org/W2152290683","https://openalex.org/W2161190431","https://openalex.org/W2164417656","https://openalex.org/W2170975953","https://openalex.org/W2173213060","https://openalex.org/W2293586629","https://openalex.org/W2585560244","https://openalex.org/W2602975612","https://openalex.org/W2606722458","https://openalex.org/W2737368828","https://openalex.org/W2765439756","https://openalex.org/W2767272762","https://openalex.org/W2790629101","https://openalex.org/W2791673912","https://openalex.org/W2793320545","https://openalex.org/W2796649226","https://openalex.org/W2801748224","https://openalex.org/W2809452033","https://openalex.org/W2810498815","https://openalex.org/W2895162358","https://openalex.org/W2901549770","https://openalex.org/W2907701003","https://openalex.org/W2919115771","https://openalex.org/W2920798074","https://openalex.org/W2953212265","https://openalex.org/W3101543398","https://openalex.org/W3104528661","https://openalex.org/W4256629673","https://openalex.org/W6750448596","https://openalex.org/W6757794950","https://openalex.org/W6760272376"],"related_works":["https://openalex.org/W3044913359","https://openalex.org/W2481847753","https://openalex.org/W3166178281","https://openalex.org/W2747828031","https://openalex.org/W2907795537","https://openalex.org/W2809452033","https://openalex.org/W2945913897","https://openalex.org/W3034986667","https://openalex.org/W2190304840","https://openalex.org/W1951139325","https://openalex.org/W2364719088","https://openalex.org/W3206065062","https://openalex.org/W2995900289","https://openalex.org/W2751658790","https://openalex.org/W1997771347","https://openalex.org/W184625769","https://openalex.org/W2538137543","https://openalex.org/W2990615851","https://openalex.org/W2972461481","https://openalex.org/W2160144917"],"abstract_inverted_index":{"This":[0],"article":[1],"proposes":[2],"a":[3,15,45,58,69,77,100,120,132],"parallel":[4,46,71],"algorithm":[5,43,130],"for":[6,165,174],"computing":[7],"the":[8,41,83,111,129,139,147,178],"arithmetic":[9],"reduction":[10,103],"of":[11,17,34,49,60,128,135,180],"n":[12,56],"numbers":[13],"as":[14],"set":[16],"matrix-multiply":[18],"accumulate":[19],"(MMA)":[20],"operations":[21],"that":[22,40,82,157,176],"are":[23,161],"executed":[24],"simultaneously":[25],"by":[26],"GPU":[27,80,136,158],"tensor":[28,159],"cores.":[29],"The":[30,125,150],"analysis,":[31],"assuming":[32],"tensors":[33],"size":[35],"m":[36],"x":[37],"m,":[38],"shows":[39],"proposed":[42],"has":[44],"running":[47],"time":[48],"T(n)":[50],"=":[51,62],"5log":[52],"<sub":[53,64],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[54,65],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">m2</sub>":[55],"and":[57,90,96,104,138],"speedup":[59],"S":[61],"45log":[63],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">2m</sub>":[66],"2":[67],"over":[68],"canonical":[70],"reduction.":[72,124],"Experimental":[73],"performance":[74],"results":[75,151],"on":[76],"Tesla":[78],"V100":[79],"show":[81,156],"tensor-core":[84],"based":[85],"approach":[86],"is":[87],"energy":[88],"efficient":[89],"runs":[91],"up":[92],"to":[93,119],"~":[94],"3:2\u00d7":[95],"2\u00d7":[97],"faster":[98],"than":[99],"standard":[101],"GPU-based":[102],"Nvidia's":[105],"CUB":[106],"library,":[107],"respectively,":[108],"while":[109],"keeping":[110],"numerical":[112],"error":[113],"below":[114],"1":[115],"percent":[116],"with":[117,146],"respect":[118],"double":[121],"precision":[122],"CPU":[123],"chained":[126],"design":[127],"allows":[131],"flexible":[133],"configuration":[134],"thread-blocks":[137],"optimal":[140],"values":[141],"found":[142],"through":[143],"experimentation":[144],"agree":[145],"theoretical":[148],"ones.":[149],"obtained":[152],"in":[153],"this":[154],"work":[155],"cores":[160],"relevant":[162],"not":[163],"only":[164],"Deep":[166],"Learning":[167],"or":[168],"Linear":[169],"Algebra":[170],"computations,":[171],"but":[172],"also":[173],"applications":[175],"require":[177],"acceleration":[179],"large":[181],"summations.":[182]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
