{"id":"https://openalex.org/W4405778707","doi":"https://doi.org/10.1109/tpds.2024.3522776","title":"High Performance Householder QR Factorization on Emerging GPU Architectures Using Tensor Cores","display_name":"High Performance Householder QR Factorization on Emerging GPU Architectures Using Tensor Cores","publication_year":2024,"publication_date":"2024-12-25","ids":{"openalex":"https://openalex.org/W4405778707","doi":"https://doi.org/10.1109/tpds.2024.3522776"},"language":"en","primary_location":{"id":"doi:10.1109/tpds.2024.3522776","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpds.2024.3522776","pdf_url":null,"source":{"id":"https://openalex.org/S97130795","display_name":"IEEE Transactions on Parallel and Distributed Systems","issn_l":"1045-9219","issn":["1045-9219","1558-2183","2161-9883"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Parallel and Distributed Systems","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5033967892","display_name":"Y.J. Leng","orcid":"https://orcid.org/0009-0005-3381-6510"},"institutions":[{"id":"https://openalex.org/I150229711","display_name":"University of Electronic Science and Technology of China","ror":"https://ror.org/04qr3zq92","country_code":"CN","type":"education","lineage":["https://openalex.org/I150229711"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Yuhan Leng","raw_affiliation_strings":["School of Computer Science and Engineering, University of Electronic Science and Technology of China, Chengdu, China","School of Computer Science and Engineering, University of Electronic Science and Technology of China, Chengdu, Sichuan, P.R.China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, University of Electronic Science and Technology of China, Chengdu, China","institution_ids":["https://openalex.org/I150229711"]},{"raw_affiliation_string":"School of Computer Science and Engineering, University of Electronic Science and Technology of China, Chengdu, Sichuan, P.R.China","institution_ids":["https://openalex.org/I150229711"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5109911685","display_name":"Guangtian Zou","orcid":"https://orcid.org/0000-0003-2349-2039"},"institutions":[{"id":"https://openalex.org/I150229711","display_name":"University of Electronic Science and Technology of China","ror":"https://ror.org/04qr3zq92","country_code":"CN","type":"education","lineage":["https://openalex.org/I150229711"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Gaoyuan Zou","raw_affiliation_strings":["School of Computer Science and Engineering, University of Electronic Science and Technology of China, Chengdu, China","School of Computer Science and Engineering, University of Electronic Science and Technology of China, Chengdu, Sichuan, P.R.China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, University of Electronic Science and Technology of China, Chengdu, China","institution_ids":["https://openalex.org/I150229711"]},{"raw_affiliation_string":"School of Computer Science and Engineering, University of Electronic Science and Technology of China, Chengdu, Sichuan, P.R.China","institution_ids":["https://openalex.org/I150229711"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Hansheng Wang","orcid":"https://orcid.org/0009-0006-0035-2323"},"institutions":[{"id":"https://openalex.org/I150229711","display_name":"University of Electronic Science and Technology of China","ror":"https://ror.org/04qr3zq92","country_code":"CN","type":"education","lineage":["https://openalex.org/I150229711"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hansheng Wang","raw_affiliation_strings":["School of Computer Science and Engineering, University of Electronic Science and Technology of China, Chengdu, China","School of Computer Science and Engineering, University of Electronic Science and Technology of China, Chengdu, Sichuan, P.R.China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, University of Electronic Science and Technology of China, Chengdu, China","institution_ids":["https://openalex.org/I150229711"]},{"raw_affiliation_string":"School of Computer Science and Engineering, University of Electronic Science and Technology of China, Chengdu, Sichuan, P.R.China","institution_ids":["https://openalex.org/I150229711"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5020822198","display_name":"Panruo Wu","orcid":"https://orcid.org/0000-0003-1859-3580"},"institutions":[{"id":"https://openalex.org/I44461941","display_name":"University of Houston","ror":"https://ror.org/048sx0r50","country_code":"US","type":"education","lineage":["https://openalex.org/I44461941"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Panruo Wu","raw_affiliation_strings":["Department of Computer Science, Univeristy of Houston, Houston, TX, USA"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science, Univeristy of Houston, Houston, TX, USA","institution_ids":["https://openalex.org/I44461941"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5091290657","display_name":"Shaoshuai Zhang","orcid":"https://orcid.org/0000-0002-9525-1659"},"institutions":[{"id":"https://openalex.org/I150229711","display_name":"University of Electronic Science and Technology of China","ror":"https://ror.org/04qr3zq92","country_code":"CN","type":"education","lineage":["https://openalex.org/I150229711"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shaoshuai Zhang","raw_affiliation_strings":["School of Computer Science and Engineering, University of Electronic Science and Technology of China, Chengdu, China","School of Computer Science and Engineering, University of Electronic Science and Technology of China, Chengdu, Sichuan, P.R.China"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, University of Electronic Science and Technology of China, Chengdu, China","institution_ids":["https://openalex.org/I150229711"]},{"raw_affiliation_string":"School of Computer Science and Engineering, University of Electronic Science and Technology of China, Chengdu, Sichuan, P.R.China","institution_ids":["https://openalex.org/I150229711"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5033967892"],"corresponding_institution_ids":["https://openalex.org/I150229711"],"apc_list":null,"apc_paid":null,"fwci":0.6424,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.66937355,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":98},"biblio":{"volume":"36","issue":"3","first_page":"422","last_page":"436"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12303","display_name":"Tensor decomposition and applications","score":0.9897000193595886,"subfield":{"id":"https://openalex.org/subfields/2605","display_name":"Computational Mathematics"},"field":{"id":"https://openalex.org/fields/26","display_name":"Mathematics"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12303","display_name":"Tensor decomposition and applications","score":0.9897000193595886,"subfield":{"id":"https://openalex.org/subfields/2605","display_name":"Computational Mathematics"},"field":{"id":"https://openalex.org/fields/26","display_name":"Mathematics"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.9860000014305115,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.973800003528595,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7395855188369751},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.690292477607727},{"id":"https://openalex.org/keywords/factorization","display_name":"Factorization","score":0.6313770413398743},{"id":"https://openalex.org/keywords/qr-decomposition","display_name":"QR decomposition","score":0.5652797222137451},{"id":"https://openalex.org/keywords/computational-science","display_name":"Computational science","score":0.5275275707244873},{"id":"https://openalex.org/keywords/tensor","display_name":"Tensor (intrinsic definition)","score":0.5069983005523682},{"id":"https://openalex.org/keywords/matrix-decomposition","display_name":"Matrix decomposition","score":0.43285876512527466},{"id":"https://openalex.org/keywords/computer-architecture","display_name":"Computer architecture","score":0.36810287833213806},{"id":"https://openalex.org/keywords/computer-graphics","display_name":"Computer graphics (images)","score":0.34305453300476074},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.21010887622833252},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.14642280340194702},{"id":"https://openalex.org/keywords/physics","display_name":"Physics","score":0.08123546838760376},{"id":"https://openalex.org/keywords/geometry","display_name":"Geometry","score":0.07996043562889099}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7395855188369751},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.690292477607727},{"id":"https://openalex.org/C187834632","wikidata":"https://www.wikidata.org/wiki/Q188804","display_name":"Factorization","level":2,"score":0.6313770413398743},{"id":"https://openalex.org/C188060507","wikidata":"https://www.wikidata.org/wiki/Q653242","display_name":"QR decomposition","level":3,"score":0.5652797222137451},{"id":"https://openalex.org/C459310","wikidata":"https://www.wikidata.org/wiki/Q117801","display_name":"Computational science","level":1,"score":0.5275275707244873},{"id":"https://openalex.org/C155281189","wikidata":"https://www.wikidata.org/wiki/Q3518150","display_name":"Tensor (intrinsic definition)","level":2,"score":0.5069983005523682},{"id":"https://openalex.org/C42355184","wikidata":"https://www.wikidata.org/wiki/Q1361088","display_name":"Matrix decomposition","level":3,"score":0.43285876512527466},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.36810287833213806},{"id":"https://openalex.org/C121684516","wikidata":"https://www.wikidata.org/wiki/Q7600677","display_name":"Computer graphics (images)","level":1,"score":0.34305453300476074},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.21010887622833252},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.14642280340194702},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.08123546838760376},{"id":"https://openalex.org/C2524010","wikidata":"https://www.wikidata.org/wiki/Q8087","display_name":"Geometry","level":1,"score":0.07996043562889099},{"id":"https://openalex.org/C158693339","wikidata":"https://www.wikidata.org/wiki/Q190524","display_name":"Eigenvalues and eigenvectors","level":2,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tpds.2024.3522776","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpds.2024.3522776","pdf_url":null,"source":{"id":"https://openalex.org/S97130795","display_name":"IEEE Transactions on Parallel and Distributed Systems","issn_l":"1045-9219","issn":["1045-9219","1558-2183","2161-9883"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Parallel and Distributed Systems","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Affordable and clean energy","id":"https://metadata.un.org/sdg/7","score":0.4699999988079071}],"awards":[{"id":"https://openalex.org/G5674811669","display_name":null,"funder_award_id":"A1098531023601465","funder_id":"https://openalex.org/F4320323292","funder_display_name":"University of Electronic Science and Technology of China"}],"funders":[{"id":"https://openalex.org/F4320323292","display_name":"University of Electronic Science and Technology of China","ror":"https://ror.org/04qr3zq92"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":45,"referenced_works":["https://openalex.org/W1490778843","https://openalex.org/W1558296484","https://openalex.org/W1971662204","https://openalex.org/W1982811641","https://openalex.org/W2002555321","https://openalex.org/W2005498993","https://openalex.org/W2073260424","https://openalex.org/W2108634150","https://openalex.org/W2111221242","https://openalex.org/W2119522963","https://openalex.org/W2208150243","https://openalex.org/W2240679564","https://openalex.org/W2529375688","https://openalex.org/W2605943732","https://openalex.org/W2735195241","https://openalex.org/W2765309601","https://openalex.org/W2773871558","https://openalex.org/W2791673912","https://openalex.org/W2895305554","https://openalex.org/W2901549770","https://openalex.org/W2911591717","https://openalex.org/W2980186997","https://openalex.org/W3004853346","https://openalex.org/W3007862902","https://openalex.org/W3009762362","https://openalex.org/W3037775421","https://openalex.org/W3039141850","https://openalex.org/W3043303806","https://openalex.org/W3130554079","https://openalex.org/W3166178281","https://openalex.org/W3204728131","https://openalex.org/W3205226039","https://openalex.org/W4221106024","https://openalex.org/W4221160294","https://openalex.org/W4229666556","https://openalex.org/W4231150350","https://openalex.org/W4283029140","https://openalex.org/W4321446155","https://openalex.org/W4324292875","https://openalex.org/W4385335675","https://openalex.org/W4392884779","https://openalex.org/W6750448596","https://openalex.org/W6784971286","https://openalex.org/W6856460640","https://openalex.org/W6960238849"],"related_works":["https://openalex.org/W2078349096","https://openalex.org/W2002598339","https://openalex.org/W2150953077","https://openalex.org/W1599936522","https://openalex.org/W1973739845","https://openalex.org/W119752240","https://openalex.org/W2006707200","https://openalex.org/W4296368836","https://openalex.org/W3200716635","https://openalex.org/W4388311419"],"abstract_inverted_index":{"Since":[0],"2017,":[1],"NVIDIA":[2],"GPUs":[3],"have":[4,26],"been":[5],"equipped":[6],"with":[7],"specialized":[8],"units":[9],"known":[10],"as":[11,38],"Tensor":[12,32,71,101,120,134],"Cores,":[13,121],"which":[14],"demonstrate":[15],"remarkable":[16],"efficiency":[17],"in":[18,34,45,97,131],"processing":[19],"matrix":[20,35],"multiplications":[21],"(GEMMs).":[22],"Beyond":[23],"GEMMs,":[24,57,95],"researchers":[25],"explored":[27],"the":[28,42,79,85,106,126,132,141,144,165,183],"potential":[29],"applications":[30],"of":[31,128,168],"Cores":[33],"factorization,":[36,137],"such":[37],"QR":[39,46,81,109,136],"factorization.":[40],"However,":[41],"inside":[43],"GEMMs":[44,62,89],"factorization":[47,82,110],"are":[48,63,173],"typically":[49],"tall":[50,59,86],"and":[51,60,87,93,114,118,160,171,178],"skinny.":[52],"Compared":[53],"to":[54,67,90,111,124,143,152,175],"compute-bound":[55],"square":[56,92],"these":[58],"skinny":[61,88],"memory":[64],"bound,":[65],"leading":[66],"suboptimal":[68],"performance":[69,99],"on":[70,100,116,157],"Cores.":[72,102],"To":[73],"solve":[74],"this":[75],"problem,":[76],"we":[77,104,138],"indicate":[78],"recursive":[80],"can":[83],"convert":[84],"relatively":[91],"large":[94],"resulting":[96],"better":[98],"Besides,":[103],"extend":[105],"FP16":[107,117,172],"Tensor-Cores-based":[108],"accommodate":[112],"FP32":[113],"FP64":[115],"INT8":[119],"respectively.":[122],"Additionally,":[123],"address":[125],"issue":[127],"orthogonality":[129],"loss":[130],"preceding":[133],"Cores-based":[135],"transition":[139],"from":[140],"Gram-Schmidt":[142],"Householder":[145],"algorithm":[146],"while":[147],"preserving":[148],"high":[149],"performance.":[150],"According":[151],"our":[153],"experimental":[154],"evaluation":[155],"conducted":[156],"NVIDIA's":[158],"A100":[159],"GeForce":[161],"RTX":[162],"3090":[163],"GPU,":[164],"precision":[166],"levels":[167],"FP64,":[169],"FP32,":[170],"up":[174],"6.22x,":[176],"8.67x,":[177],"4.03x":[179],"faster,":[180],"respectively,":[181],"than":[182],"current":[184],"state-of-the-art":[185],"implementations.":[186]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":1}],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-10-10T00:00:00"}
