{"id":"https://openalex.org/W7119519999","doi":"https://doi.org/10.1145/3773656.3773681","title":"Mixed-precision Interpolative Decomposition on GPUs","display_name":"Mixed-precision Interpolative Decomposition on GPUs","publication_year":2026,"publication_date":"2026-01-09","ids":{"openalex":"https://openalex.org/W7119519999","doi":"https://doi.org/10.1145/3773656.3773681"},"language":null,"primary_location":{"id":"doi:10.1145/3773656.3773681","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3773656.3773681","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Supercomputing Asia and International Conference on High Performance Computing in Asia Pacific Region","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3773656.3773681","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5040574670","display_name":"Qianxiang Ma","orcid":"https://orcid.org/0000-0003-4688-5644"},"institutions":[{"id":"https://openalex.org/I4210129730","display_name":"RIKEN Center for Computational Science","ror":"https://ror.org/03r519674","country_code":"JP","type":"facility","lineage":["https://openalex.org/I4210110652","https://openalex.org/I4210129730"]}],"countries":["JP"],"is_corresponding":true,"raw_author_name":"Qianxiang Ma","raw_affiliation_strings":["Large-scale Parallel Numerical Computing Technology Research Team, RIKEN Center for Computational Science, Kobe, Hyogo, Japan"],"affiliations":[{"raw_affiliation_string":"Large-scale Parallel Numerical Computing Technology Research Team, RIKEN Center for Computational Science, Kobe, Hyogo, Japan","institution_ids":["https://openalex.org/I4210129730"]}]},{"author_position":"last","author":{"id":null,"display_name":"Toshiyuki Imamura","orcid":"https://orcid.org/0000-0003-1601-9710"},"institutions":[{"id":"https://openalex.org/I4210129730","display_name":"RIKEN Center for Computational Science","ror":"https://ror.org/03r519674","country_code":"JP","type":"facility","lineage":["https://openalex.org/I4210110652","https://openalex.org/I4210129730"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Toshiyuki Imamura","raw_affiliation_strings":["Large-scale Parallel Numerical Computing Technology Research Team, RIKEN Center for Computational Science, Kobe, Hyogo, Japan"],"affiliations":[{"raw_affiliation_string":"Large-scale Parallel Numerical Computing Technology Research Team, RIKEN Center for Computational Science, Kobe, Hyogo, Japan","institution_ids":["https://openalex.org/I4210129730"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5040574670"],"corresponding_institution_ids":["https://openalex.org/I4210129730"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.07610635,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"165","last_page":"176"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11612","display_name":"Stochastic Gradient Optimization Techniques","score":0.5981000065803528,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11612","display_name":"Stochastic Gradient Optimization Techniques","score":0.5981000065803528,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13487","display_name":"Statistical and numerical algorithms","score":0.08669999986886978,"subfield":{"id":"https://openalex.org/subfields/2604","display_name":"Applied Mathematics"},"field":{"id":"https://openalex.org/fields/26","display_name":"Mathematics"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10500","display_name":"Sparse and Compressive Sensing Techniques","score":0.07410000264644623,"subfield":{"id":"https://openalex.org/subfields/2206","display_name":"Computational Mechanics"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/cholesky-decomposition","display_name":"Cholesky decomposition","score":0.7050999999046326},{"id":"https://openalex.org/keywords/singular-value-decomposition","display_name":"Singular value decomposition","score":0.6787999868392944},{"id":"https://openalex.org/keywords/qr-decomposition","display_name":"QR decomposition","score":0.5406000018119812},{"id":"https://openalex.org/keywords/matrix-decomposition","display_name":"Matrix decomposition","score":0.5389999747276306},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.48559999465942383},{"id":"https://openalex.org/keywords/cuda","display_name":"CUDA","score":0.4196000099182129},{"id":"https://openalex.org/keywords/decomposition","display_name":"Decomposition","score":0.3962000012397766},{"id":"https://openalex.org/keywords/factorization","display_name":"Factorization","score":0.39430001378059387},{"id":"https://openalex.org/keywords/sparse-matrix","display_name":"Sparse matrix","score":0.3912999927997589}],"concepts":[{"id":"https://openalex.org/C34727166","wikidata":"https://www.wikidata.org/wiki/Q515375","display_name":"Cholesky decomposition","level":3,"score":0.7050999999046326},{"id":"https://openalex.org/C22789450","wikidata":"https://www.wikidata.org/wiki/Q420904","display_name":"Singular value decomposition","level":2,"score":0.6787999868392944},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6552000045776367},{"id":"https://openalex.org/C188060507","wikidata":"https://www.wikidata.org/wiki/Q653242","display_name":"QR decomposition","level":3,"score":0.5406000018119812},{"id":"https://openalex.org/C42355184","wikidata":"https://www.wikidata.org/wiki/Q1361088","display_name":"Matrix decomposition","level":3,"score":0.5389999747276306},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.5340999960899353},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.48559999465942383},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.4422000050544739},{"id":"https://openalex.org/C2778119891","wikidata":"https://www.wikidata.org/wiki/Q477690","display_name":"CUDA","level":2,"score":0.4196000099182129},{"id":"https://openalex.org/C124681953","wikidata":"https://www.wikidata.org/wiki/Q339062","display_name":"Decomposition","level":2,"score":0.3962000012397766},{"id":"https://openalex.org/C187834632","wikidata":"https://www.wikidata.org/wiki/Q188804","display_name":"Factorization","level":2,"score":0.39430001378059387},{"id":"https://openalex.org/C56372850","wikidata":"https://www.wikidata.org/wiki/Q1050404","display_name":"Sparse matrix","level":3,"score":0.3912999927997589},{"id":"https://openalex.org/C77246614","wikidata":"https://www.wikidata.org/wiki/Q1409400","display_name":"Gramian matrix","level":3,"score":0.36959999799728394},{"id":"https://openalex.org/C137800194","wikidata":"https://www.wikidata.org/wiki/Q11713455","display_name":"Interpolation (computer graphics)","level":3,"score":0.36399999260902405},{"id":"https://openalex.org/C106487976","wikidata":"https://www.wikidata.org/wiki/Q685816","display_name":"Matrix (chemical analysis)","level":2,"score":0.36169999837875366},{"id":"https://openalex.org/C28855332","wikidata":"https://www.wikidata.org/wiki/Q198099","display_name":"Quantization (signal processing)","level":2,"score":0.3580999970436096},{"id":"https://openalex.org/C128669082","wikidata":"https://www.wikidata.org/wiki/Q583461","display_name":"Randomized algorithm","level":2,"score":0.33739998936653137},{"id":"https://openalex.org/C179799912","wikidata":"https://www.wikidata.org/wiki/Q205084","display_name":"Computational complexity theory","level":2,"score":0.3264000117778778},{"id":"https://openalex.org/C139352143","wikidata":"https://www.wikidata.org/wiki/Q82571","display_name":"Linear algebra","level":2,"score":0.3262999951839447},{"id":"https://openalex.org/C17349429","wikidata":"https://www.wikidata.org/wiki/Q1049914","display_name":"Matrix multiplication","level":3,"score":0.31709998846054077},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.30889999866485596},{"id":"https://openalex.org/C109282560","wikidata":"https://www.wikidata.org/wiki/Q4166054","display_name":"Singular value","level":3,"score":0.30880001187324524},{"id":"https://openalex.org/C148764684","wikidata":"https://www.wikidata.org/wiki/Q621751","display_name":"Approximation algorithm","level":2,"score":0.3070000112056732},{"id":"https://openalex.org/C2777032711","wikidata":"https://www.wikidata.org/wiki/Q5318993","display_name":"Dynamic mode decomposition","level":2,"score":0.296099990606308},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.29170000553131104},{"id":"https://openalex.org/C97137487","wikidata":"https://www.wikidata.org/wiki/Q729138","display_name":"Integer (computer science)","level":2,"score":0.28630000352859497},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.26429998874664307},{"id":"https://openalex.org/C124304363","wikidata":"https://www.wikidata.org/wiki/Q673661","display_name":"Abstraction","level":2,"score":0.2637999951839447},{"id":"https://openalex.org/C86111242","wikidata":"https://www.wikidata.org/wiki/Q859595","display_name":"Coprocessor","level":2,"score":0.2623000144958496},{"id":"https://openalex.org/C28826006","wikidata":"https://www.wikidata.org/wiki/Q33521","display_name":"Applied mathematics","level":1,"score":0.2603999972343445},{"id":"https://openalex.org/C126255220","wikidata":"https://www.wikidata.org/wiki/Q141495","display_name":"Mathematical optimization","level":1,"score":0.2574000060558319},{"id":"https://openalex.org/C7720571","wikidata":"https://www.wikidata.org/wiki/Q1136880","display_name":"Power of two","level":2,"score":0.2533000111579895},{"id":"https://openalex.org/C459310","wikidata":"https://www.wikidata.org/wiki/Q117801","display_name":"Computational science","level":1,"score":0.25099998712539673}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3773656.3773681","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3773656.3773681","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Supercomputing Asia and International Conference on High Performance Computing in Asia Pacific Region","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3773656.3773681","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3773656.3773681","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Supercomputing Asia and International Conference on High Performance Computing in Asia Pacific Region","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":16,"referenced_works":["https://openalex.org/W36826159","https://openalex.org/W1508675023","https://openalex.org/W1968691112","https://openalex.org/W1987255606","https://openalex.org/W1993482030","https://openalex.org/W2024165284","https://openalex.org/W2038922352","https://openalex.org/W2079232854","https://openalex.org/W2104380208","https://openalex.org/W2114817572","https://openalex.org/W2616932734","https://openalex.org/W2731353277","https://openalex.org/W2786012290","https://openalex.org/W4392884779","https://openalex.org/W4416003993","https://openalex.org/W7110415726"],"related_works":[],"abstract_inverted_index":{"Low-rank":[0],"approximation":[1],"is":[2,66],"a":[3,46,98],"powerful":[4],"tool":[5],"for":[6,49],"reducing":[7],"storage":[8],"and":[9,25,95,122,140,146,166,183],"arithmetic":[10],"complexity":[11],"while":[12,87,126,188],"preserving":[13,127],"linearity.":[14],"Traditional":[15],"rank-revealing":[16],"factorizations,":[17],"such":[18,150],"as":[19,75,77,151],"the":[20,70,163],"singular":[21],"value":[22],"decomposition":[23],"(SVD)":[24],"pivoted":[26],"QR,":[27,125],"have":[28],"been":[29],"adapted":[30],"to":[31],"GPUs":[32],"by":[33],"increasing":[34],"reliance":[35],"on":[36],"GEMM":[37],"kernels,":[38],"yet":[39],"they":[40],"remain":[41],"throughput-limited.":[42],"We":[43],"present":[44],"hyacin,":[45],"GPU":[47,160,186],"implementation":[48],"constructing":[50],"low-rank":[51,106],"approximations":[52,107],"via":[53],"interpolative":[54],"decomposition.":[55],"hyacin":[56,134,173],"combines":[57],"column-pivoted":[58,124],"Cholesky":[59],"QR":[60],"with":[61,145],"extended":[62],"integer":[63],"quantization,":[64],"which":[65],"an":[67],"abstraction":[68],"from":[69],"Ozaki-scheme":[71],"error-free":[72],"floating-point":[73,79],"transformation,":[74],"well":[76],"mixed":[78],"precisions.":[80],"Integer-emulated":[81],"GEMMs":[82],"accelerate":[83],"Gram":[84],"matrix":[85],"formation,":[86],"dynamically":[88],"tuned":[89],"quantization":[90],"orders":[91],"balance":[92],"between":[93],"accuracy":[94,176],"performance.":[96],"On":[97],"single":[99],"NVIDIA":[100],"B200":[101],"GPU,":[102],"our":[103],"method":[104],"computes":[105],"of":[108],"large":[109],"rectangular":[110],"complex":[111],"double-precision":[112],"matrices":[113],"over":[114],"14":[115],"\u00d7":[116,139,142],"faster":[117,143],"than":[118],"cuSOLVER\u2019s":[119,131],"bi-diagonalization":[120],"SVD":[121],"MAGMA\u2019s":[123],"LAPACK-quality":[128],"pivoting.":[129],"Against":[130],"randomized":[132,184],"SVD,":[133],"remains":[135],"consistently":[136],"superior:":[137],"4.5":[138],"1.8":[141],"correspondingly":[144],"without":[147],"robustness":[148],"options":[149],"power":[152],"iterations.":[153],"Comparable":[154],"speedups":[155],"are":[156],"observed":[157],"across":[158],"other":[159],"architectures,":[161],"including":[162],"H100,":[164],"A100,":[165],"RTX":[167],"4090.":[168],"These":[169],"results":[170],"demonstrate":[171],"that":[172],"delivers":[174],"LAPACK-level":[175],"at":[177],"unprecedented":[178],"speed,":[179],"outperforming":[180],"both":[181],"deterministic":[182],"state-of-the-art":[185],"factorizations":[187],"maintaining":[189],"strong":[190],"numerical":[191],"stability.":[192]},"counts_by_year":[],"updated_date":"2026-03-25T23:56:10.502304","created_date":"2026-01-09T00:00:00"}
