{"id":"https://openalex.org/W4415987852","doi":"https://doi.org/10.1145/3754598.3754636","title":"Fast and Scalable Mixed Precision Euclidean Distance Calculations Using GPU Tensor Cores","display_name":"Fast and Scalable Mixed Precision Euclidean Distance Calculations Using GPU Tensor Cores","publication_year":2025,"publication_date":"2025-09-08","ids":{"openalex":"https://openalex.org/W4415987852","doi":"https://doi.org/10.1145/3754598.3754636"},"language":"en","primary_location":{"id":"doi:10.1145/3754598.3754636","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3754598.3754636","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 54th International Conference on Parallel Processing","raw_type":"proceedings-article"},"type":"article","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3754598.3754636","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Brian Curless","orcid":"https://orcid.org/0009-0001-7935-5538"},"institutions":[{"id":"https://openalex.org/I203172682","display_name":"Northern Arizona University","ror":"https://ror.org/0272j5188","country_code":"US","type":"education","lineage":["https://openalex.org/I203172682"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Brian Curless","raw_affiliation_strings":["Northern Arizona University, Flagstaff, USA"],"raw_orcid":"https://orcid.org/0009-0001-7935-5538","affiliations":[{"raw_affiliation_string":"Northern Arizona University, Flagstaff, USA","institution_ids":["https://openalex.org/I203172682"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5075564331","display_name":"Michael Gowanlock","orcid":"https://orcid.org/0000-0002-0826-6204"},"institutions":[{"id":"https://openalex.org/I203172682","display_name":"Northern Arizona University","ror":"https://ror.org/0272j5188","country_code":"US","type":"education","lineage":["https://openalex.org/I203172682"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Michael Gowanlock","raw_affiliation_strings":["Northern Arizona University, Flagstaff, USA"],"raw_orcid":"https://orcid.org/0000-0002-0826-6204","affiliations":[{"raw_affiliation_string":"Northern Arizona University, Flagstaff, USA","institution_ids":["https://openalex.org/I203172682"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I203172682"],"apc_list":null,"apc_paid":null,"fwci":2.8024,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.92598967,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":98,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"288","last_page":"298"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12303","display_name":"Tensor decomposition and applications","score":0.4025000035762787,"subfield":{"id":"https://openalex.org/subfields/2605","display_name":"Computational Mathematics"},"field":{"id":"https://openalex.org/fields/26","display_name":"Mathematics"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12303","display_name":"Tensor decomposition and applications","score":0.4025000035762787,"subfield":{"id":"https://openalex.org/subfields/2605","display_name":"Computational Mathematics"},"field":{"id":"https://openalex.org/fields/26","display_name":"Mathematics"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.18960000574588776,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12292","display_name":"Graph Theory and Algorithms","score":0.10949999839067459,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/euclidean-distance","display_name":"Euclidean distance","score":0.5813999772071838},{"id":"https://openalex.org/keywords/tensor","display_name":"Tensor (intrinsic definition)","score":0.5570999979972839},{"id":"https://openalex.org/keywords/matrix-multiplication","display_name":"Matrix multiplication","score":0.5335999727249146},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.5335999727249146},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.529699981212616},{"id":"https://openalex.org/keywords/floating-point","display_name":"Floating point","score":0.49729999899864197},{"id":"https://openalex.org/keywords/search-engine-indexing","display_name":"Search engine indexing","score":0.42160001397132874},{"id":"https://openalex.org/keywords/multiplication","display_name":"Multiplication (music)","score":0.40799999237060547},{"id":"https://openalex.org/keywords/metric","display_name":"Metric (unit)","score":0.40630000829696655},{"id":"https://openalex.org/keywords/cuda","display_name":"CUDA","score":0.3822999894618988}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7031999826431274},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.5856000185012817},{"id":"https://openalex.org/C120174047","wikidata":"https://www.wikidata.org/wiki/Q847073","display_name":"Euclidean distance","level":2,"score":0.5813999772071838},{"id":"https://openalex.org/C155281189","wikidata":"https://www.wikidata.org/wiki/Q3518150","display_name":"Tensor (intrinsic definition)","level":2,"score":0.5570999979972839},{"id":"https://openalex.org/C17349429","wikidata":"https://www.wikidata.org/wiki/Q1049914","display_name":"Matrix multiplication","level":3,"score":0.5335999727249146},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.5335999727249146},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.529699981212616},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.5249000191688538},{"id":"https://openalex.org/C84211073","wikidata":"https://www.wikidata.org/wiki/Q117879","display_name":"Floating point","level":2,"score":0.49729999899864197},{"id":"https://openalex.org/C75165309","wikidata":"https://www.wikidata.org/wiki/Q2258979","display_name":"Search engine indexing","level":2,"score":0.42160001397132874},{"id":"https://openalex.org/C2780595030","wikidata":"https://www.wikidata.org/wiki/Q3860309","display_name":"Multiplication (music)","level":2,"score":0.40799999237060547},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.40630000829696655},{"id":"https://openalex.org/C2778119891","wikidata":"https://www.wikidata.org/wiki/Q477690","display_name":"CUDA","level":2,"score":0.3822999894618988},{"id":"https://openalex.org/C133095886","wikidata":"https://www.wikidata.org/wiki/Q1307173","display_name":"Single-precision floating-point format","level":3,"score":0.37959998846054077},{"id":"https://openalex.org/C111208986","wikidata":"https://www.wikidata.org/wiki/Q901698","display_name":"Distance matrix","level":2,"score":0.37869998812675476},{"id":"https://openalex.org/C21080849","wikidata":"https://www.wikidata.org/wiki/Q13611879","display_name":"Data point","level":2,"score":0.35440000891685486},{"id":"https://openalex.org/C106487976","wikidata":"https://www.wikidata.org/wiki/Q685816","display_name":"Matrix (chemical analysis)","level":2,"score":0.34940001368522644},{"id":"https://openalex.org/C171558263","wikidata":"https://www.wikidata.org/wiki/Q5406122","display_name":"Euclidean distance matrix","level":3,"score":0.3492000102996826},{"id":"https://openalex.org/C129782007","wikidata":"https://www.wikidata.org/wiki/Q162886","display_name":"Euclidean geometry","level":2,"score":0.3472000062465668},{"id":"https://openalex.org/C2777210771","wikidata":"https://www.wikidata.org/wiki/Q4927124","display_name":"Block (permutation group theory)","level":2,"score":0.3334999978542328},{"id":"https://openalex.org/C459310","wikidata":"https://www.wikidata.org/wiki/Q117801","display_name":"Computational science","level":1,"score":0.3294000029563904},{"id":"https://openalex.org/C2778100165","wikidata":"https://www.wikidata.org/wiki/Q1589327","display_name":"Memory hierarchy","level":3,"score":0.3240000009536743},{"id":"https://openalex.org/C28719098","wikidata":"https://www.wikidata.org/wiki/Q44946","display_name":"Point (geometry)","level":2,"score":0.30880001187324524},{"id":"https://openalex.org/C50630238","wikidata":"https://www.wikidata.org/wiki/Q971505","display_name":"General-purpose computing on graphics processing units","level":3,"score":0.29580000042915344},{"id":"https://openalex.org/C78766204","wikidata":"https://www.wikidata.org/wiki/Q555032","display_name":"Multi-core processor","level":2,"score":0.2890999913215637},{"id":"https://openalex.org/C82668687","wikidata":"https://www.wikidata.org/wiki/Q3046456","display_name":"Earth mover's distance","level":2,"score":0.28870001435279846},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.2815999984741211},{"id":"https://openalex.org/C162319229","wikidata":"https://www.wikidata.org/wiki/Q175263","display_name":"Data structure","level":2,"score":0.27880001068115234},{"id":"https://openalex.org/C2780365336","wikidata":"https://www.wikidata.org/wiki/Q25047934","display_name":"Single-core","level":2,"score":0.27649998664855957},{"id":"https://openalex.org/C35912277","wikidata":"https://www.wikidata.org/wiki/Q1243369","display_name":"Double-precision floating-point format","level":3,"score":0.2732999920845032},{"id":"https://openalex.org/C83283714","wikidata":"https://www.wikidata.org/wiki/Q121117","display_name":"Supercomputer","level":2,"score":0.26669999957084656},{"id":"https://openalex.org/C206588197","wikidata":"https://www.wikidata.org/wiki/Q846574","display_name":"Reuse","level":2,"score":0.25760000944137573},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.2572999894618988},{"id":"https://openalex.org/C3826847","wikidata":"https://www.wikidata.org/wiki/Q188768","display_name":"FLOPS","level":2,"score":0.2549999952316284}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1145/3754598.3754636","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3754598.3754636","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 54th International Conference on Parallel Processing","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2508.21230","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2508.21230","pdf_url":"https://arxiv.org/pdf/2508.21230","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"}],"best_oa_location":{"id":"doi:10.1145/3754598.3754636","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3754598.3754636","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 54th International Conference on Parallel Processing","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G932447174","display_name":null,"funder_award_id":"2042155","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Modern":[0],"GPUs":[1],"are":[2,9,52,107],"equipped":[3],"with":[4],"tensor":[5],"cores":[6],"(TCs)":[7],"that":[8,105,186,200,207],"commonly":[10],"used":[11,53],"for":[12,71,132],"matrix":[13,86],"multiplication":[14,87],"in":[15,33,54,183],"artificial":[16],"intelligence":[17],"workloads.":[18],"However,":[19],"because":[20],"they":[21,26,37],"have":[22],"high":[23,104,126],"computational":[24,127],"throughput,":[25,128],"can":[27,38,75],"lead":[28],"to":[29,46,95,154,167,175,192,242,248],"significant":[30,133],"performance":[31],"gains":[32],"other":[34],"algorithms":[35,199,227],"if":[36],"be":[39,243],"successfully":[40],"exploited.":[41],"We":[42,112,151,173,205,231],"examine":[43],"using":[44,64],"TCs":[45,74,106],"compute":[47],"Euclidean":[48,120,170,180],"distance":[49,171,181],"calculations,":[50],"which":[51,92,160],"many":[55],"data":[56,70,82,137,165],"analytics":[57],"applications.":[58],"Prior":[59],"work":[60],"has":[61],"only":[62],"investigated":[63],"64":[65],"bit":[66,85,90],"floating":[67,80],"point":[68,81],"(FP64)":[69],"computation;":[72],"however,":[73],"operate":[76],"on":[77],"lower":[78],"precision":[79,195,240],"(i.e.,":[83],"16":[84],"and":[88,116,138,149],"32":[89],"accumulation),":[91],"we":[93,129],"refer":[94],"as":[96,189,191],"FP16-32.":[97],"FP16-32":[98],"TC":[99,179],"peak":[100],"throughput":[101],"is":[102],"so":[103],"easily":[108],"starved":[109],"of":[110,136,157,228,237],"data.":[111],"propose":[113],"a":[114,222],"Fast":[115],"Scalable":[117],"Tensor":[118],"core":[119,198],"Distance":[121],"(FaSTED)":[122],"algorithm.":[123],"To":[124],"achieve":[125],"design":[130],"FaSTED":[131,153],"hierarchical":[134],"reuse":[135],"maximize":[139],"memory":[140],"utilization":[141],"at":[142],"every":[143],"level":[144],"(global":[145],"memory,":[146,148],"shared":[147],"registers).":[150],"apply":[152],"the":[155,176,184,216,225,234,249],"application":[156],"similarity":[158],"searches,":[159],"typically":[161],"employ":[162,202],"an":[163,203],"indexing":[164],"structure":[166],"eliminate":[168],"superfluous":[169],"calculations.":[172],"compare":[174],"state-of-the-art":[177],"(SOTA)":[178],"algorithm":[182,241],"literature":[185],"employs":[187],"FP64,":[188],"well":[190],"two":[193],"single":[194],"(FP32)":[196],"CUDA":[197],"both":[201],"index.":[204],"find":[206],"across":[208],"four":[209],"real-world":[210],"high-dimensional":[211],"datasets":[212],"spanning":[213],"128-960":[214],"dimensions,":[215],"mixed-precision":[217],"brute":[218],"force":[219],"approach":[220],"achieves":[221],"speedup":[223],"over":[224],"SOTA":[226],"2.5\u201351":[229],"\u00d7.":[230],"also":[232],"quantify":[233],"accuracy":[235],"loss":[236],"our":[238],"mixed":[239],"<":[244],"0.06%":[245],"when":[246],"compared":[247],"FP64":[250],"baseline.":[251]},"counts_by_year":[{"year":2026,"cited_by_count":2}],"updated_date":"2025-12-21T23:12:01.093139","created_date":"2025-10-10T00:00:00"}
