{"id":"https://openalex.org/W2545915677","doi":"https://doi.org/10.1142/s0129626417500062","title":"Accelerating BLAS and LAPACK via Efficient Floating Point Architecture Design","display_name":"Accelerating BLAS and LAPACK via Efficient Floating Point Architecture Design","publication_year":2017,"publication_date":"2017-12-01","ids":{"openalex":"https://openalex.org/W2545915677","doi":"https://doi.org/10.1142/s0129626417500062","mag":"2545915677"},"language":"en","primary_location":{"id":"doi:10.1142/s0129626417500062","is_oa":false,"landing_page_url":"https://doi.org/10.1142/s0129626417500062","pdf_url":null,"source":{"id":"https://openalex.org/S18360026","display_name":"Parallel Processing Letters","issn_l":"0129-6264","issn":["0129-6264","1793-642X"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319815","host_organization_name":"World Scientific","host_organization_lineage":["https://openalex.org/P4310319815"],"host_organization_lineage_names":["World Scientific"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Parallel Processing Letters","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5004454650","display_name":"Farhad Merchant","orcid":"https://orcid.org/0000-0002-3708-5621"},"institutions":[{"id":"https://openalex.org/I172675005","display_name":"Nanyang Technological University","ror":"https://ror.org/02e7b5302","country_code":"SG","type":"education","lineage":["https://openalex.org/I172675005"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Farhad Merchant","raw_affiliation_strings":["School of Computer Science and Engineering, Nanyang Technological University, Singapore"],"raw_orcid":"https://orcid.org/0000-0002-3708-5621","affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Nanyang Technological University, Singapore","institution_ids":["https://openalex.org/I172675005"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5089860351","display_name":"Anupam Chattopadhyay","orcid":"https://orcid.org/0000-0002-8818-6983"},"institutions":[{"id":"https://openalex.org/I172675005","display_name":"Nanyang Technological University","ror":"https://ror.org/02e7b5302","country_code":"SG","type":"education","lineage":["https://openalex.org/I172675005"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Anupam Chattopadhyay","raw_affiliation_strings":["School of Computer Science and Engineering, Nanyang Technological University, Singapore"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Nanyang Technological University, Singapore","institution_ids":["https://openalex.org/I172675005"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5016258315","display_name":"Soumyendu Raha","orcid":"https://orcid.org/0000-0003-3530-7507"},"institutions":[{"id":"https://openalex.org/I59270414","display_name":"Indian Institute of Science Bangalore","ror":"https://ror.org/04dese585","country_code":"IN","type":"education","lineage":["https://openalex.org/I59270414"]}],"countries":["IN"],"is_corresponding":false,"raw_author_name":"Soumyendu Raha","raw_affiliation_strings":["Department of Computational and Data Science, Indian Institute of Science, Bangalore, India 560012, India"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Computational and Data Science, Indian Institute of Science, Bangalore, India 560012, India","institution_ids":["https://openalex.org/I59270414"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112000841","display_name":"S. K. Nandy","orcid":null},"institutions":[{"id":"https://openalex.org/I59270414","display_name":"Indian Institute of Science Bangalore","ror":"https://ror.org/04dese585","country_code":"IN","type":"education","lineage":["https://openalex.org/I59270414"]}],"countries":["IN"],"is_corresponding":false,"raw_author_name":"S. K. Nandy","raw_affiliation_strings":["Department of Computational and Data Science, Indian Institute of Science, Bangalore, India 560012, India"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Computational and Data Science, Indian Institute of Science, Bangalore, India 560012, India","institution_ids":["https://openalex.org/I59270414"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5102154086","display_name":"Ranjani Narayan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ranjani Narayan","raw_affiliation_strings":["Morphing Machines Pvt. Ltd, India"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Morphing Machines Pvt. Ltd, India","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":1.3872,"has_fulltext":false,"cited_by_count":10,"citation_normalized_percentile":{"value":0.81414095,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":97},"biblio":{"volume":"27","issue":"03n04","first_page":"1750006","last_page":"1750006"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11697","display_name":"Numerical Methods and Algorithms","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10363","display_name":"Low-power high-performance VLSI design","score":0.9919999837875366,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8367882966995239},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.8341829180717468},{"id":"https://openalex.org/keywords/flops","display_name":"FLOPS","score":0.68734210729599},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.6166283488273621},{"id":"https://openalex.org/keywords/floating-point","display_name":"Floating point","score":0.5842257738113403},{"id":"https://openalex.org/keywords/double-precision-floating-point-format","display_name":"Double-precision floating-point format","score":0.5731579661369324},{"id":"https://openalex.org/keywords/supercomputer","display_name":"Supercomputer","score":0.5627242922782898},{"id":"https://openalex.org/keywords/memory-hierarchy","display_name":"Memory hierarchy","score":0.5513970255851746},{"id":"https://openalex.org/keywords/memory-bandwidth","display_name":"Memory bandwidth","score":0.543502926826477},{"id":"https://openalex.org/keywords/linear-algebra","display_name":"Linear algebra","score":0.5044924020767212},{"id":"https://openalex.org/keywords/computational-science","display_name":"Computational science","score":0.4671454131603241},{"id":"https://openalex.org/keywords/cuda","display_name":"CUDA","score":0.46045663952827454},{"id":"https://openalex.org/keywords/graphics-processing-unit","display_name":"Graphics processing unit","score":0.44416117668151855},{"id":"https://openalex.org/keywords/single-precision-floating-point-format","display_name":"Single-precision floating-point format","score":0.4279242753982544},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.21078550815582275},{"id":"https://openalex.org/keywords/cache","display_name":"Cache","score":0.15406650304794312},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.10563936829566956},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.09091487526893616}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8367882966995239},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.8341829180717468},{"id":"https://openalex.org/C3826847","wikidata":"https://www.wikidata.org/wiki/Q188768","display_name":"FLOPS","level":2,"score":0.68734210729599},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.6166283488273621},{"id":"https://openalex.org/C84211073","wikidata":"https://www.wikidata.org/wiki/Q117879","display_name":"Floating point","level":2,"score":0.5842257738113403},{"id":"https://openalex.org/C35912277","wikidata":"https://www.wikidata.org/wiki/Q1243369","display_name":"Double-precision floating-point format","level":3,"score":0.5731579661369324},{"id":"https://openalex.org/C83283714","wikidata":"https://www.wikidata.org/wiki/Q121117","display_name":"Supercomputer","level":2,"score":0.5627242922782898},{"id":"https://openalex.org/C2778100165","wikidata":"https://www.wikidata.org/wiki/Q1589327","display_name":"Memory hierarchy","level":3,"score":0.5513970255851746},{"id":"https://openalex.org/C188045654","wikidata":"https://www.wikidata.org/wiki/Q17148339","display_name":"Memory bandwidth","level":2,"score":0.543502926826477},{"id":"https://openalex.org/C139352143","wikidata":"https://www.wikidata.org/wiki/Q82571","display_name":"Linear algebra","level":2,"score":0.5044924020767212},{"id":"https://openalex.org/C459310","wikidata":"https://www.wikidata.org/wiki/Q117801","display_name":"Computational science","level":1,"score":0.4671454131603241},{"id":"https://openalex.org/C2778119891","wikidata":"https://www.wikidata.org/wiki/Q477690","display_name":"CUDA","level":2,"score":0.46045663952827454},{"id":"https://openalex.org/C2779851693","wikidata":"https://www.wikidata.org/wiki/Q183484","display_name":"Graphics processing unit","level":2,"score":0.44416117668151855},{"id":"https://openalex.org/C133095886","wikidata":"https://www.wikidata.org/wiki/Q1307173","display_name":"Single-precision floating-point format","level":3,"score":0.4279242753982544},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.21078550815582275},{"id":"https://openalex.org/C115537543","wikidata":"https://www.wikidata.org/wiki/Q165596","display_name":"Cache","level":2,"score":0.15406650304794312},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.10563936829566956},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.09091487526893616},{"id":"https://openalex.org/C2524010","wikidata":"https://www.wikidata.org/wiki/Q8087","display_name":"Geometry","level":1,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1142/s0129626417500062","is_oa":false,"landing_page_url":"https://doi.org/10.1142/s0129626417500062","pdf_url":null,"source":{"id":"https://openalex.org/S18360026","display_name":"Parallel Processing Letters","issn_l":"0129-6264","issn":["0129-6264","1793-642X"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319815","host_organization_name":"World Scientific","host_organization_lineage":["https://openalex.org/P4310319815"],"host_organization_lineage_names":["World Scientific"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Parallel Processing Letters","raw_type":"journal-article"},{"id":"pmh:oai::73459","is_oa":false,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4306401429","display_name":"ePrints@IISc (Indian Institute of Science)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I59270414","host_organization_name":"Indian Institute of Science Bangalore","host_organization_lineage":["https://openalex.org/I59270414"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"acceptedVersion","is_accepted":true,"is_published":false,"raw_source_name":"","raw_type":"Journal Article"},{"id":"pmh:oai:dr.ntu.edu.sg:10356/141525","is_oa":false,"landing_page_url":"https://hdl.handle.net/10356/141525","pdf_url":null,"source":{"id":"https://openalex.org/S4306402609","display_name":"DR-NTU (Nanyang Technological University)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I172675005","host_organization_name":"Nanyang Technological University","host_organization_lineage":["https://openalex.org/I172675005"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Journal Article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Industry, innovation and infrastructure","score":0.4000000059604645,"id":"https://metadata.un.org/sdg/9"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":6,"referenced_works":["https://openalex.org/W1989831790","https://openalex.org/W2023965391","https://openalex.org/W2112178927","https://openalex.org/W2125457238","https://openalex.org/W2126148590","https://openalex.org/W4229666556"],"related_works":["https://openalex.org/W2239119680","https://openalex.org/W2963207152","https://openalex.org/W3177218348","https://openalex.org/W2156524298","https://openalex.org/W2065321717","https://openalex.org/W3097256929","https://openalex.org/W17610788","https://openalex.org/W2013660188","https://openalex.org/W1555620806","https://openalex.org/W2172042531"],"abstract_inverted_index":{"Basic":[0],"Linear":[1,6],"Algebra":[2,7],"Subprograms":[3],"(BLAS)":[4],"and":[5,21,41,75,105,124,131,161,174,182,204],"Package":[8],"(LAPACK)":[9],"form":[10],"basic":[11],"building":[12],"blocks":[13],"for":[14,100,111,142],"several":[15,39,135],"High":[16],"Performance":[17,29],"Computing":[18],"(HPC)":[19],"applications":[20],"hence":[22],"dictate":[23],"performance":[24,101,207],"of":[25,38,47,55,60,67,72,77,93,103,114,129,147,154,172,209],"the":[26,51,56,61,64,68,73,78,82,91,94,139,148,164,167],"HPC":[27],"applications.":[28],"in":[30,50,63,81,138,180,186,213],"such":[31,44],"tuned":[32],"packages":[33],"is":[34,159,211],"attained":[35],"through":[36],"tuning":[37,102],"algorithmic":[40],"architectural":[42],"parameters":[43,136],"as":[45],"number":[46],"parallel":[48],"operations":[49,118],"Directed":[52],"Acyclic":[53],"Graph":[54],"BLAS/LAPACK":[57],"routines,":[58],"sizes":[59],"memories":[62],"memory":[65],"hierarchy":[66],"underlying":[69,83],"platform,":[70],"bandwidth":[71],"memory,":[74],"structure":[76],"compute":[79],"resources":[80],"platform.":[84],"In":[85],"this":[86],"paper,":[87],"we":[88],"closely":[89],"investigate":[90],"impact":[92],"Floating":[95],"Point":[96],"Unit":[97,197],"(FPU)":[98],"micro-architecture":[99],"BLAS":[104,130,173],"LAPACK.":[106],"We":[107],"present":[108],"theoretical":[109,140],"analysis":[110],"pipeline":[112,145],"depth":[113,146],"different":[115],"floating":[116,149],"point":[117],"like":[119],"multiplier,":[120],"adder,":[121],"square":[122],"root,":[123],"divider":[125],"followed":[126],"by":[127,176],"characterization":[128],"LAPACK":[132,175],"to":[133,178,184,191],"determine":[134],"required":[137],"framework":[141],"deciding":[143],"optimum":[144],"operations.":[150],"A":[151],"simple":[152],"design":[153],"a":[155],"Processing":[156,196],"Element":[157],"(PE)":[158],"presented":[160],"shown":[162],"that":[163],"PE":[165],"outperforms":[166],"most":[168],"recent":[169],"custom":[170],"realizations":[171],"1.1X":[177],"1.5X":[179],"GFlops/W,":[181],"1.9X":[183],"2.1X":[185],"Gflops/mm":[187],"2":[188],".":[189],"Compared":[190],"multicore,":[192],"General":[193],"Purpose":[194],"Graphics":[195],"(GPGPU),":[198],"Field":[199],"Programmable":[200],"Gate":[201],"Array":[202],"(FPGA),":[203],"ClearSpeed":[205],"CSX700,":[206],"improvement":[208],"1.8-80x":[210],"reported":[212],"PE.":[214]},"counts_by_year":[{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":1},{"year":2021,"cited_by_count":2},{"year":2020,"cited_by_count":1},{"year":2019,"cited_by_count":3},{"year":2018,"cited_by_count":2}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
