{"id":"https://openalex.org/W7116306252","doi":"https://doi.org/10.1145/3754598.3754661","title":"Optimizing NumPy with SVE Acceleration on ARM Architectures","display_name":"Optimizing NumPy with SVE Acceleration on ARM Architectures","publication_year":2025,"publication_date":"2025-09-08","ids":{"openalex":"https://openalex.org/W7116306252","doi":"https://doi.org/10.1145/3754598.3754661"},"language":null,"primary_location":{"id":"doi:10.1145/3754598.3754661","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3754598.3754661","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 54th International Conference on Parallel Processing","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3754598.3754661","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5120887093","display_name":"Kuldeep Pal","orcid":null},"institutions":[{"id":"https://openalex.org/I1331500379","display_name":"Centre for Development of Advanced Computing","ror":"https://ror.org/022abst40","country_code":"IN","type":"facility","lineage":["https://openalex.org/I1331500379","https://openalex.org/I4210121746"]}],"countries":["IN"],"is_corresponding":true,"raw_author_name":"Kuldeep Pal","raw_affiliation_strings":["CDAC, Bengaluru, India"],"raw_orcid":"https://orcid.org/0009-0003-1401-4876","affiliations":[{"raw_affiliation_string":"CDAC, Bengaluru, India","institution_ids":["https://openalex.org/I1331500379"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5116901275","display_name":"Aniket P. Garade","orcid":null},"institutions":[{"id":"https://openalex.org/I1331500379","display_name":"Centre for Development of Advanced Computing","ror":"https://ror.org/022abst40","country_code":"IN","type":"facility","lineage":["https://openalex.org/I1331500379","https://openalex.org/I4210121746"]}],"countries":["IN"],"is_corresponding":false,"raw_author_name":"Aniket P. Garade","raw_affiliation_strings":["CDAC, Bengaluru, India"],"raw_orcid":"https://orcid.org/0009-0002-9676-8796","affiliations":[{"raw_affiliation_string":"CDAC, Bengaluru, India","institution_ids":["https://openalex.org/I1331500379"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120926115","display_name":"Deepika H. V","orcid":null},"institutions":[{"id":"https://openalex.org/I1331500379","display_name":"Centre for Development of Advanced Computing","ror":"https://ror.org/022abst40","country_code":"IN","type":"facility","lineage":["https://openalex.org/I1331500379","https://openalex.org/I4210121746"]}],"countries":["IN"],"is_corresponding":false,"raw_author_name":"Deepika H. V","raw_affiliation_strings":["CDAC, Bengaluru, India"],"raw_orcid":"https://orcid.org/0009-0004-7836-7037","affiliations":[{"raw_affiliation_string":"CDAC, Bengaluru, India","institution_ids":["https://openalex.org/I1331500379"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120886129","display_name":"Haribabu P","orcid":null},"institutions":[{"id":"https://openalex.org/I1331500379","display_name":"Centre for Development of Advanced Computing","ror":"https://ror.org/022abst40","country_code":"IN","type":"facility","lineage":["https://openalex.org/I1331500379","https://openalex.org/I4210121746"]}],"countries":["IN"],"is_corresponding":false,"raw_author_name":"Haribabu P","raw_affiliation_strings":["CDAC, Bengaluru, India"],"raw_orcid":"https://orcid.org/0009-0000-1727-556X","affiliations":[{"raw_affiliation_string":"CDAC, Bengaluru, India","institution_ids":["https://openalex.org/I1331500379"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120931201","display_name":"S. A. Kumar","orcid":null},"institutions":[{"id":"https://openalex.org/I1331500379","display_name":"Centre for Development of Advanced Computing","ror":"https://ror.org/022abst40","country_code":"IN","type":"facility","lineage":["https://openalex.org/I1331500379","https://openalex.org/I4210121746"]}],"countries":["IN"],"is_corresponding":false,"raw_author_name":"S. A. Kumar","raw_affiliation_strings":["CDAC, Bengaluru, India"],"raw_orcid":"https://orcid.org/0009-0001-3141-077X","affiliations":[{"raw_affiliation_string":"CDAC, Bengaluru, India","institution_ids":["https://openalex.org/I1331500379"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5120919179","display_name":"S. D. Sudarsan","orcid":null},"institutions":[{"id":"https://openalex.org/I1331500379","display_name":"Centre for Development of Advanced Computing","ror":"https://ror.org/022abst40","country_code":"IN","type":"facility","lineage":["https://openalex.org/I1331500379","https://openalex.org/I4210121746"]}],"countries":["IN"],"is_corresponding":false,"raw_author_name":"S. D. Sudarsan","raw_affiliation_strings":["CDAC, Bengaluru, India"],"raw_orcid":"https://orcid.org/0009-0006-5510-2276","affiliations":[{"raw_affiliation_string":"CDAC, Bengaluru, India","institution_ids":["https://openalex.org/I1331500379"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5120887093"],"corresponding_institution_ids":["https://openalex.org/I1331500379"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.64462725,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"774","last_page":"783"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9477999806404114,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9477999806404114,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11697","display_name":"Numerical Methods and Algorithms","score":0.008899999782443047,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13650","display_name":"Computational Physics and Python Applications","score":0.006899999920278788,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.6765999794006348},{"id":"https://openalex.org/keywords/compiler","display_name":"Compiler","score":0.5364999771118164},{"id":"https://openalex.org/keywords/acceleration","display_name":"Acceleration","score":0.4056999981403351},{"id":"https://openalex.org/keywords/upgrade","display_name":"Upgrade","score":0.3783999979496002},{"id":"https://openalex.org/keywords/supercomputer","display_name":"Supercomputer","score":0.3587999939918518},{"id":"https://openalex.org/keywords/emulation","display_name":"Emulation","score":0.3400000035762787}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7660999894142151},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.6765999794006348},{"id":"https://openalex.org/C169590947","wikidata":"https://www.wikidata.org/wiki/Q47506","display_name":"Compiler","level":2,"score":0.5364999771118164},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.5268999934196472},{"id":"https://openalex.org/C117896860","wikidata":"https://www.wikidata.org/wiki/Q11376","display_name":"Acceleration","level":2,"score":0.4056999981403351},{"id":"https://openalex.org/C2780615140","wikidata":"https://www.wikidata.org/wiki/Q920419","display_name":"Upgrade","level":2,"score":0.3783999979496002},{"id":"https://openalex.org/C459310","wikidata":"https://www.wikidata.org/wiki/Q117801","display_name":"Computational science","level":1,"score":0.3594000041484833},{"id":"https://openalex.org/C83283714","wikidata":"https://www.wikidata.org/wiki/Q121117","display_name":"Supercomputer","level":2,"score":0.3587999939918518},{"id":"https://openalex.org/C149810388","wikidata":"https://www.wikidata.org/wiki/Q5374873","display_name":"Emulation","level":2,"score":0.3400000035762787},{"id":"https://openalex.org/C162319229","wikidata":"https://www.wikidata.org/wiki/Q175263","display_name":"Data structure","level":2,"score":0.29989999532699585},{"id":"https://openalex.org/C150552126","wikidata":"https://www.wikidata.org/wiki/Q339387","display_name":"SIMD","level":2,"score":0.27639999985694885},{"id":"https://openalex.org/C176649486","wikidata":"https://www.wikidata.org/wiki/Q2308807","display_name":"Memory management","level":3,"score":0.26019999384880066},{"id":"https://openalex.org/C32900221","wikidata":"https://www.wikidata.org/wiki/Q181365","display_name":"Dot product","level":2,"score":0.25850000977516174},{"id":"https://openalex.org/C57691317","wikidata":"https://www.wikidata.org/wiki/Q1289248","display_name":"Scalar (mathematics)","level":2,"score":0.2581999897956848},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.25189998745918274}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3754598.3754661","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3754598.3754661","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 54th International Conference on Parallel Processing","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3754598.3754661","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3754598.3754661","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 54th International Conference on Parallel Processing","raw_type":"proceedings-article"},"sustainable_development_goals":[{"display_name":"Life in Land","id":"https://metadata.un.org/sdg/15","score":0.40844738483428955}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":12,"referenced_works":["https://openalex.org/W2084379367","https://openalex.org/W2613264175","https://openalex.org/W2939705846","https://openalex.org/W2995010870","https://openalex.org/W3035965352","https://openalex.org/W3149597153","https://openalex.org/W3215608660","https://openalex.org/W4360995240","https://openalex.org/W4384705465","https://openalex.org/W4385374184","https://openalex.org/W4389491899","https://openalex.org/W4409132226"],"related_works":[],"abstract_inverted_index":{"NumPy,":[0,26],"a":[1,21,65,172,190],"cornerstone":[2],"of":[3,42,55],"Python\u2019s":[4],"scientific":[5,179],"ecosystem,":[6],"often":[7],"encounters":[8],"performance":[9,31,78],"bottlenecks":[10],"on":[11,80,108,183],"cutting-edge":[12],"ARM-based":[13,185],"High-Performance":[14],"Computing":[15],"(HPC)":[16],"architectures.":[17],"This":[18],"paper":[19],"presents":[20],"comprehensive":[22],"optimization":[23],"strategy":[24],"for":[25,29,59,72,175],"meticulously":[27],"designed":[28],"enhancing":[30],"across":[32,86],"modern":[33],"ARM":[34,43],"HPC":[35,186],"architectures,":[36],"by":[37,124],"exploiting":[38],"the":[39,51,81,87,103,165],"full":[40],"potential":[41],"Scalable":[44],"Vector":[45],"Extensions":[46],"(SVE).":[47],"Our":[48,169],"research":[49],"details":[50],"design":[52],"and":[53,70,129,139,160,164,177],"implementation":[54],"hand-tuned":[56],"SVE":[57,153],"kernels":[58],"critical":[60],"NumPy":[61,105,182],"functions,":[62],"integrated":[63],"with":[64],"runtime":[66,166],"SIMD":[67],"dispatch":[68,167],"mechanism":[69],"techniques":[71],"efficient":[73,157,176],"vector":[74,158],"memory":[75,161],"access.":[76],"Extensive":[77],"evaluation":[79],"A64FX":[82],"demonstrates":[83],"significant":[84],"speedups":[85],"board:":[88],"basic":[89],"arithmetic":[90],"operations":[91],"(addition,":[92],"subtraction,":[93],"multiplication)":[94],"achieve":[95],"up":[96,135,142],"to":[97,102,136,143,193],"1300":[98],"\u00d7":[99,138],"acceleration":[100],"compared":[101],"standard":[104],"build":[106],"relying":[107],"compiler":[109],"auto-vectorization":[110],"or":[111],"scalar":[112],"fallbacks,":[113],"vector-vector":[114],"dot":[115,121],"products":[116,122],"reach":[117],"150":[118],"\u00d7,":[119,126],"matrix-vector":[120],"improve":[123],"14":[125],"while":[127],"Blocked":[128],"Batched":[130],"GEMM":[131,141],"variants":[132],"see":[133],"gains":[134,148],"44":[137],"Strassen":[140],"20":[144],"\u00d7.":[145],"These":[146],"substantial":[147],"stem":[149],"from":[150],"carefully":[151],"crafted":[152],"kernels,":[154],"leveraging":[155],"SVE\u2019s":[156],"instructions":[159],"access":[162],"patterns,":[163],"mechanism.":[168],"approach":[170],"sets":[171],"new":[173],"precedent":[174],"scalable":[178],"computing":[180],"using":[181],"next-generation":[184],"systems,":[187],"making":[188],"it":[189],"viable":[191],"alternative":[192],"proprietary":[194],"libraries.":[195]},"counts_by_year":[],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-12-21T00:00:00"}
