{"id":"https://openalex.org/W4414405050","doi":"https://doi.org/10.1109/clusterworkshops65972.2025.11164213","title":"GPU-CPU Shared Memory Performance Analysis on NVIDIA GH200","display_name":"GPU-CPU Shared Memory Performance Analysis on NVIDIA GH200","publication_year":2025,"publication_date":"2025-09-02","ids":{"openalex":"https://openalex.org/W4414405050","doi":"https://doi.org/10.1109/clusterworkshops65972.2025.11164213"},"language":"en","primary_location":{"id":"doi:10.1109/clusterworkshops65972.2025.11164213","is_oa":false,"landing_page_url":"https://doi.org/10.1109/clusterworkshops65972.2025.11164213","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Cluster Computing Workshops (CLUSTER Workshops)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5084940217","display_name":"Norihisa Fujita","orcid":"https://orcid.org/0000-0002-5386-7623"},"institutions":[{"id":"https://openalex.org/I146399215","display_name":"University of Tsukuba","ror":"https://ror.org/02956yf07","country_code":"JP","type":"education","lineage":["https://openalex.org/I146399215"]}],"countries":["JP"],"is_corresponding":true,"raw_author_name":"Norihisa Fujita","raw_affiliation_strings":["University of Tsukuba,Center for Computational Sciences,Tsukuba,Japan"],"affiliations":[{"raw_affiliation_string":"University of Tsukuba,Center for Computational Sciences,Tsukuba,Japan","institution_ids":["https://openalex.org/I146399215"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5020716792","display_name":"Taisuke Boku","orcid":"https://orcid.org/0000-0001-8730-2228"},"institutions":[{"id":"https://openalex.org/I146399215","display_name":"University of Tsukuba","ror":"https://ror.org/02956yf07","country_code":"JP","type":"education","lineage":["https://openalex.org/I146399215"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Taisuke Boku","raw_affiliation_strings":["University of Tsukuba,Center for Computational Sciences,Tsukuba,Japan"],"affiliations":[{"raw_affiliation_string":"University of Tsukuba,Center for Computational Sciences,Tsukuba,Japan","institution_ids":["https://openalex.org/I146399215"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Tomo Yoshida","orcid":null},"institutions":[{"id":"https://openalex.org/I146399215","display_name":"University of Tsukuba","ror":"https://ror.org/02956yf07","country_code":"JP","type":"education","lineage":["https://openalex.org/I146399215"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Tomo Yoshida","raw_affiliation_strings":["University of Tsukuba,Degree Programs in Systems and Information Engineering,Tsukuba,Japan"],"affiliations":[{"raw_affiliation_string":"University of Tsukuba,Degree Programs in Systems and Information Engineering,Tsukuba,Japan","institution_ids":["https://openalex.org/I146399215"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Takuto Shirai","orcid":null},"institutions":[{"id":"https://openalex.org/I146399215","display_name":"University of Tsukuba","ror":"https://ror.org/02956yf07","country_code":"JP","type":"education","lineage":["https://openalex.org/I146399215"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Takuto Shirai","raw_affiliation_strings":["University of Tsukuba,Degree Programs in Systems and Information Engineering,Tsukuba,Japan"],"affiliations":[{"raw_affiliation_string":"University of Tsukuba,Degree Programs in Systems and Information Engineering,Tsukuba,Japan","institution_ids":["https://openalex.org/I146399215"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5079973660","display_name":"Miwako Tsuji","orcid":"https://orcid.org/0000-0003-4709-1969"},"institutions":[{"id":"https://openalex.org/I146399215","display_name":"University of Tsukuba","ror":"https://ror.org/02956yf07","country_code":"JP","type":"education","lineage":["https://openalex.org/I146399215"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Miwako Tsuji","raw_affiliation_strings":["University of Tsukuba,Center for Computational Sciences,Tsukuba,Japan"],"affiliations":[{"raw_affiliation_string":"University of Tsukuba,Center for Computational Sciences,Tsukuba,Japan","institution_ids":["https://openalex.org/I146399215"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5084940217"],"corresponding_institution_ids":["https://openalex.org/I146399215"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.28500979,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"2"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.7197999954223633,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.7197999954223633,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.6437000036239624,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.5927000045776367,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/uniform-memory-access","display_name":"Uniform memory access","score":0.708299994468689},{"id":"https://openalex.org/keywords/shared-memory","display_name":"Shared memory","score":0.6230999827384949},{"id":"https://openalex.org/keywords/cache-only-memory-architecture","display_name":"Cache-only memory architecture","score":0.6101999878883362},{"id":"https://openalex.org/keywords/memory-map","display_name":"Memory map","score":0.6093999743461609},{"id":"https://openalex.org/keywords/flat-memory-model","display_name":"Flat memory model","score":0.6021999716758728},{"id":"https://openalex.org/keywords/interleaved-memory","display_name":"Interleaved memory","score":0.5925999879837036},{"id":"https://openalex.org/keywords/memory-management","display_name":"Memory management","score":0.5228000283241272},{"id":"https://openalex.org/keywords/central-processing-unit","display_name":"Central processing unit","score":0.5052000284194946},{"id":"https://openalex.org/keywords/distributed-shared-memory","display_name":"Distributed shared memory","score":0.4896000027656555},{"id":"https://openalex.org/keywords/non-uniform-memory-access","display_name":"Non-uniform memory access","score":0.487199991941452}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8388000130653381},{"id":"https://openalex.org/C51290061","wikidata":"https://www.wikidata.org/wiki/Q1936765","display_name":"Uniform memory access","level":4,"score":0.708299994468689},{"id":"https://openalex.org/C133875982","wikidata":"https://www.wikidata.org/wiki/Q764810","display_name":"Shared memory","level":2,"score":0.6230999827384949},{"id":"https://openalex.org/C3720319","wikidata":"https://www.wikidata.org/wiki/Q5015937","display_name":"Cache-only memory architecture","level":5,"score":0.6101999878883362},{"id":"https://openalex.org/C74426580","wikidata":"https://www.wikidata.org/wiki/Q719484","display_name":"Memory map","level":3,"score":0.6093999743461609},{"id":"https://openalex.org/C57863822","wikidata":"https://www.wikidata.org/wiki/Q905488","display_name":"Flat memory model","level":4,"score":0.6021999716758728},{"id":"https://openalex.org/C63511323","wikidata":"https://www.wikidata.org/wiki/Q908936","display_name":"Interleaved memory","level":4,"score":0.5925999879837036},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.5357999801635742},{"id":"https://openalex.org/C176649486","wikidata":"https://www.wikidata.org/wiki/Q2308807","display_name":"Memory management","level":3,"score":0.5228000283241272},{"id":"https://openalex.org/C49154492","wikidata":"https://www.wikidata.org/wiki/Q5300","display_name":"Central processing unit","level":2,"score":0.5052000284194946},{"id":"https://openalex.org/C39528615","wikidata":"https://www.wikidata.org/wiki/Q1229610","display_name":"Distributed shared memory","level":5,"score":0.4896000027656555},{"id":"https://openalex.org/C133371097","wikidata":"https://www.wikidata.org/wiki/Q868014","display_name":"Non-uniform memory access","level":5,"score":0.487199991941452},{"id":"https://openalex.org/C93446704","wikidata":"https://www.wikidata.org/wiki/Q449328","display_name":"Registered memory","level":3,"score":0.47049999237060547},{"id":"https://openalex.org/C9390403","wikidata":"https://www.wikidata.org/wiki/Q3966","display_name":"Computer hardware","level":1,"score":0.424699991941452},{"id":"https://openalex.org/C98986596","wikidata":"https://www.wikidata.org/wiki/Q1143031","display_name":"Semiconductor memory","level":2,"score":0.42399999499320984},{"id":"https://openalex.org/C115537543","wikidata":"https://www.wikidata.org/wiki/Q165596","display_name":"Cache","level":2,"score":0.41260001063346863},{"id":"https://openalex.org/C189783530","wikidata":"https://www.wikidata.org/wiki/Q352090","display_name":"CPU cache","level":3,"score":0.40720000863075256},{"id":"https://openalex.org/C188045654","wikidata":"https://www.wikidata.org/wiki/Q17148339","display_name":"Memory bandwidth","level":2,"score":0.3919000029563904},{"id":"https://openalex.org/C91481028","wikidata":"https://www.wikidata.org/wiki/Q1054686","display_name":"Distributed memory","level":3,"score":0.38519999384880066},{"id":"https://openalex.org/C153247305","wikidata":"https://www.wikidata.org/wiki/Q835713","display_name":"Memory address","level":3,"score":0.3847000002861023},{"id":"https://openalex.org/C41036726","wikidata":"https://www.wikidata.org/wiki/Q844824","display_name":"Physical address","level":3,"score":0.3732999861240387},{"id":"https://openalex.org/C201148951","wikidata":"https://www.wikidata.org/wiki/Q5015976","display_name":"Cache coloring","level":4,"score":0.37310001254081726},{"id":"https://openalex.org/C2779602883","wikidata":"https://www.wikidata.org/wiki/Q15544750","display_name":"Memory architecture","level":2,"score":0.3725999891757965},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.350600004196167},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.3476000130176544},{"id":"https://openalex.org/C180613757","wikidata":"https://www.wikidata.org/wiki/Q5013757","display_name":"CPU shielding","level":3,"score":0.3375000059604645},{"id":"https://openalex.org/C171675096","wikidata":"https://www.wikidata.org/wiki/Q1143380","display_name":"Extended memory","level":4,"score":0.3343999981880188},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.32690000534057617},{"id":"https://openalex.org/C76399640","wikidata":"https://www.wikidata.org/wiki/Q189401","display_name":"Virtual memory","level":4,"score":0.32659998536109924},{"id":"https://openalex.org/C144240696","wikidata":"https://www.wikidata.org/wiki/Q367204","display_name":"Address space","level":2,"score":0.30889999866485596},{"id":"https://openalex.org/C92855701","wikidata":"https://www.wikidata.org/wiki/Q5830907","display_name":"Computer memory","level":3,"score":0.29589998722076416},{"id":"https://openalex.org/C18131444","wikidata":"https://www.wikidata.org/wiki/Q163585","display_name":"Memory protection","level":5,"score":0.2906000018119812},{"id":"https://openalex.org/C113166858","wikidata":"https://www.wikidata.org/wiki/Q5015981","display_name":"Cache pollution","level":5,"score":0.2831999957561493},{"id":"https://openalex.org/C53838383","wikidata":"https://www.wikidata.org/wiki/Q541148","display_name":"Conventional memory","level":5,"score":0.2793999910354614},{"id":"https://openalex.org/C34165917","wikidata":"https://www.wikidata.org/wiki/Q188267","display_name":"Programming paradigm","level":2,"score":0.27160000801086426},{"id":"https://openalex.org/C2778119891","wikidata":"https://www.wikidata.org/wiki/Q477690","display_name":"CUDA","level":2,"score":0.27059999108314514},{"id":"https://openalex.org/C136085584","wikidata":"https://www.wikidata.org/wiki/Q910289","display_name":"Overlay","level":2,"score":0.2639999985694885},{"id":"https://openalex.org/C82687282","wikidata":"https://www.wikidata.org/wiki/Q66221","display_name":"Auxiliary memory","level":2,"score":0.25920000672340393},{"id":"https://openalex.org/C194080101","wikidata":"https://www.wikidata.org/wiki/Q46306","display_name":"Access time","level":2,"score":0.25529998540878296}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/clusterworkshops65972.2025.11164213","is_oa":false,"landing_page_url":"https://doi.org/10.1109/clusterworkshops65972.2025.11164213","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Cluster Computing Workshops (CLUSTER Workshops)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320323954","display_name":"University of Tsukuba","ror":"https://ror.org/02956yf07"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"NVIDIA":[0],"has":[1],"been":[2],"providing":[3],"a":[4,46,53,95,154,190],"feature":[5],"to":[6,93,99,123,141],"share":[7,153],"the":[8,25,38,101,113,172],"memory":[9,57,105,114,125,143,166,181,197],"image":[10],"between":[11,87,106,183],"CPU":[12,89,107,120],"and":[13,42,90,108,116],"GPU":[14,92,109,138],"under":[15],"CUDA":[16],"environment,":[17],"named":[18,85],"UM":[19,35],"(Unified":[20],"Memory)":[21],"so":[22],"far.":[23],"However,":[24],"conventionla":[26],"CPU-GPU":[27,64],"connection":[28],"through":[29,167],"PCIe,":[30],"which":[31,69,170],"is":[32,52,70,75,121,139],"covered":[33],"by":[34,37],"driven":[36],"combination":[39],"of":[40,67,134,149,165,192],"hardware":[41,60,98],"software,":[43],"may":[44],"cause":[45],"serious":[47],"performance":[48,136],"degradation":[49],"because":[50],"it":[51],"virtually":[54],"implemented":[55],"shared":[56,104],"without":[58],"full":[59],"support.":[61],"The":[62],"new":[63],"unified":[65],"module":[66],"GH200":[68],"called":[71],"\u201cNVIDIA":[72],"Grace-Hopper":[73],"Superchip\u201d":[74],"equipped":[76],"with":[77,129,147,195],"an":[78],"advanced":[79],"intra-module":[80],"high":[81],"bandwidth":[82],"communication":[83],"channel":[84],"NVLINK-C2C":[86],"Grace":[88],"Hopper":[91],"provide":[94],"fully":[96],"functionable":[97],"enable":[100],"cache":[102,169],"coherent":[103,168],"[1].":[110],"On":[111],"GH200,":[112],"system":[115,194],"architecture":[117],"are":[118],"complicated.":[119],"attached":[122,140],"LPDR5X":[124],"(\u201cCPU":[126],"memory\u201d":[127,145],"hereafter)":[128,146],"$512":[130],"\\mathrm{~GB}":[131],"/":[132],"\\mathrm{s}$":[133],"peak":[135],"while":[137],"HBM3":[142],"(\u201cGPU":[144],"4TB/s":[148],"peak.":[150],"Two":[151],"devices":[152],"single":[155],"address":[156],"space":[157],"where":[158],"either":[159],"device":[160],"can":[161,186],"access":[162],"another":[163],"side":[164],"makes":[171],"user":[173],"programming":[174],"much":[175],"easier":[176],"than":[177],"traditional":[178],"CUDA-base":[179],"explicit":[180],"copy":[182],"devices.":[184],"It":[185],"be":[187],"understood":[188],"as":[189],"sort":[191],"NUMA":[193],"asymmetrical":[196],"domains.":[198]},"counts_by_year":[],"updated_date":"2026-04-21T08:09:41.155169","created_date":"2025-10-10T00:00:00"}
