{"id":"https://openalex.org/W2163687928","doi":"https://doi.org/10.1109/tpds.2016.2549523","title":"Dissecting GPU Memory Hierarchy Through Microbenchmarking","display_name":"Dissecting GPU Memory Hierarchy Through Microbenchmarking","publication_year":2016,"publication_date":"2016-03-31","ids":{"openalex":"https://openalex.org/W2163687928","doi":"https://doi.org/10.1109/tpds.2016.2549523","mag":"2163687928"},"language":"en","primary_location":{"id":"doi:10.1109/tpds.2016.2549523","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpds.2016.2549523","pdf_url":null,"source":{"id":"https://openalex.org/S97130795","display_name":"IEEE Transactions on Parallel and Distributed Systems","issn_l":"1045-9219","issn":["1045-9219","1558-2183","2161-9883"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Parallel and Distributed Systems","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5003613956","display_name":"Xinxin Mei","orcid":"https://orcid.org/0000-0003-1046-5269"},"institutions":[{"id":"https://openalex.org/I141568987","display_name":"Hong Kong Baptist University","ror":"https://ror.org/0145fw131","country_code":"HK","type":"education","lineage":["https://openalex.org/I141568987"]}],"countries":["HK"],"is_corresponding":true,"raw_author_name":"Xinxin Mei","raw_affiliation_strings":["Department of Computer Science, Hong Kong Baptist University, Kowloon, Hong Kong"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science, Hong Kong Baptist University, Kowloon, Hong Kong","institution_ids":["https://openalex.org/I141568987"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100730785","display_name":"Xiaowen Chu","orcid":"https://orcid.org/0000-0001-9745-4372"},"institutions":[{"id":"https://openalex.org/I141568987","display_name":"Hong Kong Baptist University","ror":"https://ror.org/0145fw131","country_code":"HK","type":"education","lineage":["https://openalex.org/I141568987"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Xiaowen Chu","raw_affiliation_strings":["Department of Computer Science, Hong Kong Baptist University, Kowloon, Hong Kong","HKBU Institute of Research and Continuing Education"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science, Hong Kong Baptist University, Kowloon, Hong Kong","institution_ids":["https://openalex.org/I141568987"]},{"raw_affiliation_string":"HKBU Institute of Research and Continuing Education","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5003613956"],"corresponding_institution_ids":["https://openalex.org/I141568987"],"apc_list":null,"apc_paid":null,"fwci":31.0376,"has_fulltext":false,"cited_by_count":207,"citation_normalized_percentile":{"value":0.99936283,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":99,"max":100},"biblio":{"volume":"28","issue":"1","first_page":"72","last_page":"86"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12808","display_name":"Ferroelectric and Negative Capacitance Devices","score":0.9987999796867371,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8909803628921509},{"id":"https://openalex.org/keywords/translation-lookaside-buffer","display_name":"Translation lookaside buffer","score":0.7889519929885864},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.7469685077667236},{"id":"https://openalex.org/keywords/memory-hierarchy","display_name":"Memory hierarchy","score":0.7267226576805115},{"id":"https://openalex.org/keywords/cache","display_name":"Cache","score":0.6206923127174377},{"id":"https://openalex.org/keywords/cuda","display_name":"CUDA","score":0.5792441964149475},{"id":"https://openalex.org/keywords/shared-memory","display_name":"Shared memory","score":0.516762912273407},{"id":"https://openalex.org/keywords/graphics","display_name":"Graphics","score":0.4569641947746277},{"id":"https://openalex.org/keywords/memory-management","display_name":"Memory management","score":0.4229954779148102},{"id":"https://openalex.org/keywords/general-purpose-computing-on-graphics-processing-units","display_name":"General-purpose computing on graphics processing units","score":0.4220421314239502},{"id":"https://openalex.org/keywords/out-of-core-algorithm","display_name":"Out-of-core algorithm","score":0.41034919023513794},{"id":"https://openalex.org/keywords/physical-address","display_name":"Physical address","score":0.2739796042442322},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.2263105809688568},{"id":"https://openalex.org/keywords/overlay","display_name":"Overlay","score":0.13376104831695557}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8909803628921509},{"id":"https://openalex.org/C116007543","wikidata":"https://www.wikidata.org/wiki/Q1071403","display_name":"Translation lookaside buffer","level":4,"score":0.7889519929885864},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.7469685077667236},{"id":"https://openalex.org/C2778100165","wikidata":"https://www.wikidata.org/wiki/Q1589327","display_name":"Memory hierarchy","level":3,"score":0.7267226576805115},{"id":"https://openalex.org/C115537543","wikidata":"https://www.wikidata.org/wiki/Q165596","display_name":"Cache","level":2,"score":0.6206923127174377},{"id":"https://openalex.org/C2778119891","wikidata":"https://www.wikidata.org/wiki/Q477690","display_name":"CUDA","level":2,"score":0.5792441964149475},{"id":"https://openalex.org/C133875982","wikidata":"https://www.wikidata.org/wiki/Q764810","display_name":"Shared memory","level":2,"score":0.516762912273407},{"id":"https://openalex.org/C21442007","wikidata":"https://www.wikidata.org/wiki/Q1027879","display_name":"Graphics","level":2,"score":0.4569641947746277},{"id":"https://openalex.org/C176649486","wikidata":"https://www.wikidata.org/wiki/Q2308807","display_name":"Memory management","level":3,"score":0.4229954779148102},{"id":"https://openalex.org/C50630238","wikidata":"https://www.wikidata.org/wiki/Q971505","display_name":"General-purpose computing on graphics processing units","level":3,"score":0.4220421314239502},{"id":"https://openalex.org/C79470037","wikidata":"https://www.wikidata.org/wiki/Q279748","display_name":"Out-of-core algorithm","level":2,"score":0.41034919023513794},{"id":"https://openalex.org/C41036726","wikidata":"https://www.wikidata.org/wiki/Q844824","display_name":"Physical address","level":3,"score":0.2739796042442322},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.2263105809688568},{"id":"https://openalex.org/C136085584","wikidata":"https://www.wikidata.org/wiki/Q910289","display_name":"Overlay","level":2,"score":0.13376104831695557}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/tpds.2016.2549523","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpds.2016.2549523","pdf_url":null,"source":{"id":"https://openalex.org/S97130795","display_name":"IEEE Transactions on Parallel and Distributed Systems","issn_l":"1045-9219","issn":["1045-9219","1558-2183","2161-9883"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Parallel and Distributed Systems","raw_type":"journal-article"},{"id":"pmh:oai:repository.hkust.edu.hk:1783.1-159345","is_oa":false,"landing_page_url":"http://repository.hkust.edu.hk/ir/Record/1783.1-159345","pdf_url":null,"source":{"id":"https://openalex.org/S4306401796","display_name":"Rare & Special e-Zone (The Hong Kong University of Science and Technology)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I200769079","host_organization_name":"Hong Kong University of Science and Technology","host_organization_lineage":["https://openalex.org/I200769079"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G5840791012","display_name":null,"funder_award_id":"210412","funder_id":"https://openalex.org/F4320320955","funder_display_name":"Hong Kong Baptist University"},{"id":"https://openalex.org/G85201831","display_name":null,"funder_award_id":"FRG2/14-15/059","funder_id":"https://openalex.org/F4320320955","funder_display_name":"Hong Kong Baptist University"}],"funders":[{"id":"https://openalex.org/F4320306709","display_name":"Glaucoma Research Foundation","ror":"https://ror.org/05ez53b31"},{"id":"https://openalex.org/F4320309480","display_name":"Nvidia","ror":"https://ror.org/03jdj4y14"},{"id":"https://openalex.org/F4320320955","display_name":"Hong Kong Baptist University","ror":"https://ror.org/0145fw131"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":31,"referenced_works":["https://openalex.org/W192356505","https://openalex.org/W1513924506","https://openalex.org/W1583189038","https://openalex.org/W1992851788","https://openalex.org/W2000335122","https://openalex.org/W2016352575","https://openalex.org/W2020572638","https://openalex.org/W2024122052","https://openalex.org/W2032980190","https://openalex.org/W2054782014","https://openalex.org/W2062527253","https://openalex.org/W2063186542","https://openalex.org/W2084309410","https://openalex.org/W2085467029","https://openalex.org/W2105321788","https://openalex.org/W2110195531","https://openalex.org/W2113190809","https://openalex.org/W2115283682","https://openalex.org/W2117610793","https://openalex.org/W2130336316","https://openalex.org/W2147193503","https://openalex.org/W2150953684","https://openalex.org/W2160875256","https://openalex.org/W2163932369","https://openalex.org/W3141650078","https://openalex.org/W3143589697","https://openalex.org/W4237024478","https://openalex.org/W4239667456","https://openalex.org/W4256231890","https://openalex.org/W6675903601","https://openalex.org/W7065096706"],"related_works":["https://openalex.org/W2350803493","https://openalex.org/W1586753310","https://openalex.org/W2121380786","https://openalex.org/W2114591121","https://openalex.org/W1944865817","https://openalex.org/W2654056874","https://openalex.org/W192356505","https://openalex.org/W4230093848","https://openalex.org/W2951075198","https://openalex.org/W2163687928"],"abstract_inverted_index":{"Memory":[0],"access":[1,96],"efficiency":[2],"is":[3,136],"a":[4,37,109],"key":[5],"factor":[6],"in":[7,155],"fully":[8],"utilizing":[9],"the":[10,22,58,69,78,81,85,93,113,121,130,137,142,151],"computational":[11],"power":[12],"of":[13,21,48,62,71,98,112,126,132,145,153],"graphics":[14],"processing":[15],"units":[16],"(GPUs).":[17],"However,":[18],"many":[19],"details":[20],"GPU":[23,30,73,99,115,127],"memory":[24,64,101,116,157],"hierarchy":[25],"are":[26],"not":[27],"released":[28],"by":[29],"vendors.":[31],"In":[32],"this":[33,135],"paper,":[34],"we":[35,67],"propose":[36],"novel":[38],"fine-grained":[39],"microbenchmarking":[40],"approach":[41],"and":[42,54,84,95,102,124,147,150],"apply":[43],"it":[44],"to":[45,56,140],"three":[46],"generations":[47],"NVIDIA":[49],"GPUs,":[50,149],"namely":[51],"Fermi,":[52],"Kepler,":[53],"Maxwell,":[55],"expose":[57],"previously":[59],"unknown":[60],"characteristics":[61],"their":[63],"hierarchies.":[65],"Specifically,":[66],"investigate":[68,92],"structures":[70],"different":[72],"cache":[74,83,143],"systems,":[75],"such":[76],"as":[77],"data":[79],"cache,":[80],"texture":[82],"translation":[86],"look-aside":[87],"buffer":[88],"(TLB).":[89],"We":[90],"also":[91],"throughput":[94],"latency":[97],"global":[100],"shared":[103,156],"memory.":[104],"Our":[105],"microbenchmark":[106],"results":[107],"offer":[108],"better":[110],"understanding":[111],"mysterious":[114],"hierarchy,":[117],"which":[118],"will":[119],"facilitate":[120],"software":[122],"optimization":[123],"modelling":[125],"architectures.":[128],"To":[129],"best":[131],"our":[133],"knowledge,":[134],"first":[138],"study":[139],"reveal":[141],"properties":[144],"Kepler":[146],"Maxwell":[148,154],"superiority":[152],"performance":[158],"under":[159],"bank":[160],"conflict.":[161]},"counts_by_year":[{"year":2026,"cited_by_count":3},{"year":2025,"cited_by_count":17},{"year":2024,"cited_by_count":14},{"year":2023,"cited_by_count":15},{"year":2022,"cited_by_count":14},{"year":2021,"cited_by_count":22},{"year":2020,"cited_by_count":25},{"year":2019,"cited_by_count":32},{"year":2018,"cited_by_count":28},{"year":2017,"cited_by_count":28},{"year":2016,"cited_by_count":9}],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-10-10T00:00:00"}
