{"id":"https://openalex.org/W4417002137","doi":"https://doi.org/10.48550/arxiv.2512.02189","title":"Microbenchmarking NVIDIA's Blackwell Architecture: An in-depth Architectural Analysis","display_name":"Microbenchmarking NVIDIA's Blackwell Architecture: An in-depth Architectural Analysis","publication_year":2025,"publication_date":"2025-12-01","ids":{"openalex":"https://openalex.org/W4417002137","doi":"https://doi.org/10.48550/arxiv.2512.02189"},"language":null,"primary_location":{"id":"pmh:oai:arXiv.org:2512.02189","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2512.02189","pdf_url":"https://arxiv.org/pdf/2512.02189","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2512.02189","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5047370789","display_name":"Aaron Jarmusch","orcid":"https://orcid.org/0000-0002-5532-6513"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Jarmusch, Aaron","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5009614578","display_name":"Sunita Chandrasekaran","orcid":"https://orcid.org/0000-0002-3560-9428"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chandrasekaran, Sunita","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5047370789"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.8389000296592712,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.8389000296592712,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.041999999433755875,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10363","display_name":"Low-power high-performance VLSI design","score":0.011800000444054604,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/suite","display_name":"Suite","score":0.7069000005722046},{"id":"https://openalex.org/keywords/tensor","display_name":"Tensor (intrinsic definition)","score":0.46639999747276306},{"id":"https://openalex.org/keywords/efficient-energy-use","display_name":"Efficient energy use","score":0.35530000925064087},{"id":"https://openalex.org/keywords/architectural-model","display_name":"Architectural model","score":0.34470000863075256},{"id":"https://openalex.org/keywords/architectural-pattern","display_name":"Architectural pattern","score":0.325300008058548},{"id":"https://openalex.org/keywords/architectural-design","display_name":"Architectural design","score":0.3091000020503998},{"id":"https://openalex.org/keywords/architecture-framework","display_name":"Architecture framework","score":0.3084000051021576}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7599999904632568},{"id":"https://openalex.org/C79581498","wikidata":"https://www.wikidata.org/wiki/Q1367530","display_name":"Suite","level":2,"score":0.7069000005722046},{"id":"https://openalex.org/C155281189","wikidata":"https://www.wikidata.org/wiki/Q3518150","display_name":"Tensor (intrinsic definition)","level":2,"score":0.46639999747276306},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.42590001225471497},{"id":"https://openalex.org/C2742236","wikidata":"https://www.wikidata.org/wiki/Q924713","display_name":"Efficient energy use","level":2,"score":0.35530000925064087},{"id":"https://openalex.org/C2778544944","wikidata":"https://www.wikidata.org/wiki/Q1352349","display_name":"Architectural model","level":3,"score":0.34470000863075256},{"id":"https://openalex.org/C72280835","wikidata":"https://www.wikidata.org/wiki/Q635346","display_name":"Architectural pattern","level":5,"score":0.325300008058548},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.32420000433921814},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.3179999887943268},{"id":"https://openalex.org/C2984241579","wikidata":"https://www.wikidata.org/wiki/Q323611","display_name":"Architectural design","level":3,"score":0.3091000020503998},{"id":"https://openalex.org/C53619493","wikidata":"https://www.wikidata.org/wiki/Q4787093","display_name":"Architecture framework","level":3,"score":0.3084000051021576},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.2946000099182129},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.289900004863739},{"id":"https://openalex.org/C2164484","wikidata":"https://www.wikidata.org/wiki/Q5170150","display_name":"Core (optical fiber)","level":2,"score":0.2676999866962433},{"id":"https://openalex.org/C175291020","wikidata":"https://www.wikidata.org/wiki/Q1156822","display_name":"Offset (computer science)","level":2,"score":0.26759999990463257},{"id":"https://openalex.org/C123593499","wikidata":"https://www.wikidata.org/wiki/Q6008583","display_name":"In-Memory Processing","level":5,"score":0.2662000060081482},{"id":"https://openalex.org/C459310","wikidata":"https://www.wikidata.org/wiki/Q117801","display_name":"Computational science","level":1,"score":0.25589999556541443},{"id":"https://openalex.org/C2986737658","wikidata":"https://www.wikidata.org/wiki/Q30103009","display_name":"Tensor decomposition","level":3,"score":0.2554999887943268},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.2551000118255615},{"id":"https://openalex.org/C99821215","wikidata":"https://www.wikidata.org/wiki/Q1136583","display_name":"Swap (finance)","level":2,"score":0.25200000405311584},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.25099998712539673}],"mesh":[],"locations_count":3,"locations":[{"id":"pmh:oai:arXiv.org:2512.02189","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2512.02189","pdf_url":"https://arxiv.org/pdf/2512.02189","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"pmh:doi:10.48550/arxiv.2512.02189","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2512.02189","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2512.02189","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2512.02189","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2512.02189","pdf_url":"https://arxiv.org/pdf/2512.02189","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G5600677258","display_name":null,"funder_award_id":"U.S. DOE","funder_id":"https://openalex.org/F4320306084","funder_display_name":"U.S. Department of Energy"}],"funders":[{"id":"https://openalex.org/F4320306084","display_name":"U.S. Department of Energy","ror":"https://ror.org/01bj3aw27"}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4417002137.pdf","grobid_xml":"https://content.openalex.org/works/W4417002137.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"As":[0],"GPU":[1,84,99],"architectures":[2],"rapidly":[3],"evolve":[4],"to":[5,75,91,109,115],"meet":[6],"the":[7,16,78,110,116],"growing":[8],"demands":[9],"of":[10,19,82,133],"exascale":[11],"computing":[12],"and":[13,46,96,106,122,135,140,152],"machine":[14],"learning,":[15],"performance":[17],"implications":[18],"architectural":[20,33,94],"innovations":[21],"remain":[22],"poorly":[23],"understood":[24],"across":[25],"diverse":[26],"workloads.":[27],"NVIDIA":[28],"Blackwell":[29,104],"(B200)":[30],"introduces":[31],"significant":[32],"advances,":[34],"including":[35],"fifth-generation":[36],"tensor":[37,39,119,146],"cores,":[38],"memory":[40,117],"(TMEM),":[41],"a":[42,47],"decompression":[43],"engine":[44],"(DE),":[45],"dual-chip":[48],"design;":[49],"however,":[50],"systematic":[51,131],"methodologies":[52],"for":[53],"quantifying":[54],"these":[55],"improvements":[56],"lag":[57],"behind":[58],"hardware":[59],"development":[60],"cycles.":[61],"We":[62,102],"contribute":[63],"an":[64],"open-source":[65],"microbenchmark":[66],"suite":[67],"that":[68,144],"provides":[69],"practical":[70],"insights":[71],"into":[72],"optimizing":[73],"workloads":[74,142],"fully":[76],"utilize":[77],"rich":[79],"feature":[80],"sets":[81],"modern":[83],"architectures.":[85],"This":[86],"work":[87],"enables":[88],"application":[89],"developers":[90],"make":[92],"informed":[93],"decisions":[95],"guides":[97],"future":[98],"design":[100],"directions.":[101],"study":[103],"GPUs":[105],"compare":[107],"them":[108],"H200":[111],"generation":[112],"with":[113,158],"respect":[114],"subsystem,":[118],"core":[120,147],"pipeline,":[121],"floating-point":[123],"precisions":[124],"(FP32,":[125],"FP16,":[126],"FP8,":[127],"FP6,":[128],"FP4).":[129],"Our":[130],"evaluation":[132],"dense":[134],"sparse":[136],"GEMM,":[137],"transformer":[138],"inference,":[139],"training":[141,156],"shows":[143],"B200":[145],"enhancements":[148],"achieve":[149],"1.85x":[150],"ResNet-50":[151],"1.55x":[153],"GPT-1.3B":[154],"mixed-precision":[155],"throughput,":[157],"32":[159],"percent":[160],"better":[161],"energy":[162],"efficiency":[163],"than":[164],"H200.":[165]},"counts_by_year":[],"updated_date":"2026-04-10T15:06:20.359241","created_date":"2025-12-04T00:00:00"}
