{"id":"https://openalex.org/W2076028817","doi":"https://doi.org/10.1145/2063384.2063458","title":"Extracting ultra-scale Lattice Boltzmann performance via hierarchical and distributed auto-tuning","display_name":"Extracting ultra-scale Lattice Boltzmann performance via hierarchical and distributed auto-tuning","publication_year":2011,"publication_date":"2011-11-08","ids":{"openalex":"https://openalex.org/W2076028817","doi":"https://doi.org/10.1145/2063384.2063458","mag":"2076028817"},"language":"en","primary_location":{"id":"doi:10.1145/2063384.2063458","is_oa":false,"landing_page_url":"https://doi.org/10.1145/2063384.2063458","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of 2011 International Conference for High Performance Computing, Networking, Storage and Analysis","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://escholarship.org/content/qt4gf5b5c0/qt4gf5b5c0.pdf?t=ooylf2","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5102746800","display_name":"Samuel Williams","orcid":"https://orcid.org/0000-0002-8327-5717"},"institutions":[{"id":"https://openalex.org/I148283060","display_name":"Lawrence Berkeley National Laboratory","ror":"https://ror.org/02jbv0t02","country_code":"US","type":"facility","lineage":["https://openalex.org/I1330989302","https://openalex.org/I148283060","https://openalex.org/I39565521"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Samuel Williams","raw_affiliation_strings":["Lawrence Berkeley National Laboratory","Computational Research Division, Lawrence Berkeley National Laboratory"],"affiliations":[{"raw_affiliation_string":"Lawrence Berkeley National Laboratory","institution_ids":["https://openalex.org/I148283060"]},{"raw_affiliation_string":"Computational Research Division, Lawrence Berkeley National Laboratory","institution_ids":["https://openalex.org/I148283060"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113842004","display_name":"Leonid Oliker","orcid":"https://orcid.org/0000-0002-7923-2896"},"institutions":[{"id":"https://openalex.org/I148283060","display_name":"Lawrence Berkeley National Laboratory","ror":"https://ror.org/02jbv0t02","country_code":"US","type":"facility","lineage":["https://openalex.org/I1330989302","https://openalex.org/I148283060","https://openalex.org/I39565521"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Leonid Oliker","raw_affiliation_strings":["Lawrence Berkeley National Laboratory","Computational Research Division, Lawrence Berkeley National Laboratory"],"affiliations":[{"raw_affiliation_string":"Lawrence Berkeley National Laboratory","institution_ids":["https://openalex.org/I148283060"]},{"raw_affiliation_string":"Computational Research Division, Lawrence Berkeley National Laboratory","institution_ids":["https://openalex.org/I148283060"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101932170","display_name":"Jonathan Carter","orcid":"https://orcid.org/0000-0001-9006-7636"},"institutions":[{"id":"https://openalex.org/I148283060","display_name":"Lawrence Berkeley National Laboratory","ror":"https://ror.org/02jbv0t02","country_code":"US","type":"facility","lineage":["https://openalex.org/I1330989302","https://openalex.org/I148283060","https://openalex.org/I39565521"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jonathan Carter","raw_affiliation_strings":["Lawrence Berkeley National Laboratory","Computational Research Division, Lawrence Berkeley National Laboratory"],"affiliations":[{"raw_affiliation_string":"Lawrence Berkeley National Laboratory","institution_ids":["https://openalex.org/I148283060"]},{"raw_affiliation_string":"Computational Research Division, Lawrence Berkeley National Laboratory","institution_ids":["https://openalex.org/I148283060"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5010873686","display_name":"John Shalf","orcid":"https://orcid.org/0000-0002-0608-3690"},"institutions":[{"id":"https://openalex.org/I148283060","display_name":"Lawrence Berkeley National Laboratory","ror":"https://ror.org/02jbv0t02","country_code":"US","type":"facility","lineage":["https://openalex.org/I1330989302","https://openalex.org/I148283060","https://openalex.org/I39565521"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"John Shalf","raw_affiliation_strings":["Lawrence Berkeley National Laboratory","Computational Research Division, Lawrence Berkeley National Laboratory"],"affiliations":[{"raw_affiliation_string":"Lawrence Berkeley National Laboratory","institution_ids":["https://openalex.org/I148283060"]},{"raw_affiliation_string":"Computational Research Division, Lawrence Berkeley National Laboratory","institution_ids":["https://openalex.org/I148283060"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5102746800"],"corresponding_institution_ids":["https://openalex.org/I148283060"],"apc_list":null,"apc_paid":null,"fwci":5.2816,"has_fulltext":true,"cited_by_count":38,"citation_normalized_percentile":{"value":0.95910781,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":89,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"12"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11751","display_name":"Lattice Boltzmann Simulation Studies","score":0.9991000294685364,"subfield":{"id":"https://openalex.org/subfields/2206","display_name":"Computational Mechanics"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11751","display_name":"Lattice Boltzmann Simulation Studies","score":0.9991000294685364,"subfield":{"id":"https://openalex.org/subfields/2206","display_name":"Computational Mechanics"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9886000156402588,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9883999824523926,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8014321327209473},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.7043248414993286},{"id":"https://openalex.org/keywords/supercomputer","display_name":"Supercomputer","score":0.6559096574783325},{"id":"https://openalex.org/keywords/posix-threads","display_name":"POSIX Threads","score":0.5934551358222961},{"id":"https://openalex.org/keywords/multi-core-processor","display_name":"Multi-core processor","score":0.4972865879535675},{"id":"https://openalex.org/keywords/computational-science","display_name":"Computational science","score":0.48233529925346375},{"id":"https://openalex.org/keywords/mimd","display_name":"MIMD","score":0.4776543378829956},{"id":"https://openalex.org/keywords/petascale-computing","display_name":"Petascale computing","score":0.4753408432006836},{"id":"https://openalex.org/keywords/thread","display_name":"Thread (computing)","score":0.4099574089050293}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8014321327209473},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.7043248414993286},{"id":"https://openalex.org/C83283714","wikidata":"https://www.wikidata.org/wiki/Q121117","display_name":"Supercomputer","level":2,"score":0.6559096574783325},{"id":"https://openalex.org/C41138395","wikidata":"https://www.wikidata.org/wiki/Q928112","display_name":"POSIX Threads","level":3,"score":0.5934551358222961},{"id":"https://openalex.org/C78766204","wikidata":"https://www.wikidata.org/wiki/Q555032","display_name":"Multi-core processor","level":2,"score":0.4972865879535675},{"id":"https://openalex.org/C459310","wikidata":"https://www.wikidata.org/wiki/Q117801","display_name":"Computational science","level":1,"score":0.48233529925346375},{"id":"https://openalex.org/C21032095","wikidata":"https://www.wikidata.org/wiki/Q1149237","display_name":"MIMD","level":2,"score":0.4776543378829956},{"id":"https://openalex.org/C185410017","wikidata":"https://www.wikidata.org/wiki/Q7171778","display_name":"Petascale computing","level":3,"score":0.4753408432006836},{"id":"https://openalex.org/C138101251","wikidata":"https://www.wikidata.org/wiki/Q213092","display_name":"Thread (computing)","level":2,"score":0.4099574089050293},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1145/2063384.2063458","is_oa":false,"landing_page_url":"https://doi.org/10.1145/2063384.2063458","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of 2011 International Conference for High Performance Computing, Networking, Storage and Analysis","raw_type":"proceedings-article"},{"id":"pmh:oai:escholarship.org:ark:/13030/qt4gf5b5c0","is_oa":true,"landing_page_url":"https://escholarship.org/uc/item/4gf5b5c0","pdf_url":"https://escholarship.org/content/qt4gf5b5c0/qt4gf5b5c0.pdf?t=ooylf2","source":{"id":"https://openalex.org/S4306400115","display_name":"eScholarship (California Digital Library)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I2801248553","host_organization_name":"California Digital Library","host_organization_lineage":["https://openalex.org/I2801248553"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"article"},{"id":"pmh:qt4gf5b5c0","is_oa":false,"landing_page_url":"http://www.escholarship.org/uc/item/4gf5b5c0","pdf_url":null,"source":{"id":"https://openalex.org/S4306400115","display_name":"eScholarship (California Digital Library)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I2801248553","host_organization_name":"California Digital Library","host_organization_lineage":["https://openalex.org/I2801248553"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Williams, S; Oliker, L; Carter, J; &amp; Shalf, J. (2011). Extracting ultra-scale lattice boltzmann performance via hierarchical and distributed auto-tuning. Proceedings of 2011 SC - International Conference for High Performance Computing, Networking, Storage and Analysis. doi: 10.1145/2063384.2063458. Lawrence Berkeley National Laboratory: Lawrence Berkeley National Laboratory. Retrieved from: http://www.escholarship.org/uc/item/4gf5b5c0","raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:escholarship.org:ark:/13030/qt4gf5b5c0","is_oa":true,"landing_page_url":"https://escholarship.org/uc/item/4gf5b5c0","pdf_url":"https://escholarship.org/content/qt4gf5b5c0/qt4gf5b5c0.pdf?t=ooylf2","source":{"id":"https://openalex.org/S4306400115","display_name":"eScholarship (California Digital Library)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I2801248553","host_organization_name":"California Digital Library","host_organization_lineage":["https://openalex.org/I2801248553"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/7","score":0.8899999856948853,"display_name":"Affordable and clean energy"}],"awards":[{"id":"https://openalex.org/G3562261287","display_name":null,"funder_award_id":"DE-AC02-05-CH11231","funder_id":"https://openalex.org/F4320337506","funder_display_name":"Advanced Scientific Computing Research"}],"funders":[{"id":"https://openalex.org/F4320337506","display_name":"Advanced Scientific Computing Research","ror":"https://ror.org/0012c7r22"}],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W2076028817.pdf","grobid_xml":"https://content.openalex.org/works/W2076028817.grobid-xml"},"referenced_works_count":36,"referenced_works":["https://openalex.org/W47921549","https://openalex.org/W151246516","https://openalex.org/W1547391618","https://openalex.org/W1550237464","https://openalex.org/W1964031104","https://openalex.org/W1967316779","https://openalex.org/W1971352798","https://openalex.org/W1971876223","https://openalex.org/W1980037648","https://openalex.org/W1990832096","https://openalex.org/W1993879355","https://openalex.org/W1997147891","https://openalex.org/W2002555321","https://openalex.org/W2007239910","https://openalex.org/W2039378765","https://openalex.org/W2055253125","https://openalex.org/W2059710204","https://openalex.org/W2066060292","https://openalex.org/W2069784446","https://openalex.org/W2096070062","https://openalex.org/W2099625934","https://openalex.org/W2103877122","https://openalex.org/W2109859355","https://openalex.org/W2128746953","https://openalex.org/W2135682468","https://openalex.org/W2139205226","https://openalex.org/W2147127704","https://openalex.org/W2150319905","https://openalex.org/W2151359168","https://openalex.org/W2154786353","https://openalex.org/W2165439482","https://openalex.org/W2622151058","https://openalex.org/W3100299523","https://openalex.org/W4238665603","https://openalex.org/W6643101501","https://openalex.org/W6689920905"],"related_works":["https://openalex.org/W2021702679","https://openalex.org/W3038449658","https://openalex.org/W1582746211","https://openalex.org/W2266027327","https://openalex.org/W2278366184","https://openalex.org/W4206324154","https://openalex.org/W1978931242","https://openalex.org/W2154629969","https://openalex.org/W2745058934","https://openalex.org/W2168240094"],"abstract_inverted_index":{"We":[0],"are":[1],"witnessing":[2],"a":[3,29,37,48,81,120,159,164],"rapid":[4],"evolution":[5],"of":[6,39,75,83,114,122,166],"HPC":[7,171],"node":[8],"architectures":[9],"and":[10,15,73,94,100,134,147],"on-chip":[11],"parallelism":[12],"as":[13,96,98],"power":[14],"cooling":[16],"constraints":[17],"limit":[18],"increases":[19],"in":[20,59],"microprocessor":[21],"clock":[22],"speeds.":[23],"In":[24],"this":[25],"work,":[26],"we":[27,62,79,110],"demonstrate":[28],"hierarchical":[30,116],"approach":[31,144],"towards":[32],"effectively":[33],"extracting":[34],"performance":[35,146],"for":[36,163],"variety":[38,82,121,165],"emerging":[40],"multicore-based":[41],"supercomputing":[42],"platforms.":[43,137],"Our":[44],"examined":[45],"application":[46],"is":[47],"structured":[49],"grid-based":[50],"Lattice":[51],"Boltzmann":[52],"computation":[53],"that":[54,140],"simulates":[55],"homogeneous":[56],"isotropic":[57],"turbulence":[58],"magnetohydrodynamics.":[60],"First,":[61],"examine":[63],"sophisticated":[64],"sequential":[65],"auto-tuning":[66],"techniques":[67,118],"including":[68,87],"loop":[69],"transformations,":[70],"virtual":[71],"vectorization,":[72],"use":[74],"ISA-specific":[76],"intrinsics.":[77],"Next,":[78],"present":[80],"parallel":[84],"optimization":[85,161],"approaches":[86],"programming":[88],"model":[89],"exploration":[90],"(flat":[91],"MPI,":[92],"MPI/OpenMP,":[93],"MPI/Pthreads),":[95],"well":[97],"data":[99],"thread":[101],"decomposition":[102],"strategies":[103],"designed":[104],"to":[105,152],"mitigate":[106],"communication":[107],"bottlenecks.":[108],"Finally,":[109],"evaluate":[111],"the":[112],"impact":[113],"our":[115,141],"tuning":[117,143],"using":[119,154],"problem":[123],"sizes":[124],"via":[125],"large-scale":[126],"simulations":[127],"on":[128,169],"state-of-the-art":[129],"Cray":[130,132],"XT4,":[131],"XE6,":[133],"IBM":[135],"BlueGene/P":[136],"Results":[138],"show":[139],"unique":[142],"improves":[145],"energy":[148],"requirements":[149],"by":[150],"up":[151],"3.4x":[153],"49,152":[155],"cores,":[156],"while":[157],"providing":[158],"portable":[160],"methodology":[162],"numerical":[167],"methods":[168],"forthcoming":[170],"systems.":[172]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":2},{"year":2020,"cited_by_count":1},{"year":2019,"cited_by_count":4},{"year":2018,"cited_by_count":2},{"year":2017,"cited_by_count":3},{"year":2016,"cited_by_count":2},{"year":2015,"cited_by_count":7},{"year":2014,"cited_by_count":4},{"year":2013,"cited_by_count":3},{"year":2012,"cited_by_count":8}],"updated_date":"2026-04-05T17:49:38.594831","created_date":"2025-10-10T00:00:00"}
