{"id":"https://openalex.org/W2133334800","doi":"https://doi.org/10.1145/2304576.2304626","title":"An optimized large-scale hybrid DGEMM design for CPUs and ATI GPUs","display_name":"An optimized large-scale hybrid DGEMM design for CPUs and ATI GPUs","publication_year":2012,"publication_date":"2012-06-25","ids":{"openalex":"https://openalex.org/W2133334800","doi":"https://doi.org/10.1145/2304576.2304626","mag":"2133334800"},"language":"en","primary_location":{"id":"doi:10.1145/2304576.2304626","is_oa":false,"landing_page_url":"https://doi.org/10.1145/2304576.2304626","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 26th ACM international conference on Supercomputing","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100454305","display_name":"Jiajia Li","orcid":"https://orcid.org/0000-0003-1270-4147"},"institutions":[{"id":"https://openalex.org/I4210090176","display_name":"Institute of Computing Technology","ror":"https://ror.org/0090r4d87","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210090176"]},{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Jiajia Li","raw_affiliation_strings":["Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210090176","https://openalex.org/I19820366"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100770663","display_name":"Xingjian Li","orcid":"https://orcid.org/0000-0001-8073-7552"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210090176","display_name":"Institute of Computing Technology","ror":"https://ror.org/0090r4d87","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210090176"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xingjian Li","raw_affiliation_strings":["Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210090176","https://openalex.org/I19820366"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5104072170","display_name":"Guangming Tan","orcid":null},"institutions":[{"id":"https://openalex.org/I4210090176","display_name":"Institute of Computing Technology","ror":"https://ror.org/0090r4d87","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210090176"]},{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Guangming Tan","raw_affiliation_strings":["Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210090176","https://openalex.org/I19820366"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101645579","display_name":"Mingyu Chen","orcid":"https://orcid.org/0000-0003-4469-1037"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210090176","display_name":"Institute of Computing Technology","ror":"https://ror.org/0090r4d87","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210090176"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Mingyu Chen","raw_affiliation_strings":["Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210090176","https://openalex.org/I19820366"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101738014","display_name":"Ninghui Sun","orcid":"https://orcid.org/0000-0002-4179-2660"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210090176","display_name":"Institute of Computing Technology","ror":"https://ror.org/0090r4d87","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210090176"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ninghui Sun","raw_affiliation_strings":["Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210090176","https://openalex.org/I19820366"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5100454305"],"corresponding_institution_ids":["https://openalex.org/I19820366","https://openalex.org/I4210090176"],"apc_list":null,"apc_paid":null,"fwci":0.8783,"has_fulltext":false,"cited_by_count":14,"citation_normalized_percentile":{"value":0.75262301,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"377","last_page":"386"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9988999962806702,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.992900013923645,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8556625247001648},{"id":"https://openalex.org/keywords/flops","display_name":"FLOPS","score":0.7750974893569946},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.7554018497467041},{"id":"https://openalex.org/keywords/x86","display_name":"x86","score":0.7150580286979675},{"id":"https://openalex.org/keywords/pci-express","display_name":"PCI Express","score":0.5624741315841675},{"id":"https://openalex.org/keywords/double-precision-floating-point-format","display_name":"Double-precision floating-point format","score":0.5211812853813171},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.4948257803916931},{"id":"https://openalex.org/keywords/software","display_name":"Software","score":0.49343886971473694},{"id":"https://openalex.org/keywords/cuda","display_name":"CUDA","score":0.41453680396080017},{"id":"https://openalex.org/keywords/embedded-system","display_name":"Embedded system","score":0.3710990250110626},{"id":"https://openalex.org/keywords/floating-point","display_name":"Floating point","score":0.3577020466327667},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.27789437770843506},{"id":"https://openalex.org/keywords/field-programmable-gate-array","display_name":"Field-programmable gate array","score":0.26364219188690186}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8556625247001648},{"id":"https://openalex.org/C3826847","wikidata":"https://www.wikidata.org/wiki/Q188768","display_name":"FLOPS","level":2,"score":0.7750974893569946},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.7554018497467041},{"id":"https://openalex.org/C170723468","wikidata":"https://www.wikidata.org/wiki/Q182933","display_name":"x86","level":3,"score":0.7150580286979675},{"id":"https://openalex.org/C64270927","wikidata":"https://www.wikidata.org/wiki/Q206924","display_name":"PCI Express","level":3,"score":0.5624741315841675},{"id":"https://openalex.org/C35912277","wikidata":"https://www.wikidata.org/wiki/Q1243369","display_name":"Double-precision floating-point format","level":3,"score":0.5211812853813171},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.4948257803916931},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.49343886971473694},{"id":"https://openalex.org/C2778119891","wikidata":"https://www.wikidata.org/wiki/Q477690","display_name":"CUDA","level":2,"score":0.41453680396080017},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.3710990250110626},{"id":"https://openalex.org/C84211073","wikidata":"https://www.wikidata.org/wiki/Q117879","display_name":"Floating point","level":2,"score":0.3577020466327667},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.27789437770843506},{"id":"https://openalex.org/C42935608","wikidata":"https://www.wikidata.org/wiki/Q190411","display_name":"Field-programmable gate array","level":2,"score":0.26364219188690186},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1145/2304576.2304626","is_oa":false,"landing_page_url":"https://doi.org/10.1145/2304576.2304626","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 26th ACM international conference on Supercomputing","raw_type":"proceedings-article"},{"id":"pmh:oai:CiteSeerX.psu:10.1.1.669.3251","is_oa":false,"landing_page_url":"http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.669.3251","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"http://asg.ict.ac.cn/dgemm/ics195_li.pdf?origin%3Dpublication_detail","raw_type":"text"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":18,"referenced_works":["https://openalex.org/W1863336885","https://openalex.org/W1964477602","https://openalex.org/W1992851788","https://openalex.org/W2002257715","https://openalex.org/W2015935981","https://openalex.org/W2032039029","https://openalex.org/W2063186542","https://openalex.org/W2071148743","https://openalex.org/W2073061372","https://openalex.org/W2099021415","https://openalex.org/W2105524676","https://openalex.org/W2110195531","https://openalex.org/W2118936983","https://openalex.org/W2142421493","https://openalex.org/W2148542244","https://openalex.org/W2153492376","https://openalex.org/W2171473263","https://openalex.org/W4235762625"],"related_works":["https://openalex.org/W3215589575","https://openalex.org/W3150959508","https://openalex.org/W2336476964","https://openalex.org/W4297795876","https://openalex.org/W1571090276","https://openalex.org/W2773283032","https://openalex.org/W2012407419","https://openalex.org/W2239119680","https://openalex.org/W1861262881","https://openalex.org/W2156524298"],"abstract_inverted_index":{"In":[0,37],"heterogeneous":[1,56],"systems":[2],"that":[3,58,116,177],"include":[4],"CPUs":[5,109],"and":[6,72,99,110,138,151,173,184],"GPUs,":[7],"the":[8,20,32,46,65,74,136,146,155,162,174,181,186],"data":[9,78],"transfers":[10,79],"between":[11],"these":[12],"components":[13],"play":[14],"a":[15,27,55,69,81,94,104],"critical":[16],"role":[17],"in":[18,87,93],"determining":[19],"performance":[21,102,156],"of":[22,34,76,157,164],"applications.":[23],"Software":[24],"pipelining":[25],"is":[26],"common":[28],"approach":[29,63,92],"to":[30,68,80,171],"mitigate":[31],"overheads":[33],"those":[35],"transfers.":[36],"this":[38],"paper":[39],"we":[40],"investigate":[41],"advanced":[42],"software-pipelining":[43],"optimizations":[44],"for":[45],"double-precision":[47],"general":[48],"matrix":[49],"multiplication":[50],"(DGEMM)":[51],"algorithm":[52],"running":[53],"on":[54,103,122,148,180,185],"system":[57],"includes":[59],"ATI":[60,112],"GPUs.":[61],"Our":[62,124],"decomposes":[64],"DGEMM":[66,98,160],"workload":[67,147],"finer":[70],"detail":[71],"hides":[73],"latency":[75],"CPU-GPU":[77],"higher":[82],"degree":[83],"than":[84],"previous":[85],"approaches":[86],"literature.":[88],"We":[89,153],"implement":[90],"our":[91,158],"five-stage":[95],"software":[96],"pipelined":[97],"analyze":[100,154],"its":[101],"platform":[105],"including":[106],"x86":[107],"multi-core":[108],"an":[111],"Radeon\u2122":[113],"HD5970":[114],"GPU":[115,120,165],"has":[117],"two":[118],"Cypress":[119],"chips":[121,166],"board.":[123],"implementation":[125],"delivers":[126],"758":[127],"GFLOPS":[128,140],"(82%":[129],"floating-point":[130],"efficiency)":[131,142],"when":[132,143],"it":[133,144],"uses":[134],"only":[135],"GPU,":[137],"844":[139],"(80%":[141],"distributes":[145],"both":[149],"CPU":[150],"GPU.":[152],"optimized":[159],"as":[161],"number":[163],"employed":[167],"grows":[168],"from":[169],"one":[170],"two,":[172],"results":[175],"show":[176],"resource":[178],"contention":[179],"PCIe":[182],"bus":[183],"host":[187],"memory":[188],"are":[189],"limiting":[190],"factors.":[191]},"counts_by_year":[{"year":2025,"cited_by_count":2},{"year":2022,"cited_by_count":1},{"year":2021,"cited_by_count":1},{"year":2020,"cited_by_count":2},{"year":2019,"cited_by_count":3},{"year":2017,"cited_by_count":2},{"year":2015,"cited_by_count":1},{"year":2014,"cited_by_count":1},{"year":2013,"cited_by_count":1}],"updated_date":"2026-04-05T17:49:38.594831","created_date":"2025-10-10T00:00:00"}
