{"id":"https://openalex.org/W2924040443","doi":"https://doi.org/10.1145/3330345.3330355","title":"TSM2","display_name":"TSM2","publication_year":2019,"publication_date":"2019-06-18","ids":{"openalex":"https://openalex.org/W2924040443","doi":"https://doi.org/10.1145/3330345.3330355","mag":"2924040443"},"language":"en","primary_location":{"id":"doi:10.1145/3330345.3330355","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3330345.3330355","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3330345.3330355","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM International Conference on Supercomputing","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3330345.3330355","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5063649910","display_name":"Jieyang Chen","orcid":"https://orcid.org/0000-0002-1905-9171"},"institutions":[{"id":"https://openalex.org/I2803209242","display_name":"University of California System","ror":"https://ror.org/00pjdza24","country_code":"US","type":"education","lineage":["https://openalex.org/I2803209242"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Jieyang Chen","raw_affiliation_strings":["University of California"],"affiliations":[{"raw_affiliation_string":"University of California","institution_ids":["https://openalex.org/I2803209242"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5021540429","display_name":"Nan Xiong","orcid":null},"institutions":[{"id":"https://openalex.org/I2803209242","display_name":"University of California System","ror":"https://ror.org/00pjdza24","country_code":"US","type":"education","lineage":["https://openalex.org/I2803209242"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Nan Xiong","raw_affiliation_strings":["University of California"],"affiliations":[{"raw_affiliation_string":"University of California","institution_ids":["https://openalex.org/I2803209242"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103064768","display_name":"Xin Liang","orcid":"https://orcid.org/0000-0002-0630-1600"},"institutions":[{"id":"https://openalex.org/I2803209242","display_name":"University of California System","ror":"https://ror.org/00pjdza24","country_code":"US","type":"education","lineage":["https://openalex.org/I2803209242"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Xin Liang","raw_affiliation_strings":["University of California"],"affiliations":[{"raw_affiliation_string":"University of California","institution_ids":["https://openalex.org/I2803209242"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5063703614","display_name":"Dingwen Tao","orcid":"https://orcid.org/0000-0001-5422-4497"},"institutions":[{"id":"https://openalex.org/I17301866","display_name":"University of Alabama","ror":"https://ror.org/03xrrjk67","country_code":"US","type":"education","lineage":["https://openalex.org/I17301866"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Dingwen Tao","raw_affiliation_strings":["The University of Alabama"],"affiliations":[{"raw_affiliation_string":"The University of Alabama","institution_ids":["https://openalex.org/I17301866"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103179545","display_name":"Sihuan Li","orcid":"https://orcid.org/0000-0001-7315-7955"},"institutions":[{"id":"https://openalex.org/I2803209242","display_name":"University of California System","ror":"https://ror.org/00pjdza24","country_code":"US","type":"education","lineage":["https://openalex.org/I2803209242"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Sihuan Li","raw_affiliation_strings":["University of California"],"affiliations":[{"raw_affiliation_string":"University of California","institution_ids":["https://openalex.org/I2803209242"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5051694932","display_name":"Kaiming Ouyang","orcid":"https://orcid.org/0000-0002-4775-1835"},"institutions":[{"id":"https://openalex.org/I2803209242","display_name":"University of California System","ror":"https://ror.org/00pjdza24","country_code":"US","type":"education","lineage":["https://openalex.org/I2803209242"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Kaiming Ouyang","raw_affiliation_strings":["University of California"],"affiliations":[{"raw_affiliation_string":"University of California","institution_ids":["https://openalex.org/I2803209242"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100643410","display_name":"Kai Zhao","orcid":"https://orcid.org/0000-0001-5328-3962"},"institutions":[{"id":"https://openalex.org/I2803209242","display_name":"University of California System","ror":"https://ror.org/00pjdza24","country_code":"US","type":"education","lineage":["https://openalex.org/I2803209242"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Kai Zhao","raw_affiliation_strings":["University of California"],"affiliations":[{"raw_affiliation_string":"University of California","institution_ids":["https://openalex.org/I2803209242"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5056569157","display_name":"Nathan DeBardeleben","orcid":"https://orcid.org/0000-0002-5593-9205"},"institutions":[{"id":"https://openalex.org/I1343871089","display_name":"Los Alamos National Laboratory","ror":"https://ror.org/01e41cf67","country_code":"US","type":"facility","lineage":["https://openalex.org/I1330989302","https://openalex.org/I1343871089","https://openalex.org/I198811213","https://openalex.org/I4210120050"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Nathan DeBardeleben","raw_affiliation_strings":["Los Alamos National Laboratory"],"affiliations":[{"raw_affiliation_string":"Los Alamos National Laboratory","institution_ids":["https://openalex.org/I1343871089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102787759","display_name":"Qiang Guan","orcid":"https://orcid.org/0000-0002-3804-8945"},"institutions":[{"id":"https://openalex.org/I149910238","display_name":"Kent State University","ror":"https://ror.org/049pfb863","country_code":"US","type":"education","lineage":["https://openalex.org/I149910238"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Qiang Guan","raw_affiliation_strings":["Kent State University"],"affiliations":[{"raw_affiliation_string":"Kent State University","institution_ids":["https://openalex.org/I149910238"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5061737717","display_name":"Zizhong Chen","orcid":"https://orcid.org/0000-0003-2578-4940"},"institutions":[{"id":"https://openalex.org/I2803209242","display_name":"University of California System","ror":"https://ror.org/00pjdza24","country_code":"US","type":"education","lineage":["https://openalex.org/I2803209242"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Zizhong Chen","raw_affiliation_strings":["University of California"],"affiliations":[{"raw_affiliation_string":"University of California","institution_ids":["https://openalex.org/I2803209242"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":10,"corresponding_author_ids":["https://openalex.org/A5063649910"],"corresponding_institution_ids":["https://openalex.org/I2803209242"],"apc_list":null,"apc_paid":null,"fwci":5.2969,"has_fulltext":false,"cited_by_count":39,"citation_normalized_percentile":{"value":0.96334464,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":94,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"106","last_page":"116"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10829","display_name":"Interconnection Networks and Systems","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.9959999918937683,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7911683917045593},{"id":"https://openalex.org/keywords/linear-algebra","display_name":"Linear algebra","score":0.7704960107803345},{"id":"https://openalex.org/keywords/pascal","display_name":"Pascal (unit)","score":0.674169659614563},{"id":"https://openalex.org/keywords/matrix-multiplication","display_name":"Matrix multiplication","score":0.6208407282829285},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.5692800879478455},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.5662822723388672},{"id":"https://openalex.org/keywords/sparse-matrix","display_name":"Sparse matrix","score":0.5170577168464661},{"id":"https://openalex.org/keywords/memory-bandwidth","display_name":"Memory bandwidth","score":0.4335673451423645},{"id":"https://openalex.org/keywords/bandwidth","display_name":"Bandwidth (computing)","score":0.4290422201156616},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.28576186299324036},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.13452467322349548},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.0937446653842926}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7911683917045593},{"id":"https://openalex.org/C139352143","wikidata":"https://www.wikidata.org/wiki/Q82571","display_name":"Linear algebra","level":2,"score":0.7704960107803345},{"id":"https://openalex.org/C75608658","wikidata":"https://www.wikidata.org/wiki/Q44395","display_name":"Pascal (unit)","level":2,"score":0.674169659614563},{"id":"https://openalex.org/C17349429","wikidata":"https://www.wikidata.org/wiki/Q1049914","display_name":"Matrix multiplication","level":3,"score":0.6208407282829285},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.5692800879478455},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.5662822723388672},{"id":"https://openalex.org/C56372850","wikidata":"https://www.wikidata.org/wiki/Q1050404","display_name":"Sparse matrix","level":3,"score":0.5170577168464661},{"id":"https://openalex.org/C188045654","wikidata":"https://www.wikidata.org/wiki/Q17148339","display_name":"Memory bandwidth","level":2,"score":0.4335673451423645},{"id":"https://openalex.org/C2776257435","wikidata":"https://www.wikidata.org/wiki/Q1576430","display_name":"Bandwidth (computing)","level":2,"score":0.4290422201156616},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.28576186299324036},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.13452467322349548},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0937446653842926},{"id":"https://openalex.org/C2524010","wikidata":"https://www.wikidata.org/wiki/Q8087","display_name":"Geometry","level":1,"score":0.0},{"id":"https://openalex.org/C84114770","wikidata":"https://www.wikidata.org/wiki/Q46344","display_name":"Quantum","level":2,"score":0.0},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C163716315","wikidata":"https://www.wikidata.org/wiki/Q901177","display_name":"Gaussian","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3330345.3330355","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3330345.3330355","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3330345.3330355","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM International Conference on Supercomputing","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3330345.3330355","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3330345.3330355","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3330345.3330355","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM International Conference on Supercomputing","raw_type":"proceedings-article"},"sustainable_development_goals":[{"score":0.6299999952316284,"id":"https://metadata.un.org/sdg/7","display_name":"Affordable and clean energy"}],"awards":[],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"},{"id":"https://openalex.org/F4320309480","display_name":"Nvidia","ror":"https://ror.org/03jdj4y14"},{"id":"https://openalex.org/F4320335777","display_name":"National Key Research and Development Program of China","ror":null}],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W2924040443.pdf","grobid_xml":"https://content.openalex.org/works/W2924040443.grobid-xml"},"referenced_works_count":18,"referenced_works":["https://openalex.org/W36826159","https://openalex.org/W1839773802","https://openalex.org/W1964588245","https://openalex.org/W1986007546","https://openalex.org/W2061887253","https://openalex.org/W2083613288","https://openalex.org/W2110195531","https://openalex.org/W2138692126","https://openalex.org/W2162322364","https://openalex.org/W2169150754","https://openalex.org/W2208150243","https://openalex.org/W2412349256","https://openalex.org/W2418331349","https://openalex.org/W2515526980","https://openalex.org/W2534888058","https://openalex.org/W2580243656","https://openalex.org/W2767694495","https://openalex.org/W4233447799"],"related_works":["https://openalex.org/W2293771254","https://openalex.org/W1835670156","https://openalex.org/W4221142455","https://openalex.org/W4381050447","https://openalex.org/W3121828480","https://openalex.org/W3147497457","https://openalex.org/W3121314575","https://openalex.org/W2914631005","https://openalex.org/W2032786851","https://openalex.org/W3004823601"],"abstract_inverted_index":{"Linear":[0],"algebra":[1,22,85],"operations":[2,23,148],"have":[3,16],"been":[4,17],"widely":[5],"used":[6],"in":[7,149],"big":[8],"data":[9],"analytics":[10],"and":[11,55,96,106,128,151,158,163],"scientific":[12],"computations.":[13],"Many":[14],"works":[15,31],"done":[18],"on":[19,24,34,76,82,98],"optimizing":[20,83],"linear":[21,84],"GPUs":[25,77],"with":[26,87,156],"regular-shaped":[27,89],"input.":[28,90],"However,":[29],"few":[30],"are":[32],"focusing":[33],"fully":[35,50],"utilizing":[36,51],"GPU":[37,102],"resources":[38],"when":[39],"the":[40,52,93,114,139,145],"input":[41],"is":[42],"not":[43],"regular-shaped.":[44],"Current":[45],"optimizations":[46],"lack":[47],"of":[48],"considering":[49],"memory":[53,121],"bandwidth":[54,122],"computing":[56,130],"power,":[57],"therefore":[58],"they":[59],"could":[60],"only":[61],"achieve":[62,159],"sub-optimal":[63],"performance.":[64],"In":[65],"this":[66],"paper,":[67],"we":[68],"propose":[69],"a":[70],"performant":[71],"tall-and-skinny":[72],"matrix-matrix":[73],"multiplication":[74],"algorithm":[75,95],"-":[78,118,126,135],"TSM2.":[79],"It":[80],"focuses":[81],"operation":[86],"none":[88],"We":[91,143],"implement":[92],"proposed":[94],"test":[97],"three":[99],"different":[100],"Nvidia":[101],"micro-architectures:":[103],"Kepler,":[104],"Maxwell,":[105],"Pascal.":[107],"Experiments":[108],"show":[109],"that":[110],"our":[111],"TSM2":[112,157],"speedups":[113],"computation":[115],"by":[116,124,133],"1.1x":[117],"3x,":[119],"improves":[120,129],"utilization":[123,132],"8%":[125],"47.6%,":[127],"power":[131],"7%":[134],"37.3%":[136],"comparing":[137],"to":[138,161],"current":[140],"state-of-the-art":[141],"works.":[142],"replace":[144],"original":[146],"matrix":[147],"K-means":[150],"Algorithm-Bases":[152],"Fault":[153],"Tolerance":[154],"(ABFT)":[155],"up":[160],"1.89x":[162],"1.90x":[164],"speed":[165],"up.":[166]},"counts_by_year":[{"year":2025,"cited_by_count":5},{"year":2024,"cited_by_count":5},{"year":2023,"cited_by_count":7},{"year":2022,"cited_by_count":4},{"year":2021,"cited_by_count":9},{"year":2020,"cited_by_count":7},{"year":2019,"cited_by_count":2}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2019-04-01T00:00:00"}
