{"id":"https://openalex.org/W4205834599","doi":"https://doi.org/10.1109/tpds.2021.3138856","title":"Accelerating Large Sparse Neural Network Inference using GPU Task Graph Parallelism","display_name":"Accelerating Large Sparse Neural Network Inference using GPU Task Graph Parallelism","publication_year":2021,"publication_date":"2021-01-01","ids":{"openalex":"https://openalex.org/W4205834599","doi":"https://doi.org/10.1109/tpds.2021.3138856"},"language":"en","primary_location":{"id":"doi:10.1109/tpds.2021.3138856","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpds.2021.3138856","pdf_url":null,"source":{"id":"https://openalex.org/S97130795","display_name":"IEEE Transactions on Parallel and Distributed Systems","issn_l":"1045-9219","issn":["1045-9219","1558-2183","2161-9883"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Parallel and Distributed Systems","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5048830943","display_name":"Dian-Lun Lin","orcid":"https://orcid.org/0000-0003-3075-7437"},"institutions":[{"id":"https://openalex.org/I4210122884","display_name":"Utah Department of Health","ror":"https://ror.org/034de1n65","country_code":"US","type":"government","lineage":["https://openalex.org/I4210122884"]},{"id":"https://openalex.org/I223532165","display_name":"University of Utah","ror":"https://ror.org/03r0ha626","country_code":"US","type":"education","lineage":["https://openalex.org/I223532165"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Dian-Lun Lin","raw_affiliation_strings":["Electrical and Computer Engineering, University of Utah Health, 14434 Salt Lake City, Utah, United States, (e-mail: dian-lun.lin@utah.edu)"],"affiliations":[{"raw_affiliation_string":"Electrical and Computer Engineering, University of Utah Health, 14434 Salt Lake City, Utah, United States, (e-mail: dian-lun.lin@utah.edu)","institution_ids":["https://openalex.org/I223532165","https://openalex.org/I4210122884"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5088685794","display_name":"Tsung\u2010Wei Huang","orcid":"https://orcid.org/0000-0001-9768-3378"},"institutions":[{"id":"https://openalex.org/I4210122884","display_name":"Utah Department of Health","ror":"https://ror.org/034de1n65","country_code":"US","type":"government","lineage":["https://openalex.org/I4210122884"]},{"id":"https://openalex.org/I223532165","display_name":"University of Utah","ror":"https://ror.org/03r0ha626","country_code":"US","type":"education","lineage":["https://openalex.org/I223532165"]},{"id":"https://openalex.org/I4210148268","display_name":"University of Utah Health Care","ror":"https://ror.org/047s7ex42","country_code":"US","type":"healthcare","lineage":["https://openalex.org/I4210148268"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Tsung-Wei Huang","raw_affiliation_strings":["ECE, University of Utah Health, 14434 Salt Lake City, Utah, United States, 84124 (e-mail: twh760812@gmail.com)"],"affiliations":[{"raw_affiliation_string":"ECE, University of Utah Health, 14434 Salt Lake City, Utah, United States, 84124 (e-mail: twh760812@gmail.com)","institution_ids":["https://openalex.org/I223532165","https://openalex.org/I4210148268","https://openalex.org/I4210122884"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5048830943"],"corresponding_institution_ids":["https://openalex.org/I223532165","https://openalex.org/I4210122884"],"apc_list":null,"apc_paid":null,"fwci":1.441,"has_fulltext":false,"cited_by_count":24,"citation_normalized_percentile":{"value":0.84403595,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"1"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9983999729156494,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9983999729156494,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10502","display_name":"Advanced Memory and Neural Computing","score":0.9969000220298767,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11273","display_name":"Advanced Graph Neural Networks","score":0.9969000220298767,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8708423376083374},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.6853983998298645},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.6286256313323975},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.6098303198814392},{"id":"https://openalex.org/keywords/data-parallelism","display_name":"Data parallelism","score":0.519864559173584},{"id":"https://openalex.org/keywords/cuda","display_name":"CUDA","score":0.5191603899002075},{"id":"https://openalex.org/keywords/throughput","display_name":"Throughput","score":0.425630658864975},{"id":"https://openalex.org/keywords/sparse-matrix","display_name":"Sparse matrix","score":0.41337308287620544},{"id":"https://openalex.org/keywords/computer-engineering","display_name":"Computer engineering","score":0.3206663131713867},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.2563881278038025},{"id":"https://openalex.org/keywords/parallelism","display_name":"Parallelism (grammar)","score":0.2537540793418884}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8708423376083374},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.6853983998298645},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.6286256313323975},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.6098303198814392},{"id":"https://openalex.org/C61483411","wikidata":"https://www.wikidata.org/wiki/Q3124522","display_name":"Data parallelism","level":3,"score":0.519864559173584},{"id":"https://openalex.org/C2778119891","wikidata":"https://www.wikidata.org/wiki/Q477690","display_name":"CUDA","level":2,"score":0.5191603899002075},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.425630658864975},{"id":"https://openalex.org/C56372850","wikidata":"https://www.wikidata.org/wiki/Q1050404","display_name":"Sparse matrix","level":3,"score":0.41337308287620544},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.3206663131713867},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.2563881278038025},{"id":"https://openalex.org/C2781172179","wikidata":"https://www.wikidata.org/wiki/Q853109","display_name":"Parallelism (grammar)","level":2,"score":0.2537540793418884},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C555944384","wikidata":"https://www.wikidata.org/wiki/Q249","display_name":"Wireless","level":2,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C163716315","wikidata":"https://www.wikidata.org/wiki/Q901177","display_name":"Gaussian","level":2,"score":0.0},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tpds.2021.3138856","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpds.2021.3138856","pdf_url":null,"source":{"id":"https://openalex.org/S97130795","display_name":"IEEE Transactions on Parallel and Distributed Systems","issn_l":"1045-9219","issn":["1045-9219","1558-2183","2161-9883"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Parallel and Distributed Systems","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G4622787617","display_name":null,"funder_award_id":"CCF-2126672","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":40,"referenced_works":["https://openalex.org/W1598866093","https://openalex.org/W2119144962","https://openalex.org/W2124653173","https://openalex.org/W2279098554","https://openalex.org/W2625457103","https://openalex.org/W2802166112","https://openalex.org/W2859678021","https://openalex.org/W2896457183","https://openalex.org/W2899244816","https://openalex.org/W2963416938","https://openalex.org/W2963959650","https://openalex.org/W2964537638","https://openalex.org/W2969388332","https://openalex.org/W2973134322","https://openalex.org/W2989899068","https://openalex.org/W2990514424","https://openalex.org/W2991040477","https://openalex.org/W3008522202","https://openalex.org/W3037847693","https://openalex.org/W3082444099","https://openalex.org/W3100809954","https://openalex.org/W3100839241","https://openalex.org/W3101812505","https://openalex.org/W3113443077","https://openalex.org/W3115410382","https://openalex.org/W3188917597","https://openalex.org/W4294149566","https://openalex.org/W4302343710","https://openalex.org/W6635810480","https://openalex.org/W6677580257","https://openalex.org/W6695314431","https://openalex.org/W6724186685","https://openalex.org/W6742080785","https://openalex.org/W6747766405","https://openalex.org/W6747917562","https://openalex.org/W6751238189","https://openalex.org/W6753584990","https://openalex.org/W6755207826","https://openalex.org/W6756718674","https://openalex.org/W6774619220"],"related_works":["https://openalex.org/W3062287","https://openalex.org/W2380390332","https://openalex.org/W2742145873","https://openalex.org/W4245975140","https://openalex.org/W2062253548","https://openalex.org/W4225414539","https://openalex.org/W4289522463","https://openalex.org/W2052993554","https://openalex.org/W2046125858","https://openalex.org/W1941712353"],"abstract_inverted_index":{"The":[0,63],"ever-increasing":[1],"size":[2],"of":[3,37,105,112,127,143,165,198,214],"modern":[4],"deep":[5],"neural":[6],"network":[7],"(DNN)":[8],"architectures":[9],"has":[10,70,188],"put":[11],"increasing":[12],"strain":[13],"on":[14,139],"the":[15,35,103,140,163,166,184,219],"hardware":[16],"needed":[17],"to":[18,59,73,108,124,158,162,204],"implement":[19],"them.":[20],"Sparsified":[21],"DNNs":[22,45],"can":[23,39,173],"greatly":[24],"reduce":[25],"memory":[26],"costs":[27],"and":[28,61,101,114,122,132,148],"increase":[29],"throughput":[30],"over":[31],"standard":[32],"DNNs,":[33],"if":[34],"loss":[36],"accuracy":[38],"be":[40],"adequately":[41],"controlled.":[42],"However,":[43],"sparse":[44,79,93],"present":[46],"unique":[47],"computational":[48],"challenges.":[49],"Efficient":[50],"model":[51,113,130],"or":[52],"data":[53,115,128],"parallelism":[54],"algorithms":[55],"are":[56],"extremely":[57],"hard":[58],"design":[60],"implement.":[62],"recent":[64],"effort":[65],"MIT/IEEE/Amazon":[66],"HPEC":[67,144,168,224],"Graph":[68],"Challenge":[69,147],"drawn":[71],"attention":[72],"high-performance":[74],"inference":[75,89,99,176],"methods":[76],"for":[77,91],"large":[78,92],"DNNs.":[80,94],"In":[81],"this":[82],"paper,":[83],"we":[84],"introduce":[85],"SNIG,":[86],"an":[87],"efficient":[88,110],"engine":[90],"SNIG":[95,138,172,201,217],"develops":[96],"highly":[97],"optimized":[98],"kernels":[100],"leverages":[102],"power":[104],"CUDA":[106],"Graphs":[107],"enable":[109],"decomposition":[111,118],"parallelisms.":[116],"Our":[117],"strategy":[119],"is":[120,202],"flexible":[121],"scalable":[123,153],"different":[125],"partitions":[126],"volumes,":[129],"sizes,":[131],"GPU":[133,157],"numbers.":[134],"We":[135],"have":[136],"evaluated":[137],"official":[141],"benchmarks":[142],"Sparse":[145,169,225],"DNN":[146,170,226],"demonstrated":[149],"its":[150],"promising":[151],"performance":[152],"from":[154],"a":[155,180,208,212],"single":[156,181],"multiple":[159],"GPUs.":[160,216],"Compared":[161],"champion":[164],"2019":[167],"Challenge,":[171],"finish":[174],"all":[175],"workloads":[177],"using":[178],"only":[179],"GPU.":[182],"At":[183],"largest":[185],"DNN,":[186],"which":[187],"more":[189],"than":[190,207],"4":[191,215],"billion":[192],"parameters":[193],"across":[194],"1920":[195],"layers":[196],"each":[197],"65536":[199],"neurons,":[200],"up":[203],"2.3":[205],"faster":[206],"state-of-the-art":[209],"baseline":[210],"under":[211],"machine":[213],"receives":[218],"Champion":[220],"Award":[221],"in":[222],"2020":[223],"Challenge.":[227]},"counts_by_year":[{"year":2025,"cited_by_count":9},{"year":2024,"cited_by_count":8},{"year":2023,"cited_by_count":4},{"year":2022,"cited_by_count":3}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
