{"id":"https://openalex.org/W4400017463","doi":"https://doi.org/10.1145/3674911","title":"AG-SpTRSV: An Automatic Framework to Optimize Sparse Triangular Solve on GPUs","display_name":"AG-SpTRSV: An Automatic Framework to Optimize Sparse Triangular Solve on GPUs","publication_year":2024,"publication_date":"2024-06-25","ids":{"openalex":"https://openalex.org/W4400017463","doi":"https://doi.org/10.1145/3674911"},"language":"en","primary_location":{"id":"doi:10.1145/3674911","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3674911","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3674911","source":{"id":"https://openalex.org/S26056741","display_name":"ACM Transactions on Architecture and Code Optimization","issn_l":"1544-3566","issn":["1544-3566","1544-3973"],"is_oa":true,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Architecture and Code Optimization","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3674911","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5000331589","display_name":"Zhengding Hu","orcid":"https://orcid.org/0009-0005-8500-6173"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Zhengding Hu","raw_affiliation_strings":["Computer Science and Technology, University of Science and Technology of China, Hefei, China"],"affiliations":[{"raw_affiliation_string":"Computer Science and Technology, University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059770282","display_name":"Jingwei Sun","orcid":"https://orcid.org/0000-0001-5098-1503"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jingwei Sun","raw_affiliation_strings":["Computer Science and Technology, University of Science and Technology of China, Hefei, China"],"affiliations":[{"raw_affiliation_string":"Computer Science and Technology, University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5114804159","display_name":"Zhongyang Li","orcid":"https://orcid.org/0009-0005-5436-2319"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhongyang Li","raw_affiliation_strings":["Computer Science and Technology, University of Science and Technology of China, Hefei, China"],"affiliations":[{"raw_affiliation_string":"Computer Science and Technology, University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100932403","display_name":"Guangzhong Sun","orcid":"https://orcid.org/0000-0002-0794-7681"},"institutions":[{"id":"https://openalex.org/I126520041","display_name":"University of Science and Technology of China","ror":"https://ror.org/04c4dkn09","country_code":"CN","type":"education","lineage":["https://openalex.org/I126520041","https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Guangzhong Sun","raw_affiliation_strings":["Computer Science and Technology, University of Science and Technology of China, Hefei, China"],"affiliations":[{"raw_affiliation_string":"Computer Science and Technology, University of Science and Technology of China, Hefei, China","institution_ids":["https://openalex.org/I126520041"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5000331589"],"corresponding_institution_ids":["https://openalex.org/I126520041"],"apc_list":null,"apc_paid":null,"fwci":1.0371,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.74030117,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":99,"max":100},"biblio":{"volume":"21","issue":"4","first_page":"1","last_page":"25"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10792","display_name":"Matrix Theory and Algorithms","score":0.9983000159263611,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.998199999332428,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8060768246650696},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.5215218663215637},{"id":"https://openalex.org/keywords/computational-science","display_name":"Computational science","score":0.4660061001777649},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.32258349657058716}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8060768246650696},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.5215218663215637},{"id":"https://openalex.org/C459310","wikidata":"https://www.wikidata.org/wiki/Q117801","display_name":"Computational science","level":1,"score":0.4660061001777649},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.32258349657058716}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3674911","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3674911","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3674911","source":{"id":"https://openalex.org/S26056741","display_name":"ACM Transactions on Architecture and Code Optimization","issn_l":"1544-3566","issn":["1544-3566","1544-3973"],"is_oa":true,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Architecture and Code Optimization","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1145/3674911","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3674911","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3674911","source":{"id":"https://openalex.org/S26056741","display_name":"ACM Transactions on Architecture and Code Optimization","issn_l":"1544-3566","issn":["1544-3566","1544-3973"],"is_oa":true,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Architecture and Code Optimization","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4400017463.pdf"},"referenced_works_count":30,"referenced_works":["https://openalex.org/W1565301797","https://openalex.org/W1982358758","https://openalex.org/W2009654791","https://openalex.org/W2035080386","https://openalex.org/W2080090223","https://openalex.org/W2115052535","https://openalex.org/W2123736933","https://openalex.org/W2168931017","https://openalex.org/W2411480360","https://openalex.org/W2626696598","https://openalex.org/W2745560456","https://openalex.org/W2768065515","https://openalex.org/W2789228469","https://openalex.org/W2805150752","https://openalex.org/W2991825376","https://openalex.org/W3047681903","https://openalex.org/W3048232878","https://openalex.org/W3048330007","https://openalex.org/W3125710003","https://openalex.org/W3136600838","https://openalex.org/W3137858876","https://openalex.org/W3153190191","https://openalex.org/W3187657707","https://openalex.org/W4251541088","https://openalex.org/W4251637954","https://openalex.org/W4254054591","https://openalex.org/W4312107225","https://openalex.org/W4321636675","https://openalex.org/W4375867225","https://openalex.org/W4380356442"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2748952813","https://openalex.org/W2051487156","https://openalex.org/W2073681303","https://openalex.org/W2390279801","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052"],"abstract_inverted_index":{"Sparse":[0],"Triangular":[1],"Solve":[2],"(SpTRSV)":[3],"has":[4],"long":[5],"been":[6],"an":[7,103,131,227,267],"essential":[8],"kernel":[9,133,208],"in":[10],"the":[11,50,67,76,120,127,188,201,206,210,260,281],"field":[12],"of":[13,64,66,88,122,129,190,255,280],"scientific":[14],"computing.":[15],"Due":[16],"to":[17,29,48,54,78,106,165,204,221,277],"its":[18],"low":[19],"computational":[20],"intensity":[21],"and":[22,31,59,91,137,152,161,173,183,194,225,241],"internal":[23,195],"data":[24,69,192,196],"dependencies,":[25],"SpTRSV":[26,108,132],"is":[27,218],"hard":[28],"implement":[30],"optimize":[32,107,205],"on":[33,39,45,109,115,144,237],"graphics":[34],"processing":[35],"units":[36],"(GPUs).":[37],"Based":[38],"our":[40],"experimental":[41],"observations,":[42],"existing":[43],"implementations":[44,61,250],"GPUs":[46],"fail":[47],"achieve":[49],"optimal":[51],"performance":[52,95,114,216,261],"due":[53],"their":[55,72],"suboptimal":[56],"parallelism":[57,160],"setups":[58],"code":[60,150,154,163],"plus":[62],"lack":[63],"consideration":[65],"irregular":[68,191],"distribution.":[70],"Moreover,":[71],"algorithm":[73,89],"design":[74],"lacks":[75],"adaptability":[77],"different":[79],"input":[80],"matrices,":[81],"which":[82,111,185],"may":[83],"involve":[84],"substantial":[85],"manual":[86,123],"efforts":[87],"redesigning":[90],"parameter":[92],"tuning":[93],"for":[94,180,200,209],"consistency.":[96],"In":[97],"this":[98],"work,":[99],"we":[100],"propose":[101],"AG-SpTRSV,":[102],"automatic":[104],"framework":[105],"GPUs,":[110],"provides":[112],"high":[113],"various":[116,167],"matrices":[117],"while":[118],"eliminating":[119],"costs":[121,224],"design.":[124],"AG-SpTRSV":[125,156,177,198,247,264],"abstracts":[126],"procedures":[128],"optimizing":[130],"as":[134],"a":[135,139,148],"scheme":[136,203],"constructs":[138],"comprehensive":[140],"optimization":[141],"space":[142],"based":[143],"it.":[145],"By":[146],"defining":[147],"unified":[149],"template":[151],"preparing":[153],"variants,":[155],"enables":[157],"fine-grained":[158],"dynamic":[159],"adaptive":[162],"optimizations":[164],"handle":[166],"tasks.":[168],"Through":[169],"computation":[170],"graph":[171],"transformation":[172],"multi-hierarchy":[174],"heuristic":[175],"scheduling,":[176],"generates":[178],"schemes":[179],"task":[181],"partitioning":[182],"mapping,":[184],"effectively":[186],"address":[187],"issues":[189],"distribution":[193],"dependencies.":[197],"searches":[199],"best":[202],"target":[207],"specific":[211],"matrix.":[212],"A":[213],"learned":[214],"lightweight":[215],"model":[217,262],"also":[219],"introduced":[220],"reduce":[222],"search":[223],"provide":[226,266],"efficient":[228,268],"end-to-end":[229,269],"solution.":[230],"Experimental":[231],"results":[232],"with":[233,251,271],"SuiteSparse":[234],"Matrix":[235],"Collection":[236],"NVIDIA":[238],"Tesla":[239],"A100":[240],"RTX":[242],"3080":[243],"Ti":[244],"show":[245],"that":[246],"outperforms":[248],"state-of-the-art":[249],"geometric":[252],"average":[253],"speedups":[254],"2.12x":[256],"\u223c":[257],"3.99x.":[258],"With":[259],"enabled,":[263],"can":[265],"solution,":[270],"preprocessing":[272],"times":[273,279],"ranging":[274],"from":[275],"3.4":[276],"245":[278],"execution":[282],"time.":[283]},"counts_by_year":[{"year":2026,"cited_by_count":2}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
