{"id":"https://openalex.org/W4399282122","doi":"https://doi.org/10.1145/3650200.3656593","title":"FASTEN: Fast GPU-accelerated Segmented Matrix Multiplication for Heterogenous Graph Neural Networks","display_name":"FASTEN: Fast GPU-accelerated Segmented Matrix Multiplication for Heterogenous Graph Neural Networks","publication_year":2024,"publication_date":"2024-05-30","ids":{"openalex":"https://openalex.org/W4399282122","doi":"https://doi.org/10.1145/3650200.3656593"},"language":"en","primary_location":{"id":"doi:10.1145/3650200.3656593","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3650200.3656593","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3650200.3656593","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 38th ACM International Conference on Supercomputing","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3650200.3656593","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5063326523","display_name":"Keren Zhou","orcid":"https://orcid.org/0000-0002-7977-3182"},"institutions":[{"id":"https://openalex.org/I162714631","display_name":"George Mason University","ror":"https://ror.org/02jqj7156","country_code":"US","type":"education","lineage":["https://openalex.org/I162714631"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Keren Zhou","raw_affiliation_strings":["George Mason University, USA"],"affiliations":[{"raw_affiliation_string":"George Mason University, USA","institution_ids":["https://openalex.org/I162714631"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5040442274","display_name":"Karthik Ganapathi Subramanian","orcid":"https://orcid.org/0009-0006-7709-1184"},"institutions":[{"id":"https://openalex.org/I137902535","display_name":"North Carolina State University","ror":"https://ror.org/04tj63d06","country_code":"US","type":"education","lineage":["https://openalex.org/I137902535"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Karthik Ganapathi Subramanian","raw_affiliation_strings":["North Carolina State University, USA"],"affiliations":[{"raw_affiliation_string":"North Carolina State University, USA","institution_ids":["https://openalex.org/I137902535"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103049544","display_name":"Po-Hsun Lin","orcid":"https://orcid.org/0009-0002-6466-276X"},"institutions":[{"id":"https://openalex.org/I137902535","display_name":"North Carolina State University","ror":"https://ror.org/04tj63d06","country_code":"US","type":"education","lineage":["https://openalex.org/I137902535"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Po-Hsun Lin","raw_affiliation_strings":["North Carolina State University, USA"],"affiliations":[{"raw_affiliation_string":"North Carolina State University, USA","institution_ids":["https://openalex.org/I137902535"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5002742256","display_name":"Matthias Fey","orcid":"https://orcid.org/0000-0002-5727-0701"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Matthias Fey","raw_affiliation_strings":["Kumo.AI, USA"],"affiliations":[{"raw_affiliation_string":"Kumo.AI, USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5009761999","display_name":"Binqian Yin","orcid":"https://orcid.org/0009-0000-7336-0507"},"institutions":[{"id":"https://openalex.org/I162714631","display_name":"George Mason University","ror":"https://ror.org/02jqj7156","country_code":"US","type":"education","lineage":["https://openalex.org/I162714631"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Binqian Yin","raw_affiliation_strings":["George Mason University, USA"],"affiliations":[{"raw_affiliation_string":"George Mason University, USA","institution_ids":["https://openalex.org/I162714631"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100454305","display_name":"Jiajia Li","orcid":"https://orcid.org/0000-0003-1270-4147"},"institutions":[{"id":"https://openalex.org/I137902535","display_name":"North Carolina State University","ror":"https://ror.org/04tj63d06","country_code":"US","type":"education","lineage":["https://openalex.org/I137902535"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jiajia Li","raw_affiliation_strings":["North Carolina State University, USA"],"affiliations":[{"raw_affiliation_string":"North Carolina State University, USA","institution_ids":["https://openalex.org/I137902535"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5063326523"],"corresponding_institution_ids":["https://openalex.org/I162714631"],"apc_list":null,"apc_paid":null,"fwci":0.3475,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.63133109,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"511","last_page":"524"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11273","display_name":"Advanced Graph Neural Networks","score":0.9986000061035156,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11273","display_name":"Advanced Graph Neural Networks","score":0.9986000061035156,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12808","display_name":"Ferroelectric and Negative Capacitance Devices","score":0.9977999925613403,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9975000023841858,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8469111919403076},{"id":"https://openalex.org/keywords/matrix-multiplication","display_name":"Matrix multiplication","score":0.6516121029853821},{"id":"https://openalex.org/keywords/workload","display_name":"Workload","score":0.5704624056816101},{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.45239442586898804},{"id":"https://openalex.org/keywords/graph","display_name":"Graph","score":0.4429781436920166},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.41475972533226013},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.39525681734085083},{"id":"https://openalex.org/keywords/theoretical-computer-science","display_name":"Theoretical computer science","score":0.38963770866394043},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.25006160140037537},{"id":"https://openalex.org/keywords/mathematical-optimization","display_name":"Mathematical optimization","score":0.12360602617263794}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8469111919403076},{"id":"https://openalex.org/C17349429","wikidata":"https://www.wikidata.org/wiki/Q1049914","display_name":"Matrix multiplication","level":3,"score":0.6516121029853821},{"id":"https://openalex.org/C2778476105","wikidata":"https://www.wikidata.org/wiki/Q628539","display_name":"Workload","level":2,"score":0.5704624056816101},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.45239442586898804},{"id":"https://openalex.org/C132525143","wikidata":"https://www.wikidata.org/wiki/Q141488","display_name":"Graph","level":2,"score":0.4429781436920166},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.41475972533226013},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.39525681734085083},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.38963770866394043},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.25006160140037537},{"id":"https://openalex.org/C126255220","wikidata":"https://www.wikidata.org/wiki/Q141495","display_name":"Mathematical optimization","level":1,"score":0.12360602617263794},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C84114770","wikidata":"https://www.wikidata.org/wiki/Q46344","display_name":"Quantum","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3650200.3656593","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3650200.3656593","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3650200.3656593","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 38th ACM International Conference on Supercomputing","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3650200.3656593","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3650200.3656593","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3650200.3656593","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 38th ACM International Conference on Supercomputing","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4399282122.pdf"},"referenced_works_count":31,"referenced_works":["https://openalex.org/W1529533208","https://openalex.org/W1968902482","https://openalex.org/W2002555321","https://openalex.org/W2079038734","https://openalex.org/W2099021415","https://openalex.org/W2134427337","https://openalex.org/W2410071687","https://openalex.org/W2604314403","https://openalex.org/W2605251767","https://openalex.org/W2906943923","https://openalex.org/W2907492528","https://openalex.org/W2913790721","https://openalex.org/W2953264125","https://openalex.org/W3004507689","https://openalex.org/W3012871709","https://openalex.org/W3103513278","https://openalex.org/W3105696640","https://openalex.org/W3111142806","https://openalex.org/W3132185085","https://openalex.org/W3136358373","https://openalex.org/W3142588439","https://openalex.org/W3160021293","https://openalex.org/W3187018213","https://openalex.org/W3203965336","https://openalex.org/W4200051960","https://openalex.org/W4226237846","https://openalex.org/W4312324806","https://openalex.org/W4327694885","https://openalex.org/W4327911434","https://openalex.org/W4385585441","https://openalex.org/W6689518644"],"related_works":["https://openalex.org/W2000785801","https://openalex.org/W986318368","https://openalex.org/W2384410913","https://openalex.org/W2352878646","https://openalex.org/W2004734601","https://openalex.org/W2130149817","https://openalex.org/W2990194547","https://openalex.org/W1480123525","https://openalex.org/W1925544630","https://openalex.org/W2004686618"],"abstract_inverted_index":{"This":[0],"paper":[1],"introduces":[2],"FASTEN,":[3],"a":[4,32,56,77],"cutting-edge":[5],"library":[6],"developed":[7],"to":[8,52,81,90,122],"address":[9],"the":[10,26,83],"computational":[11],"challenges":[12],"inherent":[13],"in":[14,100],"Heterogeneous":[15],"Graph":[16],"Neural":[17],"Networks":[18],"(HGNNs).":[19],"The":[20],"key":[21],"focus":[22],"of":[23,28,50,70],"FASTEN":[24,46,87,125],"is":[25],"optimization":[27],"segmented":[29,74],"matrix":[30],"multiplication,":[31],"critical":[33],"operator":[34],"where":[35],"existing":[36,101],"GNN":[37],"frameworks":[38,94],"and":[39,73,76,120,130,136],"linear":[40],"algebra":[41],"libraries":[42],"often":[43],"fall":[44],"short.":[45],"offers":[47],"an":[48],"array":[49],"solutions":[51],"these":[53],"challenges,":[54],"including":[55,116],"routing":[57],"table":[58],"designed":[59],"for":[60,67],"efficient":[61],"workload":[62],"scheduling,":[63],"adaptive":[64],"algorithms":[65],"tailored":[66],"handling":[68],"segments":[69],"different":[71],"shapes":[72],"dimensions,":[75],"performance":[78,132],"model-guided":[79],"autotuner":[80],"select":[82],"best":[84],"configurations.":[85],"Furthermore,":[86],"implements":[88],"interfaces":[89],"integrate":[91],"with":[92,104],"widely-used":[93],"like":[95],"PyG,":[96],"ensuring":[97],"straightforward":[98],"adoption":[99],"HGNN":[102],"models":[103],"minimal":[105],"adjustments.":[106],"We":[107],"have":[108],"performed":[109],"comprehensive":[110],"benchmarks":[111],"on":[112],"advanced":[113],"GPU":[114],"architectures,":[115],"NVIDIA":[117],"H100,":[118],"A100,":[119],"RTX4090,":[121],"demonstrate":[123],"that":[124],"significantly":[126],"improves":[127],"both":[128],"operator-wise":[129],"end-to-end":[131],"across":[133],"various":[134],"datasets":[135],"HGNNs.":[137]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-10T00:00:00"}
