{"id":"https://openalex.org/W4416250700","doi":"https://doi.org/10.1109/ijcnn64981.2025.11227197","title":"GraNNite: Enabling High-Performance Execution of Graph Neural Networks on Resource-Constrained Neural Processing Units","display_name":"GraNNite: Enabling High-Performance Execution of Graph Neural Networks on Resource-Constrained Neural Processing Units","publication_year":2025,"publication_date":"2025-06-30","ids":{"openalex":"https://openalex.org/W4416250700","doi":"https://doi.org/10.1109/ijcnn64981.2025.11227197"},"language":null,"primary_location":{"id":"doi:10.1109/ijcnn64981.2025.11227197","is_oa":false,"landing_page_url":"https://doi.org/10.1109/ijcnn64981.2025.11227197","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 International Joint Conference on Neural Networks (IJCNN)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5102891312","display_name":"Arghadip Das","orcid":"https://orcid.org/0000-0001-6043-4685"},"institutions":[{"id":"https://openalex.org/I219193219","display_name":"Purdue University West Lafayette","ror":"https://ror.org/02dqehb95","country_code":"US","type":"education","lineage":["https://openalex.org/I219193219"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Arghadip Das","raw_affiliation_strings":["Purdue University,Elmore Family School of Electrical and Computer Engineering,West Lafayette,IN,USA"],"affiliations":[{"raw_affiliation_string":"Purdue University,Elmore Family School of Electrical and Computer Engineering,West Lafayette,IN,USA","institution_ids":["https://openalex.org/I219193219"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5066094401","display_name":"Shamik Kundu","orcid":"https://orcid.org/0000-0002-5992-8554"},"institutions":[{"id":"https://openalex.org/I1343180700","display_name":"Intel (United States)","ror":"https://ror.org/01ek73717","country_code":"US","type":"company","lineage":["https://openalex.org/I1343180700"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Shamik Kundu","raw_affiliation_strings":["Intel Corporation,Advanced Architecture Research Team, NPU IP, CCG,Santa Clara,CA,USA"],"affiliations":[{"raw_affiliation_string":"Intel Corporation,Advanced Architecture Research Team, NPU IP, CCG,Santa Clara,CA,USA","institution_ids":["https://openalex.org/I1343180700"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5066464351","display_name":"Arnab Raha","orcid":"https://orcid.org/0000-0002-8848-1069"},"institutions":[{"id":"https://openalex.org/I1343180700","display_name":"Intel (United States)","ror":"https://ror.org/01ek73717","country_code":"US","type":"company","lineage":["https://openalex.org/I1343180700"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Arnab Raha","raw_affiliation_strings":["Intel Corporation,Advanced Architecture Research Team, NPU IP, CCG,Santa Clara,CA,USA"],"affiliations":[{"raw_affiliation_string":"Intel Corporation,Advanced Architecture Research Team, NPU IP, CCG,Santa Clara,CA,USA","institution_ids":["https://openalex.org/I1343180700"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5043334127","display_name":"Soumendu Kumar Ghosh","orcid":"https://orcid.org/0000-0001-6776-1427"},"institutions":[{"id":"https://openalex.org/I1343180700","display_name":"Intel (United States)","ror":"https://ror.org/01ek73717","country_code":"US","type":"company","lineage":["https://openalex.org/I1343180700"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Soumendu Ghosh","raw_affiliation_strings":["Intel Corporation,Advanced Architecture Research Team, NPU IP, CCG,Santa Clara,CA,USA"],"affiliations":[{"raw_affiliation_string":"Intel Corporation,Advanced Architecture Research Team, NPU IP, CCG,Santa Clara,CA,USA","institution_ids":["https://openalex.org/I1343180700"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5052021148","display_name":"Deepak A. Mathaikutty","orcid":"https://orcid.org/0009-0004-3768-1337"},"institutions":[{"id":"https://openalex.org/I1343180700","display_name":"Intel (United States)","ror":"https://ror.org/01ek73717","country_code":"US","type":"company","lineage":["https://openalex.org/I1343180700"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Deepak Mathaikutty","raw_affiliation_strings":["Intel Corporation,Advanced Architecture Research Team, NPU IP, CCG,Santa Clara,CA,USA"],"affiliations":[{"raw_affiliation_string":"Intel Corporation,Advanced Architecture Research Team, NPU IP, CCG,Santa Clara,CA,USA","institution_ids":["https://openalex.org/I1343180700"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5102009759","display_name":"Vijay Raghunathan","orcid":"https://orcid.org/0000-0003-4713-5386"},"institutions":[{"id":"https://openalex.org/I219193219","display_name":"Purdue University West Lafayette","ror":"https://ror.org/02dqehb95","country_code":"US","type":"education","lineage":["https://openalex.org/I219193219"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Vijay Raghunathan","raw_affiliation_strings":["Purdue University,Elmore Family School of Electrical and Computer Engineering,West Lafayette,IN,USA"],"affiliations":[{"raw_affiliation_string":"Purdue University,Elmore Family School of Electrical and Computer Engineering,West Lafayette,IN,USA","institution_ids":["https://openalex.org/I219193219"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5102891312"],"corresponding_institution_ids":["https://openalex.org/I219193219"],"apc_list":null,"apc_paid":null,"fwci":5.5956,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.96128424,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":95,"max":96},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"10"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11273","display_name":"Advanced Graph Neural Networks","score":0.7414000034332275,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11273","display_name":"Advanced Graph Neural Networks","score":0.7414000034332275,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12292","display_name":"Graph Theory and Algorithms","score":0.15700000524520874,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.017400000244379044,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/graph","display_name":"Graph","score":0.48559999465942383},{"id":"https://openalex.org/keywords/energy-consumption","display_name":"Energy consumption","score":0.45010000467300415},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.43059998750686646},{"id":"https://openalex.org/keywords/cloud-computing","display_name":"Cloud computing","score":0.42969998717308044},{"id":"https://openalex.org/keywords/efficient-energy-use","display_name":"Efficient energy use","score":0.42969998717308044},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.42579999566078186},{"id":"https://openalex.org/keywords/redundancy","display_name":"Redundancy (engineering)","score":0.40540000796318054},{"id":"https://openalex.org/keywords/workload","display_name":"Workload","score":0.37450000643730164},{"id":"https://openalex.org/keywords/edge-device","display_name":"Edge device","score":0.3637000024318695}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.863099992275238},{"id":"https://openalex.org/C132525143","wikidata":"https://www.wikidata.org/wiki/Q141488","display_name":"Graph","level":2,"score":0.48559999465942383},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.4514000117778778},{"id":"https://openalex.org/C2780165032","wikidata":"https://www.wikidata.org/wiki/Q16869822","display_name":"Energy consumption","level":2,"score":0.45010000467300415},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.43059998750686646},{"id":"https://openalex.org/C2742236","wikidata":"https://www.wikidata.org/wiki/Q924713","display_name":"Efficient energy use","level":2,"score":0.42969998717308044},{"id":"https://openalex.org/C79974875","wikidata":"https://www.wikidata.org/wiki/Q483639","display_name":"Cloud computing","level":2,"score":0.42969998717308044},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.42579999566078186},{"id":"https://openalex.org/C152124472","wikidata":"https://www.wikidata.org/wiki/Q1204361","display_name":"Redundancy (engineering)","level":2,"score":0.40540000796318054},{"id":"https://openalex.org/C2778476105","wikidata":"https://www.wikidata.org/wiki/Q628539","display_name":"Workload","level":2,"score":0.37450000643730164},{"id":"https://openalex.org/C138236772","wikidata":"https://www.wikidata.org/wiki/Q25098575","display_name":"Edge device","level":3,"score":0.3637000024318695},{"id":"https://openalex.org/C2778456923","wikidata":"https://www.wikidata.org/wiki/Q5337692","display_name":"Edge computing","level":3,"score":0.36000001430511475},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.35589998960494995},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.3440999984741211},{"id":"https://openalex.org/C171268870","wikidata":"https://www.wikidata.org/wiki/Q1486676","display_name":"GRASP","level":2,"score":0.3422999978065491},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.3377000093460083},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.33379998803138733},{"id":"https://openalex.org/C28855332","wikidata":"https://www.wikidata.org/wiki/Q198099","display_name":"Quantization (signal processing)","level":2,"score":0.3260999917984009},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2976999878883362},{"id":"https://openalex.org/C45347329","wikidata":"https://www.wikidata.org/wiki/Q5166604","display_name":"Convolution (computer science)","level":3,"score":0.29260000586509705},{"id":"https://openalex.org/C2781357197","wikidata":"https://www.wikidata.org/wiki/Q5757597","display_name":"High memory","level":2,"score":0.29260000586509705},{"id":"https://openalex.org/C165435473","wikidata":"https://www.wikidata.org/wiki/Q1509884","display_name":"Padding","level":2,"score":0.28790000081062317},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.2800999879837036},{"id":"https://openalex.org/C61423126","wikidata":"https://www.wikidata.org/wiki/Q187432","display_name":"Scripting language","level":2,"score":0.27390000224113464},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.26930001378059387},{"id":"https://openalex.org/C2778915421","wikidata":"https://www.wikidata.org/wiki/Q3643177","display_name":"Performance improvement","level":2,"score":0.26429998874664307},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.25519999861717224},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.2549999952316284},{"id":"https://openalex.org/C81363708","wikidata":"https://www.wikidata.org/wiki/Q17084460","display_name":"Convolutional neural network","level":2,"score":0.25459998846054077},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.25110000371932983}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/ijcnn64981.2025.11227197","is_oa":false,"landing_page_url":"https://doi.org/10.1109/ijcnn64981.2025.11227197","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 International Joint Conference on Neural Networks (IJCNN)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":21,"referenced_works":["https://openalex.org/W2558748708","https://openalex.org/W2907492528","https://openalex.org/W2962821792","https://openalex.org/W3047846843","https://openalex.org/W3080083111","https://openalex.org/W3154000858","https://openalex.org/W3158353424","https://openalex.org/W3200832253","https://openalex.org/W4312281374","https://openalex.org/W4313145155","https://openalex.org/W4362473790","https://openalex.org/W4382866877","https://openalex.org/W4386076442","https://openalex.org/W4386764393","https://openalex.org/W4386790419","https://openalex.org/W4387199936","https://openalex.org/W4392867133","https://openalex.org/W4396811547","https://openalex.org/W4400796231","https://openalex.org/W4404133668","https://openalex.org/W4405718104"],"related_works":[],"abstract_inverted_index":{"Graph":[0,198],"Neural":[1],"Networks":[2],"(GNNs)":[3],"are":[4],"crucial":[5],"for":[6,51,142,160,165,176,189,194,246],"learning":[7],"and":[8,20,32,39,56,67,74,86,138,145,163,171,192,203,207,230,236,242,261,288,299,305],"reasoning":[9],"over":[10,274],"graph-structured":[11],"data,":[12],"with":[13,97,216,249,278],"applications":[14],"in":[15],"network":[16],"analysis,":[17],"recommendation":[18],"systems,":[19],"speech":[21],"analytics.":[22],"Deploying":[23],"them":[24],"on":[25,77,117,133,255],"edge":[26,81],"devices,":[27],"such":[28,157,186],"as":[29,158,187],"client":[30],"PCs":[31,264],"laptops,":[33],"enables":[34],"real-time":[35,174],"processing,":[36],"enhances":[37],"privacy,":[38],"reduces":[40],"cloud":[41],"dependency.":[42],"For":[43,197],"instance,":[44],"GNNs":[45],"can":[46],"augment":[47],"Retrieval-Augmented":[48],"Generation":[49],"(RAG)":[50],"Large":[52],"Language":[53],"Models":[54],"(LLMs)":[55],"enable":[57],"event-based":[58],"vision":[59],"tasks.":[60],"However,":[61],"irregular":[62,98],"memory":[63,208,228],"access,":[64],"sparse":[65],"graphs,":[66],"dynamic":[68,177],"structures":[69],"lead":[70],"to":[71,113,226,272,283,286,297],"high":[72],"latency":[73],"energy":[75,146,279],"consumption":[76],"resource-constrained":[78],"devices.":[79],"Modern":[80],"processors":[82],"combine":[83],"CPUs,":[84],"GPUs,":[85,306],"NPUs,":[87,134],"where":[88],"NPUs":[89],"excel":[90],"at":[91,313],"data-parallel":[92],"tasks":[93],"but":[94],"face":[95],"challenges":[96],"GNN":[99,115,131,292],"computations.":[100],"To":[101],"address":[102],"these":[103],"gaps,":[104],"we":[105],"present":[106],"GraNNite,":[107],"the":[108,152],"first":[109,153],"hardware-aware":[110],"framework":[111],"tailored":[112],"optimize":[114,238],"deployment":[116],"commercial-off-the-shelf":[118],"(COTS)":[119],"state-of-the-art":[120],"(SOTA)":[121],"DNN":[122],"accelerators":[123],"using":[124],"a":[125],"systematic":[126],"three-step":[127],"methodology:":[128],"(1)":[129],"enabling":[130],"execution":[132],"(2)":[135],"optimizing":[136],"performance,":[137],"(3)":[139],"trading":[140],"accuracy":[141],"further":[143],"performance":[144,180,302],"efficiency":[147,219,280],"gains.":[148],"Towards":[149],"that":[150,266],"end,":[151],"category":[154],"includes":[155],"techniques":[156,185,214],"GraphSplit":[159],"workload":[161],"distribution":[162],"StaGr":[164],"static":[166],"graph":[167,239],"aggregation,":[168],"while":[169,233],"GrAd":[170],"NodePad":[172],"handle":[173],"updates":[175],"graphs.":[178],"Next,":[179],"improvement":[181],"is":[182,311],"acquired":[183],"through":[184],"EffOp":[188],"control-heavy":[190],"operations":[191],"GraSp":[193],"sparsity":[195],"exploitation.":[196],"Convolution":[199],"layers,":[200],"PreG,":[201],"SymG,":[202],"CacheG":[204],"reduce":[205],"redundancy":[206],"transfers.":[209],"The":[210],"final":[211],"class":[212],"of":[213,270],"deals":[215],"quality":[217,251],"vs":[218],"tradeoffs":[220],"\u2013":[221],"QuantGr":[222],"applies":[223],"INT8":[224],"quantization":[225],"lower":[227],"usage":[229],"computation":[231],"time,":[232],"GrAx1,":[234],"GrAx2,":[235],"GrAx3":[237],"attention,":[240],"broadcast-add,":[241],"sample-and-aggregate":[243],"(SAGE)-max":[244],"aggregation":[245],"higher":[247,301],"throughput":[248],"minimal":[250],"loss.":[252],"Experimental":[253],"evaluations":[254],"Intel\u00ae":[256],"Core\u2122":[257],"Ultra":[258],"Series":[259],"1":[260],"2":[262],"AI":[263],"demonstrate":[265],"GraNNite":[267,294],"achieves":[268],"speedups":[269],"2.6\u00d7":[271],"7.6\u00d7":[273],"default":[275],"NPU":[276],"mappings,":[277],"improvements":[281],"up":[282,296],"8.6\u00d7":[284],"compared":[285],"CPUs":[287,304],"GPUs.":[289],"Across":[290],"various":[291],"models,":[293],"delivers":[295],"10.8\u00d7":[298],"6.7\u00d7":[300],"than":[303],"respectively.":[307],"Our":[308],"code":[309],"implementation":[310],"available":[312],"this":[314],"link.":[315]},"counts_by_year":[{"year":2025,"cited_by_count":2}],"updated_date":"2026-03-09T08:58:05.943551","created_date":"2025-11-14T00:00:00"}
