{"id":"https://openalex.org/W4409248815","doi":"https://doi.org/10.1109/hpca61900.2025.00129","title":"Make LLM Inference Affordable to Everyone: Augmenting GPU Memory with NDP-DIMM","display_name":"Make LLM Inference Affordable to Everyone: Augmenting GPU Memory with NDP-DIMM","publication_year":2025,"publication_date":"2025-03-01","ids":{"openalex":"https://openalex.org/W4409248815","doi":"https://doi.org/10.1109/hpca61900.2025.00129"},"language":"en","primary_location":{"id":"doi:10.1109/hpca61900.2025.00129","is_oa":false,"landing_page_url":"https://doi.org/10.1109/hpca61900.2025.00129","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Symposium on High Performance Computer Architecture (HPCA)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5103253476","display_name":"Lian Liu","orcid":"https://orcid.org/0000-0003-2226-2303"},"institutions":[{"id":"https://openalex.org/I4210090176","display_name":"Institute of Computing Technology","ror":"https://ror.org/0090r4d87","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210090176"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Lian Liu","raw_affiliation_strings":["Institute of Computing Technology,Chinese Academic of Sciences"],"affiliations":[{"raw_affiliation_string":"Institute of Computing Technology,Chinese Academic of Sciences","institution_ids":["https://openalex.org/I4210090176"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5022007971","display_name":"Shixin Zhao","orcid":"https://orcid.org/0000-0002-5175-7025"},"institutions":[{"id":"https://openalex.org/I4210090176","display_name":"Institute of Computing Technology","ror":"https://ror.org/0090r4d87","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210090176"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shixin Zhao","raw_affiliation_strings":["Institute of Computing Technology,Chinese Academic of Sciences"],"affiliations":[{"raw_affiliation_string":"Institute of Computing Technology,Chinese Academic of Sciences","institution_ids":["https://openalex.org/I4210090176"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100451305","display_name":"Bing Li","orcid":"https://orcid.org/0000-0003-0732-2267"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"government","lineage":["https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Bing Li","raw_affiliation_strings":["Institute of Microelectronics,Chinese Academy of Sciences"],"affiliations":[{"raw_affiliation_string":"Institute of Microelectronics,Chinese Academy of Sciences","institution_ids":["https://openalex.org/I19820366"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Haimeng Ren","orcid":null},"institutions":[{"id":"https://openalex.org/I4210090176","display_name":"Institute of Computing Technology","ror":"https://ror.org/0090r4d87","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210090176"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Haimeng Ren","raw_affiliation_strings":["Institute of Computing Technology,Chinese Academic of Sciences"],"affiliations":[{"raw_affiliation_string":"Institute of Computing Technology,Chinese Academic of Sciences","institution_ids":["https://openalex.org/I4210090176"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5036683049","display_name":"Zhichao Xu","orcid":"https://orcid.org/0000-0002-2370-4487"},"institutions":[{"id":"https://openalex.org/I4210090176","display_name":"Institute of Computing Technology","ror":"https://ror.org/0090r4d87","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210090176"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhaohui Xu","raw_affiliation_strings":["Institute of Computing Technology,Chinese Academic of Sciences"],"affiliations":[{"raw_affiliation_string":"Institute of Computing Technology,Chinese Academic of Sciences","institution_ids":["https://openalex.org/I4210090176"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100707463","display_name":"Mengdi Wang","orcid":"https://orcid.org/0000-0002-7012-2308"},"institutions":[{"id":"https://openalex.org/I4210090176","display_name":"Institute of Computing Technology","ror":"https://ror.org/0090r4d87","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210090176"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Mengdi Wang","raw_affiliation_strings":["Institute of Computing Technology,Chinese Academic of Sciences"],"affiliations":[{"raw_affiliation_string":"Institute of Computing Technology,Chinese Academic of Sciences","institution_ids":["https://openalex.org/I4210090176"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5107892598","display_name":"Xiaowei Li","orcid":"https://orcid.org/0009-0004-2060-7384"},"institutions":[{"id":"https://openalex.org/I4210090176","display_name":"Institute of Computing Technology","ror":"https://ror.org/0090r4d87","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210090176"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaowei Li","raw_affiliation_strings":["Institute of Computing Technology,Chinese Academic of Sciences"],"affiliations":[{"raw_affiliation_string":"Institute of Computing Technology,Chinese Academic of Sciences","institution_ids":["https://openalex.org/I4210090176"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101512833","display_name":"Yinhe Han","orcid":"https://orcid.org/0000-0001-5113-8067"},"institutions":[{"id":"https://openalex.org/I4210090176","display_name":"Institute of Computing Technology","ror":"https://ror.org/0090r4d87","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210090176"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yinhe Han","raw_affiliation_strings":["Institute of Computing Technology,Chinese Academic of Sciences"],"affiliations":[{"raw_affiliation_string":"Institute of Computing Technology,Chinese Academic of Sciences","institution_ids":["https://openalex.org/I4210090176"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100347055","display_name":"Ying Wang","orcid":"https://orcid.org/0000-0002-0682-2609"},"institutions":[{"id":"https://openalex.org/I4210090176","display_name":"Institute of Computing Technology","ror":"https://ror.org/0090r4d87","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210090176"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ying Wang","raw_affiliation_strings":["Institute of Computing Technology,Chinese Academic of Sciences"],"affiliations":[{"raw_affiliation_string":"Institute of Computing Technology,Chinese Academic of Sciences","institution_ids":["https://openalex.org/I4210090176"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5103253476"],"corresponding_institution_ids":["https://openalex.org/I4210090176"],"apc_list":null,"apc_paid":null,"fwci":5.8915,"has_fulltext":false,"cited_by_count":5,"citation_normalized_percentile":{"value":0.95957725,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":95,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1751","last_page":"1765"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.7310000061988831,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.7310000061988831,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.6277999877929688,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6457514762878418},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.5479774475097656},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.4788517951965332},{"id":"https://openalex.org/keywords/computational-science","display_name":"Computational science","score":0.3767171800136566},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.24534043669700623}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6457514762878418},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.5479774475097656},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.4788517951965332},{"id":"https://openalex.org/C459310","wikidata":"https://www.wikidata.org/wiki/Q117801","display_name":"Computational science","level":1,"score":0.3767171800136566},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.24534043669700623}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/hpca61900.2025.00129","is_oa":false,"landing_page_url":"https://doi.org/10.1109/hpca61900.2025.00129","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Symposium on High Performance Computer Architecture (HPCA)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":42,"referenced_works":["https://openalex.org/W2122563027","https://openalex.org/W2149172130","https://openalex.org/W2605347906","https://openalex.org/W2762749859","https://openalex.org/W2904929935","https://openalex.org/W2979719709","https://openalex.org/W2982008795","https://openalex.org/W2998617917","https://openalex.org/W3043023836","https://openalex.org/W3081168214","https://openalex.org/W3113276786","https://openalex.org/W3210580311","https://openalex.org/W4249322926","https://openalex.org/W4253707770","https://openalex.org/W4360831749","https://openalex.org/W4379116086","https://openalex.org/W4386763970","https://openalex.org/W4387064011","https://openalex.org/W4387302777","https://openalex.org/W4387321091","https://openalex.org/W4388757726","https://openalex.org/W4392427708","https://openalex.org/W4394998968","https://openalex.org/W4395073431","https://openalex.org/W4395106409","https://openalex.org/W4400985603","https://openalex.org/W4401211602","https://openalex.org/W4404133677","https://openalex.org/W4404401018","https://openalex.org/W4404848672","https://openalex.org/W6681290437","https://openalex.org/W6727099177","https://openalex.org/W6769627184","https://openalex.org/W6788175385","https://openalex.org/W6811340617","https://openalex.org/W6850927664","https://openalex.org/W6854866820","https://openalex.org/W6857232268","https://openalex.org/W6857551316","https://openalex.org/W6858453470","https://openalex.org/W6861747715","https://openalex.org/W6862046771"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052"],"abstract_inverted_index":{"The":[0],"billion-scale":[1],"Large":[2],"Language":[3],"Models":[4],"(LLMs)":[5],"necessitate":[6],"deployment":[7,266],"on":[8,28,269,291],"expensive":[9,45],"server-grade":[10],"GPUs":[11,46],"with":[12],"large-storage":[13],"HBMs":[14],"and":[15,58,117,211,214,239,243,278],"abundant":[16],"computation":[17,195],"capability.":[18],"As":[19],"LLM-assisted":[20],"services":[21],"become":[22],"popular,":[23],"achieving":[24,95],"cost-effective":[25],"LLM":[26,42,97],"inference":[27,63,289],"budget-friendly":[29,73],"hardware":[30,271],"becomes":[31],"the":[32,52,56,62,77,88,102,137,147,159,199,265,286],"current":[33],"trend.":[34],"This":[35,68],"has":[36],"sparked":[37],"extensive":[38],"research":[39],"into":[40,112],"relocating":[41],"parameters":[43,111,151],"from":[44],"to":[47,86,174,186,253],"external":[48],"host":[49,57],"memory.":[50],"However,":[51],"restricted":[53],"bandwidth":[54],"between":[55,241],"GPU":[59,178,242],"memory":[60,191],"limits":[61],"performance":[64,89],"of":[65,90,125,129,136,150,158,202,209,217,267,275],"existing":[66],"solutions.":[67],"work":[69],"introduces":[70],"Hermes,":[71],"a":[72,91,167,175,206,230,248,273],"system":[74,290],"that":[75,101,233],"leverages":[76],"near-data":[78],"processing":[79],"units":[80],"(NDP)":[81],"within":[82],"commodity":[83],"DRAM":[84],"DIMMs":[85],"enhance":[87],"single":[92,176],"consumer-grade":[93,270],"GPU,":[94],"efficient":[96],"inference.":[98],"We":[99],"recognize":[100],"inherent":[103],"activation":[104,203],"sparsity":[105,204],"in":[106],"LLMs":[107],"naturally":[108],"divides":[109],"weight":[110,131],"two":[113],"categories,":[114],"termed":[115],"\u201chot\u201d":[116],"\u201ccold\u201d":[118],"neurons,":[119,122],"respectively.":[120],"Hot":[121],"which":[123,188],"consist":[124],"only":[126],"approximately":[127],"20%":[128,157],"all":[130],"parameters,":[132],"account":[133],"for":[134,155],"80%":[135,149],"total":[138],"computational":[139,160],"load.":[140],"In":[141,197,261],"contrast,":[142],"cold":[143,184,212,218],"neurons":[144,173,185,213,219],"make":[145],"up":[146],"other":[148],"but":[152,193],"are":[153],"responsible":[154],"just":[156],"workload.":[161],"Leveraging":[162],"this":[163],"observation,":[164],"we":[165,228,246],"propose":[166],"heterogeneous":[168],"computing":[169],"strategy:":[170],"mapping":[171],"hot":[172,210],"computation-efficient":[177],"without":[179],"large-capacity":[180],"HBMs,":[181],"while":[182],"offloading":[183],"NDP-DIMMs,":[187],"offer":[189],"large":[190],"size":[192],"limited":[194],"capabilities.":[196],"addition,":[198],"dynamic":[200],"nature":[201],"necessitates":[205],"real-time":[207,236],"partition":[208,238],"adaptive":[215],"remapping":[216],"across":[220],"multiple":[221,258],"NDP-DIMM":[222,259],"modules.":[223,260],"To":[224],"tackle":[225],"these":[226],"issues,":[227],"introduce":[229],"lightweight":[231],"predictor":[232],"ensures":[234],"optimal":[235],"neuron":[237],"adjustment":[240],"NDP-DIMMs.":[244],"Furthermore,":[245],"utilize":[247],"window-based":[249],"online":[250],"scheduling":[251],"mechanism":[252],"maintain":[254],"load":[255],"balance":[256],"among":[257],"summary,":[262],"Hermes":[263],"facilitates":[264],"LLaMA2-70B":[268],"at":[272],"rate":[274],"13.75":[276],"tokens/s":[277],"realizes":[279],"an":[280],"average":[281],"75.24":[282],"\u00d7":[283],"speedup":[284],"over":[285],"state-of-the-art":[287],"offloading-based":[288],"popular":[292],"LLMs.":[293]},"counts_by_year":[{"year":2026,"cited_by_count":3},{"year":2025,"cited_by_count":2}],"updated_date":"2026-04-17T18:11:37.981687","created_date":"2025-10-10T00:00:00"}
