{"id":"https://openalex.org/W4406754377","doi":"https://doi.org/10.1109/lca.2025.3533588","title":"GPU-Centric Memory Tiering for LLM Serving With NVIDIA Grace Hopper Superchip","display_name":"GPU-Centric Memory Tiering for LLM Serving With NVIDIA Grace Hopper Superchip","publication_year":2025,"publication_date":"2025-01-01","ids":{"openalex":"https://openalex.org/W4406754377","doi":"https://doi.org/10.1109/lca.2025.3533588"},"language":"en","primary_location":{"id":"doi:10.1109/lca.2025.3533588","is_oa":false,"landing_page_url":"https://doi.org/10.1109/lca.2025.3533588","pdf_url":null,"source":{"id":"https://openalex.org/S17643076","display_name":"IEEE Computer Architecture Letters","issn_l":"1556-6056","issn":["1556-6056","1556-6064","2473-2575"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Computer Architecture Letters","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5110124768","display_name":"Won-Hee Choi","orcid":null},"institutions":[{"id":"https://openalex.org/I57664883","display_name":"Ajou University","ror":"https://ror.org/03tzb2h73","country_code":"KR","type":"education","lineage":["https://openalex.org/I57664883"]}],"countries":["KR"],"is_corresponding":true,"raw_author_name":"Woohyung Choi","raw_affiliation_strings":["Ajou University, Suwon, South Korea","Ajou University, Korea"],"affiliations":[{"raw_affiliation_string":"Ajou University, Suwon, South Korea","institution_ids":["https://openalex.org/I57664883"]},{"raw_affiliation_string":"Ajou University, Korea","institution_ids":["https://openalex.org/I57664883"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5108260420","display_name":"Jinwoo Jeong","orcid":null},"institutions":[{"id":"https://openalex.org/I197347611","display_name":"Korea University","ror":"https://ror.org/047dqcg40","country_code":"KR","type":"education","lineage":["https://openalex.org/I197347611"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Jinwoo Jeong","raw_affiliation_strings":["Korea University, Seoul, South Korea","Korea University, Korea"],"affiliations":[{"raw_affiliation_string":"Korea University, Seoul, South Korea","institution_ids":["https://openalex.org/I197347611"]},{"raw_affiliation_string":"Korea University, Korea","institution_ids":["https://openalex.org/I197347611"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5013182828","display_name":"Hanhwi Jang","orcid":"https://orcid.org/0000-0003-3722-4131"},"institutions":[{"id":"https://openalex.org/I57664883","display_name":"Ajou University","ror":"https://ror.org/03tzb2h73","country_code":"KR","type":"education","lineage":["https://openalex.org/I57664883"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Hanhwi Jang","raw_affiliation_strings":["Ajou University, Suwon, South Korea","Ajou University, Korea"],"affiliations":[{"raw_affiliation_string":"Ajou University, Suwon, South Korea","institution_ids":["https://openalex.org/I57664883"]},{"raw_affiliation_string":"Ajou University, Korea","institution_ids":["https://openalex.org/I57664883"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5037825380","display_name":"Jeongseob Ahn","orcid":"https://orcid.org/0000-0002-4503-8294"},"institutions":[{"id":"https://openalex.org/I197347611","display_name":"Korea University","ror":"https://ror.org/047dqcg40","country_code":"KR","type":"education","lineage":["https://openalex.org/I197347611"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Jeongseob Ahn","raw_affiliation_strings":["Korea University, Seoul, South Korea","Korea University, Korea"],"affiliations":[{"raw_affiliation_string":"Korea University, Seoul, South Korea","institution_ids":["https://openalex.org/I197347611"]},{"raw_affiliation_string":"Korea University, Korea","institution_ids":["https://openalex.org/I197347611"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5110124768"],"corresponding_institution_ids":["https://openalex.org/I57664883"],"apc_list":null,"apc_paid":null,"fwci":3.4842,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.90219482,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":"24","issue":"1","first_page":"33","last_page":"36"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10191","display_name":"Robotics and Sensor-Based Localization","score":0.6959999799728394,"subfield":{"id":"https://openalex.org/subfields/2202","display_name":"Aerospace Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10191","display_name":"Robotics and Sensor-Based Localization","score":0.6959999799728394,"subfield":{"id":"https://openalex.org/subfields/2202","display_name":"Aerospace Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13715","display_name":"Power Line Inspection Robots","score":0.5975000262260437,"subfield":{"id":"https://openalex.org/subfields/2210","display_name":"Mechanical Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12676","display_name":"Machine Learning and ELM","score":0.5906999707221985,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7657635807991028},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.49696972966194153},{"id":"https://openalex.org/keywords/computer-architecture","display_name":"Computer architecture","score":0.4783189594745636},{"id":"https://openalex.org/keywords/embedded-system","display_name":"Embedded system","score":0.4580269455909729},{"id":"https://openalex.org/keywords/memory-management","display_name":"Memory management","score":0.44740182161331177},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.42136719822883606},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.3612980842590332},{"id":"https://openalex.org/keywords/semiconductor-memory","display_name":"Semiconductor memory","score":0.15130645036697388}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7657635807991028},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.49696972966194153},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.4783189594745636},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.4580269455909729},{"id":"https://openalex.org/C176649486","wikidata":"https://www.wikidata.org/wiki/Q2308807","display_name":"Memory management","level":3,"score":0.44740182161331177},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.42136719822883606},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.3612980842590332},{"id":"https://openalex.org/C98986596","wikidata":"https://www.wikidata.org/wiki/Q1143031","display_name":"Semiconductor memory","level":2,"score":0.15130645036697388}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/lca.2025.3533588","is_oa":false,"landing_page_url":"https://doi.org/10.1109/lca.2025.3533588","pdf_url":null,"source":{"id":"https://openalex.org/S17643076","display_name":"IEEE Computer Architecture Letters","issn_l":"1556-6056","issn":["1556-6056","1556-6064","2473-2575"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Computer Architecture Letters","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.800000011920929,"id":"https://metadata.un.org/sdg/7","display_name":"Affordable and clean energy"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":5,"referenced_works":["https://openalex.org/W4387321091","https://openalex.org/W4401211704","https://openalex.org/W6785197036","https://openalex.org/W6850927664","https://openalex.org/W6853048723"],"related_works":["https://openalex.org/W2418291489","https://openalex.org/W3096519538","https://openalex.org/W2744747300","https://openalex.org/W4241166160","https://openalex.org/W2068121105","https://openalex.org/W2384826897","https://openalex.org/W1973516247","https://openalex.org/W1997466117","https://openalex.org/W2795695574","https://openalex.org/W2300282708"],"abstract_inverted_index":{"This":[0,29],"study":[1],"investigates":[2],"the":[3,15,70,83,97,122,135,144,151,155,166,176,188,194],"performance":[4,39],"of":[5,137],"serving":[6,162],"large":[7,119,163],"language":[8],"models":[9,120,127],"(LLMs)":[10],"with":[11,41,48,111,143],"a":[12,23,32,38,45,53,77,158],"focus":[13],"on":[14],"high-bandwidth":[16],"interconnect":[17],"between":[18],"GPU":[19,42,67,107],"and":[20,44,90,124,169,181],"CPU":[21,113],"using":[22],"real":[24],"NVIDIA":[25],"Grace":[26],"Hopper":[27],"Superchip.":[28],"architecture":[30],"features":[31],"GPU-centric":[33,78],"memory":[34,43,62,80],"tiering":[35],"system,":[36],"comprising":[37],"tier":[40,47],"capacity":[46],"host":[49,61],"memory.":[50,68,108],"We":[51],"revisit":[52],"conventional":[54],"pipelined":[55,145,156,177],"execution":[56,157,178],"for":[57,87,118,161],"LLM":[58],"inference,":[59],"utilizing":[60],"connected":[63],"via":[64],"NVLink":[65],"alongside":[66],"For":[69,165],"Llama-3.1":[71,167],"8B":[72],"base":[73],"(FP16)":[74],"model,":[75],"such":[76],"tiered":[79],"system":[81],"meets":[82],"target":[84],"latency":[85,116,195],"requirements":[86],"both":[88],"prefill":[89],"decoding":[91],"while":[92,192],"improving":[93],"throughput":[94,183],"compared":[95,186],"to":[96,187],"in-memory":[98,189],"case,":[99,191],"where":[100],"all":[101],"model":[102,138,152],"weights":[103],"are":[104],"maintained":[105],"in":[106],"However,":[109],"even":[110],"NVLink-connected":[112],"memory,":[114],"meeting":[115,193],"constraints":[117],"like":[121],"70B":[123,168],"405B":[125,170],"FP16":[126],"remains":[128],"challenging.":[129],"To":[130],"address":[131],"this,":[132],"we":[133,173],"explore":[134],"efficacy":[136],"quantization":[139,153],"(e.g.,":[140],"AWQ)":[141],"along":[142],"execution.":[146],"Our":[147],"evaluation":[148],"reveals":[149],"that":[150,175],"makes":[154],"viable":[159],"solution":[160],"models.":[164],"AWQ":[171],"models,":[172],"show":[174],"achieves":[179],"1.6\u00d7":[180],"2.9\u00d7":[182],"improvement,":[184],"respectively,":[185],"only":[190],"constraint.":[196]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
