{"id":"https://openalex.org/W4411486169","doi":"https://doi.org/10.1145/3695053.3731051","title":"Hybe: GPU-NPU Hybrid System for Efficient LLM Inference with Million-Token Context Window","display_name":"Hybe: GPU-NPU Hybrid System for Efficient LLM Inference with Million-Token Context Window","publication_year":2025,"publication_date":"2025-06-20","ids":{"openalex":"https://openalex.org/W4411486169","doi":"https://doi.org/10.1145/3695053.3731051"},"language":"en","primary_location":{"id":"doi:10.1145/3695053.3731051","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3695053.3731051","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3695053.3731051","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 52nd Annual International Symposium on Computer Architecture","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3695053.3731051","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5030449068","display_name":"Seungjae Moon","orcid":"https://orcid.org/0009-0002-5924-7000"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Seungjae Moon","raw_affiliation_strings":["HyperAccel, Seoul, Republic of Korea"],"raw_orcid":"https://orcid.org/0009-0002-5924-7000","affiliations":[{"raw_affiliation_string":"HyperAccel, Seoul, Republic of Korea","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5092947137","display_name":"Junseo Cha","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Junseo Cha","raw_affiliation_strings":["HyperAccel, Seoul, Republic of Korea"],"raw_orcid":"https://orcid.org/0009-0004-4415-8552","affiliations":[{"raw_affiliation_string":"HyperAccel, Seoul, Republic of Korea","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5083697754","display_name":"Hyunjun Park","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hyunjun Park","raw_affiliation_strings":["HyperAccel, Seoul, Republic of Korea"],"raw_orcid":"https://orcid.org/0009-0001-4036-7590","affiliations":[{"raw_affiliation_string":"HyperAccel, Seoul, Republic of Korea","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100447377","display_name":"Joo-Young Kim","orcid":"https://orcid.org/0000-0003-1099-1496"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Joo-Young Kim","raw_affiliation_strings":["HyperAccel, Seoul, Republic of Korea"],"raw_orcid":"https://orcid.org/0000-0003-1099-1496","affiliations":[{"raw_affiliation_string":"HyperAccel, Seoul, Republic of Korea","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5030449068"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":6.5198,"has_fulltext":true,"cited_by_count":3,"citation_normalized_percentile":{"value":0.9622585,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":95,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"808","last_page":"820"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11269","display_name":"Algorithms and Data Compression","score":0.9835000038146973,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11269","display_name":"Algorithms and Data Compression","score":0.9835000038146973,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12326","display_name":"Network Packet Processing and Optimization","score":0.9830999970436096,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9690999984741211,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.7692082524299622},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7607978582382202},{"id":"https://openalex.org/keywords/window","display_name":"Window (computing)","score":0.7141740322113037},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.5815184712409973},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.5075458288192749},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.32621926069259644},{"id":"https://openalex.org/keywords/computer-network","display_name":"Computer network","score":0.261574923992157},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.2531431019306183},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.2107117772102356}],"concepts":[{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.7692082524299622},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7607978582382202},{"id":"https://openalex.org/C2778751112","wikidata":"https://www.wikidata.org/wiki/Q835016","display_name":"Window (computing)","level":2,"score":0.7141740322113037},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.5815184712409973},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.5075458288192749},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.32621926069259644},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.261574923992157},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.2531431019306183},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.2107117772102356},{"id":"https://openalex.org/C151730666","wikidata":"https://www.wikidata.org/wiki/Q7205","display_name":"Paleontology","level":1,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3695053.3731051","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3695053.3731051","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3695053.3731051","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 52nd Annual International Symposium on Computer Architecture","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3695053.3731051","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3695053.3731051","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3695053.3731051","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 52nd Annual International Symposium on Computer Architecture","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4411486169.pdf","grobid_xml":"https://content.openalex.org/works/W4411486169.grobid-xml"},"referenced_works_count":18,"referenced_works":["https://openalex.org/W2034861439","https://openalex.org/W2926767350","https://openalex.org/W3006732000","https://openalex.org/W3101708369","https://openalex.org/W3130554079","https://openalex.org/W4281708879","https://openalex.org/W4285225959","https://openalex.org/W4308083513","https://openalex.org/W4321636575","https://openalex.org/W4324292875","https://openalex.org/W4387321091","https://openalex.org/W4389518760","https://openalex.org/W4389524555","https://openalex.org/W4392427708","https://openalex.org/W4393578753","https://openalex.org/W4395073431","https://openalex.org/W4400409880","https://openalex.org/W4401211590"],"related_works":["https://openalex.org/W4388335561","https://openalex.org/W2970530566","https://openalex.org/W4288261899","https://openalex.org/W4307309205","https://openalex.org/W2967478618","https://openalex.org/W4385009901","https://openalex.org/W4385572700","https://openalex.org/W2997152889","https://openalex.org/W2390901981","https://openalex.org/W380079223"],"abstract_inverted_index":{"The":[0],"growth":[1],"of":[2,18,46,94],"context":[3,59,104,140,250,260],"window":[4,251],"size":[5],"in":[6,50,67,100,205,236],"large":[7,44,103],"language":[8],"model":[9,122],"(LLM)":[10],"inference":[11,29,40,136],"poses":[12],"a":[13,78,129,138,182],"very":[14],"distinct":[15],"computational":[16,25],"challenge":[17],"hardware":[19,65,175,220],"inefficiency.The":[20],"inefficiency":[21],"arises":[22],"from":[23,64,192],"the":[24,31,51,56,68,88,110,115,120,143,147,155,161,168,193,196,201,206],"imbalance":[26],"during":[27,154],"LLM":[28,135],"between":[30],"compute-intensive":[32],"prefill":[33,52,148],"stage,":[34,53,70],"and":[35,150,232,252],"memory-intensive":[36],"decode":[37,69,156],"stage.The":[38],"predominant":[39],"hardware,":[41],"GPU,":[42],"boasts":[43],"number":[45],"cores":[47],"to":[48,86,165,195,218,221],"excel":[49],"which":[54,71,198],"processes":[55],"entire":[57,121],"input":[58],"at":[60,77],"once,":[61],"but":[62],"suffers":[63],"underutilization":[66,89],"iteratively":[72],"generates":[73],"one":[74],"output":[75],"token":[76],"time.In":[79],"conventional":[80],"LLM,":[81],"batching":[82,97],"has":[83],"been":[84],"able":[85],"alleviate":[87],"by":[90],"generating":[91],"multiple":[92],"tokens":[93,108],"different":[95],"requests.However,":[96],"becomes":[98],"infeasible":[99],"models":[101],"with":[102,137,228,239,248,258,265],"windows":[105],"over":[106,262],"100K":[107],"because":[109],"Key-Value":[111],"(KV)":[112],"activations":[113],"dominate":[114],"physical":[116],"memory":[117,170,203],"capacity,":[118],"surpassing":[119],"size.In":[123],"this":[124],"paper,":[125],"we":[126],"propose":[127],"Hybe,":[128],"GPU-NPU":[130],"hybrid":[131],"system":[132],"for":[133,146,246,256],"efficient":[134],"million-token":[139],"window.Hybe":[141],"utilizes":[142,224],"preexisting":[144],"GPU":[145,194,227],"stage":[149],"employs":[151],"lightweight":[152],"NPUs":[153],"stage.Each":[157],"NPU":[158,235],"includes":[159],"only":[160],"necessary":[162],"computing":[163],"resources":[164],"fully":[166],"utilize":[167],"given":[169],"bandwidth,":[171],"thereby":[172],"achieving":[173],"maximum":[174],"efficiency.Furthermore,":[176],"Hybe":[177,208,234],"introduces":[178],"fine-grained":[179],"KV":[180,190,202],"transmission,":[181],"kernel":[183],"scheduling":[184],"method":[185],"that":[186,213],"immediately":[187],"offloads":[188],"partial":[189],"produced":[191],"NPU,":[197],"significantly":[199],"reduces":[200],"required":[204],"GPU.Lastly,":[207],"scheduler":[209],"applies":[210],"stage-wise":[211],"pipelining":[212],"dynamically":[214],"assigns":[215],"queued":[216],"requests":[217],"idle":[219],"minimize":[222],"stalls.Hybe":[223],"NVIDIA":[225],"H100":[226,263],"inference-optimized":[229],"vLLM":[230],"library":[231],"implement":[233],"4nm":[237],"process":[238],"equal":[240,266],"HBM":[241],"specification.Hybe":[242],"achieves":[243],"2.1":[244],"speedup":[245],"Phi-3":[247],"100K-token":[249],"3.9":[253],"energy":[254],"efficiency":[255],"Llama-3":[257],"1M-token":[259],"window,":[261],"GPUs":[264],"total":[267],"device":[268],"count.":[269]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":2}],"updated_date":"2026-03-11T06:11:40.159057","created_date":"2025-10-10T00:00:00"}
