{"id":"https://openalex.org/W7131450292","doi":"https://doi.org/10.48550/arxiv.2602.18755","title":"BiScale: Energy-Efficient Disaggregated LLM Serving via Phase-Aware Placement and DVFS","display_name":"BiScale: Energy-Efficient Disaggregated LLM Serving via Phase-Aware Placement and DVFS","publication_year":2026,"publication_date":"2026-02-21","ids":{"openalex":"https://openalex.org/W7131450292","doi":"https://doi.org/10.48550/arxiv.2602.18755"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2602.18755","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.18755","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2602.18755","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5126664874","display_name":"Omar Basit","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Basit, Omar","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5053383331","display_name":"Yibo Liu","orcid":"https://orcid.org/0000-0003-4355-5527"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Yunzhao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126850648","display_name":"Z. Jonny Kong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kong, Z. Jonny","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5126811372","display_name":"Y. Charlie Hu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hu, Y. Charlie","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5126664874"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.6061999797821045,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.6061999797821045,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.15700000524520874,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.061500001698732376,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.5673999786376953},{"id":"https://openalex.org/keywords/queue","display_name":"Queue","score":0.5320000052452087},{"id":"https://openalex.org/keywords/workload","display_name":"Workload","score":0.5309000015258789},{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.49480000138282776},{"id":"https://openalex.org/keywords/provisioning","display_name":"Provisioning","score":0.448199987411499},{"id":"https://openalex.org/keywords/concurrency","display_name":"Concurrency","score":0.41440001130104065},{"id":"https://openalex.org/keywords/energy","display_name":"Energy (signal processing)","score":0.4059999883174896},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.4009000062942505},{"id":"https://openalex.org/keywords/energy-consumption","display_name":"Energy consumption","score":0.3752000033855438},{"id":"https://openalex.org/keywords/asynchronous-communication","display_name":"Asynchronous communication","score":0.36880001425743103}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7824000120162964},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.5673999786376953},{"id":"https://openalex.org/C160403385","wikidata":"https://www.wikidata.org/wiki/Q220543","display_name":"Queue","level":2,"score":0.5320000052452087},{"id":"https://openalex.org/C2778476105","wikidata":"https://www.wikidata.org/wiki/Q628539","display_name":"Workload","level":2,"score":0.5309000015258789},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.49480000138282776},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.4864000082015991},{"id":"https://openalex.org/C172191483","wikidata":"https://www.wikidata.org/wiki/Q1071806","display_name":"Provisioning","level":2,"score":0.448199987411499},{"id":"https://openalex.org/C193702766","wikidata":"https://www.wikidata.org/wiki/Q1414548","display_name":"Concurrency","level":2,"score":0.41440001130104065},{"id":"https://openalex.org/C186370098","wikidata":"https://www.wikidata.org/wiki/Q442787","display_name":"Energy (signal processing)","level":2,"score":0.4059999883174896},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.4009000062942505},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.39579999446868896},{"id":"https://openalex.org/C2780165032","wikidata":"https://www.wikidata.org/wiki/Q16869822","display_name":"Energy consumption","level":2,"score":0.3752000033855438},{"id":"https://openalex.org/C151319957","wikidata":"https://www.wikidata.org/wiki/Q752739","display_name":"Asynchronous communication","level":2,"score":0.36880001425743103},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.34950000047683716},{"id":"https://openalex.org/C157742956","wikidata":"https://www.wikidata.org/wiki/Q3237776","display_name":"Frequency scaling","level":3,"score":0.34709998965263367},{"id":"https://openalex.org/C93682380","wikidata":"https://www.wikidata.org/wiki/Q2025226","display_name":"Static timing analysis","level":2,"score":0.3449999988079071},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.34369999170303345},{"id":"https://openalex.org/C2742236","wikidata":"https://www.wikidata.org/wiki/Q924713","display_name":"Efficient energy use","level":2,"score":0.3303000032901764},{"id":"https://openalex.org/C96324660","wikidata":"https://www.wikidata.org/wiki/Q205446","display_name":"Dataflow","level":2,"score":0.3285999894142151},{"id":"https://openalex.org/C12725497","wikidata":"https://www.wikidata.org/wiki/Q810247","display_name":"Baseline (sea)","level":2,"score":0.322299987077713},{"id":"https://openalex.org/C163258240","wikidata":"https://www.wikidata.org/wiki/Q25342","display_name":"Power (physics)","level":2,"score":0.31360000371932983},{"id":"https://openalex.org/C22684755","wikidata":"https://www.wikidata.org/wiki/Q847526","display_name":"Queueing theory","level":2,"score":0.3084999918937683},{"id":"https://openalex.org/C164866538","wikidata":"https://www.wikidata.org/wiki/Q367351","display_name":"Cluster (spacecraft)","level":2,"score":0.296999990940094},{"id":"https://openalex.org/C2775924081","wikidata":"https://www.wikidata.org/wiki/Q55608371","display_name":"Control (management)","level":2,"score":0.29429998993873596},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.2922999858856201},{"id":"https://openalex.org/C2778562939","wikidata":"https://www.wikidata.org/wiki/Q1298791","display_name":"Synchronization (alternating current)","level":3,"score":0.29190000891685486},{"id":"https://openalex.org/C32022120","wikidata":"https://www.wikidata.org/wiki/Q797225","display_name":"Interference (communication)","level":3,"score":0.2883000075817108},{"id":"https://openalex.org/C41045048","wikidata":"https://www.wikidata.org/wiki/Q202843","display_name":"Linear programming","level":2,"score":0.2768000066280365},{"id":"https://openalex.org/C79974875","wikidata":"https://www.wikidata.org/wiki/Q483639","display_name":"Cloud computing","level":2,"score":0.2743000090122223},{"id":"https://openalex.org/C2778234956","wikidata":"https://www.wikidata.org/wiki/Q4683813","display_name":"Admission control","level":3,"score":0.2727999985218048},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.2685999870300293},{"id":"https://openalex.org/C34165917","wikidata":"https://www.wikidata.org/wiki/Q188267","display_name":"Programming paradigm","level":2,"score":0.2669000029563904},{"id":"https://openalex.org/C139807058","wikidata":"https://www.wikidata.org/wiki/Q352374","display_name":"Adaptation (eye)","level":2,"score":0.26460000872612},{"id":"https://openalex.org/C2778774385","wikidata":"https://www.wikidata.org/wiki/Q4437810","display_name":"Power management","level":3,"score":0.26339998841285706},{"id":"https://openalex.org/C12713177","wikidata":"https://www.wikidata.org/wiki/Q1900281","display_name":"Perspective (graphical)","level":2,"score":0.26179999113082886},{"id":"https://openalex.org/C204323151","wikidata":"https://www.wikidata.org/wiki/Q905424","display_name":"Range (aeronautics)","level":2,"score":0.25679999589920044}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2602.18755","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.18755","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2602.18755","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.18755","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"score":0.8933796286582947,"id":"https://metadata.un.org/sdg/7","display_name":"Affordable and clean energy"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Prefill/decode":[0],"disaggregation":[1,40],"is":[2,27,41],"increasingly":[3],"adopted":[4],"in":[5,177,181],"LLM":[6,21,63],"serving":[7,149,157],"to":[8,30,117,132,175,184],"improve":[9],"the":[10],"latency-throughput":[11],"tradeoff":[12],"and":[13,17,35,46,50,69,73,78,88,122,126,179],"meet":[14],"strict":[15,148],"TTFT":[16,124],"TPOT":[18],"SLOs.":[19,150],"However,":[20],"inference":[22],"remains":[23],"energy-hungry:":[24],"autoscaling":[25],"alone":[26],"too":[28],"coarse-grained":[29],"track":[31],"fast":[32],"workload":[33],"fluctuations,":[34],"applying":[36],"fine-grained":[37],"DVFS":[38,70],"under":[39],"complicated":[42],"by":[43,173],"phase-asymmetric":[44],"dynamics":[45],"coupling":[47],"between":[48],"provisioning":[49],"frequency":[51,105],"control.":[52],"We":[53],"present":[54],"BiScale,":[55],"a":[56,153],"two-tier":[57],"energy":[58,93,172],"optimization":[59],"framework":[60],"for":[61,115,119,130],"disaggregated":[62],"serving.":[64],"BiScale":[65,84,101,166],"jointly":[66],"optimizes":[67],"placement":[68,87],"across":[71,144],"prefill":[72,116,178],"decode":[74,131,182],"using":[75,108],"predictive":[76,112],"latency":[77],"power":[79],"models.":[80],"At":[81,98],"coarse":[82],"timescales,":[83,100],"computes":[85],"phase-aware":[86],"baseline":[89],"frequencies":[90],"that":[91,165],"minimize":[92],"while":[94,146,170],"satisfying":[95],"SLO":[96],"constraints.":[97],"fine":[99],"dynamically":[102],"adapts":[103],"GPU":[104],"per":[106],"iteration":[107],"stage-specific":[109],"control:":[110],"model":[111],"control":[113,143],"(MPC)":[114],"account":[118],"queue":[120],"evolution":[121],"future":[123],"impact,":[125],"lightweight":[127],"slack-aware":[128],"adaptation":[129],"exploit":[133],"its":[134],"smoother,":[135],"memory-bound":[136],"dynamics.":[137],"This":[138],"hierarchical":[139],"design":[140],"enables":[141],"coordinated":[142],"timescales":[145],"preserving":[147],"Evaluation":[151],"on":[152],"16x":[154],"H100":[155],"cluster":[156],"Llama":[158],"3.3":[159],"70B":[160],"with":[161],"production-style":[162],"traces":[163],"shows":[164],"meets":[167],"TTFT/TPOT":[168],"SLOs":[169],"reducing":[171],"up":[174],"39%":[176],"48%":[180],"relative":[183],"DistServe.":[185]},"counts_by_year":[],"updated_date":"2026-02-26T06:34:08.959763","created_date":"2026-02-26T00:00:00"}
