{"id":"https://openalex.org/W7162019306","doi":"https://doi.org/10.48550/arxiv.2605.20315","title":"Mix-Quant: Quantized Prefilling, Precise Decoding for Agentic LLMs","display_name":"Mix-Quant: Quantized Prefilling, Precise Decoding for Agentic LLMs","publication_year":2026,"publication_date":"2026-05-19","ids":{"openalex":"https://openalex.org/W7162019306","doi":"https://doi.org/10.48550/arxiv.2605.20315"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.20315","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.20315","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.20315","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5082645162","display_name":"Haiquan Lu","orcid":"https://orcid.org/0000-0001-6570-7208"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lu, Haiquan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136647215","display_name":"Zigeng Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Zigeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136667027","display_name":"Gongfan Fang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fang, Gongfan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136613247","display_name":"Xinyin Ma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ma, Xinyin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5136704478","display_name":"Xinchao Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Xinchao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.07199999690055847,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.07199999690055847,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.05530000105500221,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.053700000047683716,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/quantization","display_name":"Quantization (signal processing)","score":0.7283999919891357},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.6230000257492065},{"id":"https://openalex.org/keywords/bottleneck","display_name":"Bottleneck","score":0.5738999843597412},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.5666999816894531},{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.46129998564720154},{"id":"https://openalex.org/keywords/workflow","display_name":"Workflow","score":0.4049000144004822}],"concepts":[{"id":"https://openalex.org/C28855332","wikidata":"https://www.wikidata.org/wiki/Q198099","display_name":"Quantization (signal processing)","level":2,"score":0.7283999919891357},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6753000020980835},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.6230000257492065},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.5738999843597412},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.5666999816894531},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.46129998564720154},{"id":"https://openalex.org/C177212765","wikidata":"https://www.wikidata.org/wiki/Q627335","display_name":"Workflow","level":2,"score":0.4049000144004822},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.387800008058548},{"id":"https://openalex.org/C152124472","wikidata":"https://www.wikidata.org/wiki/Q1204361","display_name":"Redundancy (engineering)","level":2,"score":0.36649999022483826},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.36230000853538513},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.32030001282691956},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.29170000553131104},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.28369998931884766},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.27140000462532043},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.2583000063896179}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.20315","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.20315","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"Preprint"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.20315","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.20315","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"LLM":[0,67,148],"agents":[1],"have":[2],"recently":[3],"emerged":[4],"as":[5],"a":[6,36,49,172],"powerful":[7],"paradigm":[8],"for":[9,56,124],"solving":[10],"complex":[11],"tasks":[12],"through":[13],"planning,":[14],"tool":[15],"use,":[16],"memory":[17],"retrieval,":[18],"and":[19,51,69,91,154],"multi-step":[20],"interaction.":[21],"However,":[22],"these":[23],"agentic":[24,58,66,155],"workflows":[25,68],"often":[26],"introduce":[27],"substantial":[28,88],"input-side":[29],"overhead,":[30],"making":[31],"the":[32,73,84,102,117,144],"compute-intensive":[33],"prefilling":[34,85,118,128],"stage":[35,86],"key":[37],"bottleneck":[38,146],"in":[39,65,147],"long-context,":[40],"multi-turn":[41],"inference.":[42,59],"In":[43,82],"this":[44,109],"work,":[45],"we":[46,111],"propose":[47],"Mix-Quant,":[48],"simple":[50],"effective":[52],"phase-aware":[53,135],"quantization":[54,64,89,115,137],"framework":[55],"fast":[57],"We":[60],"first":[61],"investigate":[62],"FP4":[63],"observe":[70],"that":[71,158],"quantizing":[72],"entire":[74],"inference":[75,145],"process":[76],"can":[77,92],"incur":[78],"significant":[79,166],"performance":[80,163],"degradation.":[81],"contrast,":[83],"exhibits":[87],"redundancy":[90],"therefore":[93],"be":[94],"quantized":[95],"with":[96,138],"minimal":[97],"accuracy":[98],"loss,":[99],"despite":[100],"being":[101],"dominant":[103],"source":[104],"of":[105],"computation.":[106],"Based":[107],"on":[108],"insight,":[110],"apply":[112],"high-throughput":[113],"NVFP4":[114,140],"to":[116,142,171],"phase":[119],"while":[120,164],"preserving":[121],"BF16":[122],"precision":[123],"decoding.":[125],"By":[126],"decoupling":[127],"acceleration":[129],"from":[130],"decoding":[131],"quality,":[132],"Mix-Quant":[133,159],"combines":[134],"algorithmic":[136],"hardware-efficient":[139],"execution":[141],"alleviate":[143],"agents.":[149],"Extensive":[150],"experiments":[151],"across":[152],"long-context":[153],"benchmarks":[156],"demonstrate":[157],"largely":[160],"preserves":[161],"task":[162],"delivering":[165],"efficiency":[167],"improvements,":[168],"achieving":[169],"up":[170],"3x":[173],"speedup":[174],"during":[175],"prefilling.":[176]},"counts_by_year":[],"updated_date":"2026-07-01T06:00:48.157686","created_date":"2026-05-22T00:00:00"}
