{"id":"https://openalex.org/W7153261982","doi":"https://doi.org/10.48550/arxiv.2604.07888","title":"Bit-by-Bit: Progressive QAT Strategy with Outlier Channel Splitting for Stable Low-Bit LLMs","display_name":"Bit-by-Bit: Progressive QAT Strategy with Outlier Channel Splitting for Stable Low-Bit LLMs","publication_year":2026,"publication_date":"2026-04-09","ids":{"openalex":"https://openalex.org/W7153261982","doi":"https://doi.org/10.48550/arxiv.2604.07888"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.07888","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.07888","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.07888","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5133324708","display_name":"Binxing Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Xu, Binxing","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5050333667","display_name":"Hao Gu","orcid":"https://orcid.org/0000-0002-3110-2531"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gu, Hao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133345189","display_name":"Lujun Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Lujun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133316939","display_name":"Hao Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Hao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133381236","display_name":"Bei Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Bei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133329951","display_name":"Jiacheng Liu","orcid":"https://orcid.org/0009-0005-9184-9767"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Jiacheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133341892","display_name":"Qiyuan Zhu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhu, Qiyuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133362178","display_name":"Xintong Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Xintong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133327974","display_name":"Chao Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Chao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133393966","display_name":"Sirui Han","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Han, Sirui","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5133340694","display_name":"Yike Guo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guo, Yike","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":11,"corresponding_author_ids":["https://openalex.org/A5133324708"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.3296999931335449,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.3296999931335449,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12326","display_name":"Network Packet Processing and Optimization","score":0.07890000194311142,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10232","display_name":"Optical Network Technologies","score":0.06419999897480011,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/quantization","display_name":"Quantization (signal processing)","score":0.7321000099182129},{"id":"https://openalex.org/keywords/initialization","display_name":"Initialization","score":0.7286999821662903},{"id":"https://openalex.org/keywords/outlier","display_name":"Outlier","score":0.6833000183105469},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.5238000154495239},{"id":"https://openalex.org/keywords/anomaly-detection","display_name":"Anomaly detection","score":0.4964999854564667},{"id":"https://openalex.org/keywords/channel","display_name":"Channel (broadcasting)","score":0.46389999985694885},{"id":"https://openalex.org/keywords/vector-quantization","display_name":"Vector quantization","score":0.438400000333786}],"concepts":[{"id":"https://openalex.org/C28855332","wikidata":"https://www.wikidata.org/wiki/Q198099","display_name":"Quantization (signal processing)","level":2,"score":0.7321000099182129},{"id":"https://openalex.org/C114466953","wikidata":"https://www.wikidata.org/wiki/Q6034165","display_name":"Initialization","level":2,"score":0.7286999821662903},{"id":"https://openalex.org/C79337645","wikidata":"https://www.wikidata.org/wiki/Q779824","display_name":"Outlier","level":2,"score":0.6833000183105469},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6370000243186951},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.5238000154495239},{"id":"https://openalex.org/C739882","wikidata":"https://www.wikidata.org/wiki/Q3560506","display_name":"Anomaly detection","level":2,"score":0.4964999854564667},{"id":"https://openalex.org/C127162648","wikidata":"https://www.wikidata.org/wiki/Q16858953","display_name":"Channel (broadcasting)","level":2,"score":0.46389999985694885},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.45239999890327454},{"id":"https://openalex.org/C199833920","wikidata":"https://www.wikidata.org/wiki/Q612536","display_name":"Vector quantization","level":2,"score":0.438400000333786},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.39169999957084656},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.3905999958515167},{"id":"https://openalex.org/C2777303404","wikidata":"https://www.wikidata.org/wiki/Q759757","display_name":"Convergence (economics)","level":2,"score":0.37779998779296875},{"id":"https://openalex.org/C2779922397","wikidata":"https://www.wikidata.org/wiki/Q5014755","display_name":"CVAR","level":4,"score":0.37770000100135803},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.376800000667572},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3276999890804291},{"id":"https://openalex.org/C107673813","wikidata":"https://www.wikidata.org/wiki/Q812534","display_name":"Bayesian probability","level":2,"score":0.2912999987602234},{"id":"https://openalex.org/C84945661","wikidata":"https://www.wikidata.org/wiki/Q7366567","display_name":"Root cause","level":2,"score":0.2903999984264374},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.26420000195503235}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.07888","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.07888","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.07888","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.07888","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Training":[0],"LLMs":[1],"at":[2],"ultra-low":[3],"precision":[4,61],"remains":[5],"a":[6,41,80,88,174],"formidable":[7],"challenge.":[8],"Direct":[9],"low-bit":[10,69],"QAT":[11,43],"often":[12],"suffers":[13],"from":[14,25],"convergence":[15],"instability":[16],"and":[17,29,149,168],"substantial":[18],"training":[19,58],"costs,":[20],"exacerbated":[21],"by":[22,63],"quantization":[23,76,104],"noise":[24],"heavy-tailed":[26],"outlier":[27,46,99],"channels":[28],"error":[30,105],"accumulation":[31],"across":[32],"layers.":[33],"To":[34,134],"address":[35,135],"these":[36],"issues,":[37],"we":[38,118,142],"present":[39],"Bit-by-Bit,":[40],"progressive":[42,57],"framework":[44],"with":[45,122,131],"channel":[47,100],"splitting.":[48],"Our":[49],"approach":[50],"integrates":[51],"three":[52],"key":[53],"components:":[54],"(1)":[55],"block-wise":[56],"that":[59,112],"reduces":[60],"stage":[62],"stage,":[64],"ensuring":[65],"stable":[66],"initialization":[67],"for":[68,146],"optimization;":[70],"(2)":[71],"nested":[72],"structure":[73],"of":[74,138,176],"integer":[75],"grids":[77],"to":[78,91,154,182],"enable":[79],"\"train":[81],"once,":[82],"deploy":[83],"any":[84],"precision\"":[85],"paradigm,":[86],"allowing":[87],"single":[89],"model":[90],"support":[92],"multiple":[93],"bit-widths":[94],"without":[95],"retraining;":[96],"(3)":[97],"rounding-aware":[98],"splitting,":[101],"which":[102],"mitigates":[103],"while":[106],"acting":[107],"as":[108],"an":[109],"identity":[110],"transform":[111],"preserves":[113],"the":[114,136],"quantized":[115],"outputs.":[116],"Furthermore,":[117],"follow":[119],"microscaling":[120],"groups":[121],"E4M3":[123],"scales,":[124],"capturing":[125],"dynamic":[126],"activation":[127],"ranges":[128],"in":[129],"alignment":[130],"OCP/NVIDIA":[132],"standards.":[133],"lack":[137],"efficient":[139],"2-bit":[140],"kernels,":[141],"developed":[143],"custom":[144],"operators":[145],"both":[147,171],"W2A2":[148,160],"W2A16":[150],"configurations,":[151],"achieving":[152,173],"up":[153],"11$\\times$":[155],"speedup":[156],"over":[157],"BF16.":[158],"Under":[159],"settings,":[161],"Bit-by-Bit":[162],"significantly":[163],"outperforms":[164],"baselines":[165],"like":[166],"BitDistiller":[167],"EfficientQAT":[169],"on":[170],"Llama2/3,":[172],"loss":[175],"only":[177],"2.25":[178],"WikiText2":[179],"PPL":[180],"compared":[181],"full-precision":[183],"models.":[184]},"counts_by_year":[],"updated_date":"2026-04-11T06:19:08.300824","created_date":"2026-04-11T00:00:00"}
