{"id":"https://openalex.org/W7133333059","doi":"https://doi.org/10.48550/arxiv.2603.00040","title":"Attn-QAT: 4-Bit Attention With Quantization-Aware Training","display_name":"Attn-QAT: 4-Bit Attention With Quantization-Aware Training","publication_year":2026,"publication_date":"2026-02-09","ids":{"openalex":"https://openalex.org/W7133333059","doi":"https://doi.org/10.48550/arxiv.2603.00040"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2603.00040","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5001609288","display_name":"Peiyuan Zhang","orcid":"https://orcid.org/0000-0003-4086-3436"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zhang, Peiyuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5079083596","display_name":"Matthew Noto","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Noto, Matthew","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127916529","display_name":"Wenxuan Tan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tan, Wenxuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5021566476","display_name":"Chengquan Jiang","orcid":"https://orcid.org/0009-0004-9356-6034"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiang, Chengquan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5122971250","display_name":"Will Lin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lin, Will","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128034810","display_name":"Wei Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Wei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5127891865","display_name":"Hao Zhang (15339)","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Hao","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5001609288"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.40779998898506165,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.40779998898506165,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11605","display_name":"Visual Attention and Saliency Detection","score":0.09960000216960907,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.06949999928474426,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.6046000123023987},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.5584999918937683},{"id":"https://openalex.org/keywords/obstacle","display_name":"Obstacle","score":0.557699978351593},{"id":"https://openalex.org/keywords/heuristics","display_name":"Heuristics","score":0.546999990940094},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.5422999858856201},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.5378999710083008},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.5196999907493591}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7034000158309937},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.6046000123023987},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.5584999918937683},{"id":"https://openalex.org/C2776650193","wikidata":"https://www.wikidata.org/wiki/Q264661","display_name":"Obstacle","level":2,"score":0.557699978351593},{"id":"https://openalex.org/C127705205","wikidata":"https://www.wikidata.org/wiki/Q5748245","display_name":"Heuristics","level":2,"score":0.546999990940094},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.5422999858856201},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.5378999710083008},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.5196999907493591},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5148000121116638},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.5127999782562256},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.42660000920295715},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.4185999929904938},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.4180999994277954},{"id":"https://openalex.org/C204323151","wikidata":"https://www.wikidata.org/wiki/Q905424","display_name":"Range (aeronautics)","level":2,"score":0.3521000146865845},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.29420000314712524},{"id":"https://openalex.org/C153258448","wikidata":"https://www.wikidata.org/wiki/Q1199743","display_name":"Gradient descent","level":3,"score":0.27549999952316284},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.2524000108242035}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2603.00040","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2603.00040","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.00040","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2603.00040","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Achieving":[0],"reliable":[1],"4-bit":[2,39],"attention":[3,16,83,131],"is":[4],"a":[5,58,145],"prerequisite":[6],"for":[7,43,74,111],"end-to-end":[8],"FP4":[9,54,76,116,130,139],"computation":[10],"on":[11,100,148],"emerging":[12],"FP4-capable":[13],"GPUs,":[14],"yet":[15],"remains":[17],"the":[18,34,86,126],"main":[19],"obstacle":[20],"due":[21],"to":[22,66,144],"FP4's":[23],"tiny":[24],"dynamic":[25],"range":[26],"and":[27,89,106,121,141],"attention's":[28],"heavy-tailed":[29],"activations.":[30],"This":[31],"paper":[32],"presents":[33],"first":[35],"systematic":[36],"study":[37],"of":[38,82],"quantization-aware":[40],"training":[41,67,112],"(QAT)":[42],"attention.":[44],"We":[45,69],"find":[46],"that":[47],"\"drop-in\"":[48],"QAT,":[49],"which":[50],"naively":[51],"combines":[52],"an":[53,149],"forward":[55],"pass":[56],"with":[57],"high-precision":[59],"Flash":[60],"Attention":[61],"(FA)-style":[62],"backward":[63,87],"pass,":[64,88],"leads":[65],"instability.":[68],"identify":[70],"two":[71],"key":[72],"principles":[73],"stable":[75],"attention:":[77],"(1)":[78],"matching":[79],"low-precision":[80],"recomputation":[81],"scores":[84],"in":[85,95,137],"(2)":[90],"resolving":[91],"implicit":[92],"precision":[93],"assumptions":[94],"FA's":[96],"gradient":[97],"calculation.":[98],"Based":[99],"these":[101],"insights,":[102],"we":[103],"propose":[104],"Attn-QAT":[105,124],"implement":[107],"fused":[108],"Triton":[109],"kernels":[110],"as":[113,115],"well":[114],"inference":[117],"kernels.":[118],"Across":[119],"diffusion":[120],"language":[122],"models,":[123],"recovers":[125],"quality":[127],"drop":[128],"from":[129],"without":[132],"explicit":[133],"outlier-mitigation":[134],"heuristics":[135],"used":[136],"prior":[138],"attention,":[140],"delivers":[142],"up":[143],"1.5x":[146],"speedup":[147],"RTX":[150],"5090.":[151],"Video":[152],"demos":[153],"can":[154],"be":[155],"found":[156],"at":[157],"https://drive.google.com/drive/folders/190F6xbBDUF2kGQYIcXBt3ehSYij5jlim?usp=sharing.":[158]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-03-04T00:00:00"}
