{"id":"https://openalex.org/W7151339693","doi":"https://doi.org/10.48550/arxiv.2604.03957","title":"BWTA: Accurate and Efficient Binarized Transformer by Algorithm-Hardware Co-design","display_name":"BWTA: Accurate and Efficient Binarized Transformer by Algorithm-Hardware Co-design","publication_year":2026,"publication_date":"2026-04-05","ids":{"openalex":"https://openalex.org/W7151339693","doi":"https://doi.org/10.48550/arxiv.2604.03957"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.03957","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.03957","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.03957","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5133141436","display_name":"Yifu Ding","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ding, Yifu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133099913","display_name":"Xianglong Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Xianglong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133081608","display_name":"Shenghao Jin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jin, Shenghao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133072057","display_name":"Jinyang Guo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guo, Jinyang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5103740674","display_name":"Jiwen Lu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lu, Jiwen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.41659998893737793,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.41659998893737793,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10901","display_name":"Advanced Data Compression Techniques","score":0.18119999766349792,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.06549999862909317,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.9097999930381775},{"id":"https://openalex.org/keywords/memory-footprint","display_name":"Memory footprint","score":0.7551000118255615},{"id":"https://openalex.org/keywords/quantization","display_name":"Quantization (signal processing)","score":0.6003000140190125},{"id":"https://openalex.org/keywords/cuda","display_name":"CUDA","score":0.4878000020980835},{"id":"https://openalex.org/keywords/binary-number","display_name":"Binary number","score":0.45680001378059387},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.4131999909877777},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.40610000491142273},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.39559999108314514},{"id":"https://openalex.org/keywords/kernel","display_name":"Kernel (algebra)","score":0.38909998536109924}],"concepts":[{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.9097999930381775},{"id":"https://openalex.org/C74912251","wikidata":"https://www.wikidata.org/wiki/Q6815727","display_name":"Memory footprint","level":2,"score":0.7551000118255615},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7268000245094299},{"id":"https://openalex.org/C28855332","wikidata":"https://www.wikidata.org/wiki/Q198099","display_name":"Quantization (signal processing)","level":2,"score":0.6003000140190125},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.5745000243186951},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.48890000581741333},{"id":"https://openalex.org/C2778119891","wikidata":"https://www.wikidata.org/wiki/Q477690","display_name":"CUDA","level":2,"score":0.4878000020980835},{"id":"https://openalex.org/C48372109","wikidata":"https://www.wikidata.org/wiki/Q3913","display_name":"Binary number","level":2,"score":0.45680001378059387},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.4131999909877777},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.40610000491142273},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.4011000096797943},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.39559999108314514},{"id":"https://openalex.org/C74193536","wikidata":"https://www.wikidata.org/wiki/Q574844","display_name":"Kernel (algebra)","level":2,"score":0.38909998536109924},{"id":"https://openalex.org/C43364308","wikidata":"https://www.wikidata.org/wiki/Q8799","display_name":"Byte","level":2,"score":0.3837999999523163},{"id":"https://openalex.org/C26713055","wikidata":"https://www.wikidata.org/wiki/Q245962","display_name":"Implementation","level":2,"score":0.33719998598098755},{"id":"https://openalex.org/C64452783","wikidata":"https://www.wikidata.org/wiki/Q1524945","display_name":"Ternary operation","level":2,"score":0.32350000739097595},{"id":"https://openalex.org/C459310","wikidata":"https://www.wikidata.org/wiki/Q117801","display_name":"Computational science","level":1,"score":0.31779998540878296},{"id":"https://openalex.org/C134835016","wikidata":"https://www.wikidata.org/wiki/Q690265","display_name":"Lookup table","level":2,"score":0.289900004863739},{"id":"https://openalex.org/C81081738","wikidata":"https://www.wikidata.org/wiki/Q55542","display_name":"Lossless compression","level":3,"score":0.28700000047683716},{"id":"https://openalex.org/C129844170","wikidata":"https://www.wikidata.org/wiki/Q41299","display_name":"Quadratic equation","level":2,"score":0.27480000257492065},{"id":"https://openalex.org/C100279451","wikidata":"https://www.wikidata.org/wiki/Q372193","display_name":"Perplexity","level":3,"score":0.27309998869895935},{"id":"https://openalex.org/C126780896","wikidata":"https://www.wikidata.org/wiki/Q899871","display_name":"Distortion (music)","level":4,"score":0.2727999985218048},{"id":"https://openalex.org/C42935608","wikidata":"https://www.wikidata.org/wiki/Q190411","display_name":"Field-programmable gate array","level":2,"score":0.2696000039577484},{"id":"https://openalex.org/C133095886","wikidata":"https://www.wikidata.org/wiki/Q1307173","display_name":"Single-precision floating-point format","level":3,"score":0.2694000005722046},{"id":"https://openalex.org/C2777767291","wikidata":"https://www.wikidata.org/wiki/Q1080291","display_name":"Sizing","level":2,"score":0.26080000400543213},{"id":"https://openalex.org/C34736171","wikidata":"https://www.wikidata.org/wiki/Q918333","display_name":"Preprocessor","level":2,"score":0.25929999351501465},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.2540999948978424}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.03957","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.03957","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"Preprint"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.03957","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.03957","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Ultra":[0],"low-bit":[1,53],"quantization":[2,39],"brings":[3],"substantial":[4],"efficiency":[5],"for":[6,96,115,138],"Transformer-based":[7],"models,":[8],"but":[9],"the":[10,49],"accuracy":[11,50,137],"degradation":[12],"and":[13,30,47,67,75,91,99,124,132,136,155],"limited":[14],"GPU":[15],"support":[16],"hinder":[17],"its":[18],"wide":[19],"usage.":[20],"In":[21,140],"this":[22],"paper,":[23],"we":[24,57,80],"analyze":[25],"zero-point":[26],"distortion":[27],"in":[28],"binarization":[29],"propose":[31,58],"a":[32,63,68,82],"Binary":[33],"Weights":[34],"&amp;":[35],"Ternary":[36],"Activations":[37],"(BWTA)":[38],"scheme,":[40],"which":[41],"projects":[42],"tiny":[43],"values":[44],"to":[45,72,145,157],"zero":[46],"preserves":[48],"of":[51],"extremely":[52],"models.":[54],"For":[55,78],"training,":[56],"Smooth":[59],"Multi-Stage":[60],"Quantization,":[61],"combining":[62],"Levelwise":[64],"Degradation":[65],"Strategy":[66],"Magnitude-Alignment":[69],"Projection":[70],"Factor":[71],"enable":[73],"stable":[74],"fast":[76],"convergence.":[77],"inference,":[79],"develop":[81],"BWTA":[83,111,173],"MatMul":[84,94],"CUDA":[85],"kernel":[86],"with":[87,117,163],"instruction-level":[88],"parallel":[89],"bit-packing":[90],"comprehensive":[92],"binary/ternary":[93],"implementations":[95],"both":[97],"linear":[98],"attention":[100],"operators,":[101],"allowing":[102],"seamless":[103],"integration":[104],"across":[105],"Transformer":[106],"architectures.":[107],"Experiments":[108],"show":[109],"that":[110],"approaches":[112],"full-precision":[113],"performance":[114],"BERT,":[116],"an":[118,170],"average":[119],"3.5%":[120],"drop":[121,128],"on":[122,129,152,167],"GLUE":[123],"less":[125],"than":[126],"2%":[127],"five":[130],"tasks,":[131],"achieves":[133],"comparable":[134],"perplexity":[135],"LLMs.":[139,168],"efficiency,":[141],"it":[142],"delivers":[143],"16":[144],"24":[146],"times":[147],"kernel-level":[148],"speedup":[149,162],"over":[150],"FP16":[151],"NVIDIA":[153],"GPUs,":[154],"216":[156],"330":[158],"tokens/s":[159],"end-to-end":[160],"prefill":[161],"lower":[164],"memory":[165],"footprint":[166],"As":[169],"algorithm-hardware":[171],"co-design,":[172],"demonstrates":[174],"practical,":[175],"low-latency":[176],"ultra-low-bit":[177],"inference":[178],"without":[179],"sacrificing":[180],"model":[181],"quality.":[182]},"counts_by_year":[],"updated_date":"2026-07-01T06:00:48.157686","created_date":"2026-04-08T00:00:00"}
