{"id":"https://openalex.org/W4416963479","doi":"https://doi.org/10.1109/islped65674.2025.11261754","title":"Faster Ternary and Binary Neural Network Inference on CPU by Reducing Popcount Overhead","display_name":"Faster Ternary and Binary Neural Network Inference on CPU by Reducing Popcount Overhead","publication_year":2025,"publication_date":"2025-08-06","ids":{"openalex":"https://openalex.org/W4416963479","doi":"https://doi.org/10.1109/islped65674.2025.11261754"},"language":null,"primary_location":{"id":"doi:10.1109/islped65674.2025.11261754","is_oa":false,"landing_page_url":"https://doi.org/10.1109/islped65674.2025.11261754","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/ACM International Symposium on Low Power Electronics and Design (ISLPED)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5112138421","display_name":"Olivier Fischer","orcid":null},"institutions":[{"id":"https://openalex.org/I4210123994","display_name":"SystemsX.ch","ror":"https://ror.org/02s09gf16","country_code":"CH","type":"facility","lineage":["https://openalex.org/I4210123994"]},{"id":"https://openalex.org/I35440088","display_name":"ETH Zurich","ror":"https://ror.org/05a28rw58","country_code":"CH","type":"education","lineage":["https://openalex.org/I2799323385","https://openalex.org/I35440088"]}],"countries":["CH"],"is_corresponding":true,"raw_author_name":"Olivier Fischer","raw_affiliation_strings":["ETH Zurich,Systems Group,Zurich,Switzerland"],"affiliations":[{"raw_affiliation_string":"ETH Zurich,Systems Group,Zurich,Switzerland","institution_ids":["https://openalex.org/I35440088","https://openalex.org/I4210123994"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5087479741","display_name":"Shien Zhu","orcid":"https://orcid.org/0000-0002-2094-7643"},"institutions":[{"id":"https://openalex.org/I4210123994","display_name":"SystemsX.ch","ror":"https://ror.org/02s09gf16","country_code":"CH","type":"facility","lineage":["https://openalex.org/I4210123994"]},{"id":"https://openalex.org/I35440088","display_name":"ETH Zurich","ror":"https://ror.org/05a28rw58","country_code":"CH","type":"education","lineage":["https://openalex.org/I2799323385","https://openalex.org/I35440088"]}],"countries":["CH"],"is_corresponding":false,"raw_author_name":"Shien Zhu","raw_affiliation_strings":["ETH Zurich,Systems Group,Zurich,Switzerland"],"affiliations":[{"raw_affiliation_string":"ETH Zurich,Systems Group,Zurich,Switzerland","institution_ids":["https://openalex.org/I35440088","https://openalex.org/I4210123994"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5103144919","display_name":"Gustavo Alonso","orcid":"https://orcid.org/0000-0002-4396-6695"},"institutions":[{"id":"https://openalex.org/I4210123994","display_name":"SystemsX.ch","ror":"https://ror.org/02s09gf16","country_code":"CH","type":"facility","lineage":["https://openalex.org/I4210123994"]},{"id":"https://openalex.org/I35440088","display_name":"ETH Zurich","ror":"https://ror.org/05a28rw58","country_code":"CH","type":"education","lineage":["https://openalex.org/I2799323385","https://openalex.org/I35440088"]}],"countries":["CH"],"is_corresponding":false,"raw_author_name":"Gustavo Alonso","raw_affiliation_strings":["ETH Zurich,Systems Group,Zurich,Switzerland"],"affiliations":[{"raw_affiliation_string":"ETH Zurich,Systems Group,Zurich,Switzerland","institution_ids":["https://openalex.org/I35440088","https://openalex.org/I4210123994"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5112138421"],"corresponding_institution_ids":["https://openalex.org/I35440088","https://openalex.org/I4210123994"],"apc_list":null,"apc_paid":null,"fwci":1.1715,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.8500329,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"7"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.8382999897003174,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.8382999897003174,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12676","display_name":"Machine Learning and ELM","score":0.025100000202655792,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10320","display_name":"Neural Networks and Applications","score":0.01269999984651804,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/bitwise-operation","display_name":"Bitwise operation","score":0.8654000163078308},{"id":"https://openalex.org/keywords/simd","display_name":"SIMD","score":0.5903000235557556},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.5684999823570251},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.5497999787330627},{"id":"https://openalex.org/keywords/matrix-multiplication","display_name":"Matrix multiplication","score":0.46380001306533813},{"id":"https://openalex.org/keywords/throughput","display_name":"Throughput","score":0.4528999924659729},{"id":"https://openalex.org/keywords/overhead","display_name":"Overhead (engineering)","score":0.4296000003814697},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.41909998655319214},{"id":"https://openalex.org/keywords/multiplication","display_name":"Multiplication (music)","score":0.4009000062942505}],"concepts":[{"id":"https://openalex.org/C134765980","wikidata":"https://www.wikidata.org/wiki/Q879126","display_name":"Bitwise operation","level":2,"score":0.8654000163078308},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8392999768257141},{"id":"https://openalex.org/C150552126","wikidata":"https://www.wikidata.org/wiki/Q339387","display_name":"SIMD","level":2,"score":0.5903000235557556},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.5684999823570251},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.5497999787330627},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.49729999899864197},{"id":"https://openalex.org/C17349429","wikidata":"https://www.wikidata.org/wiki/Q1049914","display_name":"Matrix multiplication","level":3,"score":0.46380001306533813},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.4528999924659729},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.44279998540878296},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.4296000003814697},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.41909998655319214},{"id":"https://openalex.org/C2780595030","wikidata":"https://www.wikidata.org/wiki/Q3860309","display_name":"Multiplication (music)","level":2,"score":0.4009000062942505},{"id":"https://openalex.org/C28855332","wikidata":"https://www.wikidata.org/wiki/Q198099","display_name":"Quantization (signal processing)","level":2,"score":0.35120001435279846},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.33000001311302185},{"id":"https://openalex.org/C43364308","wikidata":"https://www.wikidata.org/wiki/Q8799","display_name":"Byte","level":2,"score":0.32679998874664307},{"id":"https://openalex.org/C11799548","wikidata":"https://www.wikidata.org/wiki/Q6675847","display_name":"Loop tiling","level":3,"score":0.32440000772476196},{"id":"https://openalex.org/C3826847","wikidata":"https://www.wikidata.org/wiki/Q188768","display_name":"FLOPS","level":2,"score":0.32350000739097595},{"id":"https://openalex.org/C86251818","wikidata":"https://www.wikidata.org/wiki/Q816754","display_name":"Benchmarking","level":2,"score":0.29030001163482666},{"id":"https://openalex.org/C2777210771","wikidata":"https://www.wikidata.org/wiki/Q4927124","display_name":"Block (permutation group theory)","level":2,"score":0.28040000796318054},{"id":"https://openalex.org/C17020691","wikidata":"https://www.wikidata.org/wiki/Q139677","display_name":"Operator (biology)","level":5,"score":0.2759999930858612},{"id":"https://openalex.org/C48372109","wikidata":"https://www.wikidata.org/wiki/Q3913","display_name":"Binary number","level":2,"score":0.26739999651908875},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.2623000144958496},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.25589999556541443},{"id":"https://openalex.org/C185529760","wikidata":"https://www.wikidata.org/wiki/Q164307","display_name":"Binary operation","level":2,"score":0.2549999952316284},{"id":"https://openalex.org/C2776445388","wikidata":"https://www.wikidata.org/wiki/Q6716998","display_name":"MODTRAN","level":3,"score":0.25270000100135803},{"id":"https://openalex.org/C77390884","wikidata":"https://www.wikidata.org/wiki/Q217302","display_name":"Application-specific integrated circuit","level":2,"score":0.2524000108242035}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/islped65674.2025.11261754","is_oa":false,"landing_page_url":"https://doi.org/10.1109/islped65674.2025.11261754","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/ACM International Symposium on Low Power Electronics and Design (ISLPED)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":17,"referenced_works":["https://openalex.org/W2555913832","https://openalex.org/W2618939455","https://openalex.org/W2886991855","https://openalex.org/W2887447938","https://openalex.org/W2896934366","https://openalex.org/W2982165597","https://openalex.org/W2997394738","https://openalex.org/W3034933748","https://openalex.org/W3035232708","https://openalex.org/W3093035500","https://openalex.org/W3128120039","https://openalex.org/W3135547017","https://openalex.org/W3137147200","https://openalex.org/W3173877717","https://openalex.org/W4210486199","https://openalex.org/W4214539264","https://openalex.org/W4404893117"],"related_works":[],"abstract_inverted_index":{"Quantization":[0],"is":[1,232],"a":[2,125,204],"widely":[3],"adopted":[4],"method":[5,128],"of":[6,10,95,107,250],"reducing":[7],"resource":[8],"consumption":[9],"neural":[11],"network":[12],"models":[13],"while":[14],"maintaining":[15],"good":[16],"model":[17],"accuracy.":[18],"Ternary":[19],"and":[20,25,34,55,61,70,110,132,136,151,182,196,213,220,256,266,274,284],"Binary":[21],"Neural":[22,58],"Networks":[23,59],"(TNNs":[24],"BNNs)":[26],"can":[27],"be":[28],"implemented":[29],"by":[30,146,172,194],"lightweight":[31],"bitwise":[32,47,99,166],"operations":[33,97],"are":[35],"thus":[36],"very":[37],"suitable":[38],"for":[39,50,91,129,170,210,263],"edge":[40],"platforms.":[41],"Existing":[42],"efforts":[43],"mainly":[44],"optimize":[45,141],"the":[46,78,88,103,117,142,148,158,174,190,238,261],"computation":[48],"algorithms":[49],"BNN":[51],"inference.":[52],"However,":[53],"TNNs":[54],"mixed-precision":[56],"Ternary-Binary":[57],"(TBNs":[60],"BTNs)":[62],"still":[63],"lack":[64],"optimized":[65],"computing":[66],"libraries":[67],"on":[68,134,242,254,258,281],"AVX2":[69,135,171,243,255],"ARM":[71,137,259],"CPUs.":[72,138,244],"Their":[73],"data":[74,79,85,143,159,185],"preparation":[75,144],"walks":[76],"through":[77],"multiple":[80],"times,":[81],"resulting":[82],"in":[83,98,114],"low":[84],"locality.":[86,160],"Moreover,":[87,268],"popcount":[89,104,176],"accounts":[90],"up":[92,233,251],"to":[93,156,234,252,279],"28%":[94],"total":[96,191],"matrix":[100,167,229],"multiplication,":[101],"but":[102],"has":[105],"throughput":[106],"only":[108],"1":[109],"no":[111],"SIMD":[112,180],"instructions":[113,177,181],"AVX2,":[115],"becoming":[116],"central":[118],"performance":[119],"bottleneck.In":[120],"this":[121],"paper,":[122],"we":[123,140,162,202,269],"propose":[124,163],"faster":[126,236],"inference":[127],"TNNs,":[130,211,264],"TBNs,":[131,212,265],"BTNs":[133,214],"First,":[139],"stage":[145],"fusing":[147],"quantization,":[149],"bit-packing,":[150],"image-to-row":[152],"into":[153],"one":[154],"loop":[155,221],"improve":[157],"Second,":[161],"an":[164],"efficient":[165],"multiplication":[168,230],"algorithm":[169,188,231],"replacing":[173],"low-throughput":[175],"with":[178,215],"high-throughput":[179],"applying":[183],"new":[184,228],"encoding.":[186],"This":[187],"reduces":[189],"instruction":[192],"count":[193],"15%":[195],"brings":[197],"2.2\u00d7":[198],"theoretical":[199],"speedup.":[200],"Third,":[201],"implement":[203],"fast":[205],"C++":[206],"inferen":[207],"ce":[208],"library":[209],"standard":[216],"optimizations":[217],"like":[218],"blocking":[219],"unrolling.":[222],"Benchmarking":[223],"results":[224],"show":[225],"that":[226],"our":[227],"2.1\u00d7":[235],"than":[237],"related":[239],"work":[240],"TAB":[241,280],"We":[245],"further":[246],"achieve":[247,270],"layer-level":[248],"speedup":[249,273],"2.7\u00d7":[253],"2.3\u00d7":[257],"over":[260],"baseline":[262],"BTNs.":[267],"1.3-1.9\u00d7":[271],"end-to-end":[272],"1.2-1.8\u00d7":[275],"energy":[276],"efficiency":[277],"compared":[278],"Resnet,":[282],"Darknet,":[283],"VGG":[285],"models.":[286]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-04-18T07:56:08.524223","created_date":"2025-12-03T00:00:00"}
