{"id":"https://openalex.org/W4415207016","doi":"https://doi.org/10.1145/3771939","title":"SIMD-CP: SIMD with Redundant Bits Compression and Mixed-Precision Packing for Quantized DNNs","display_name":"SIMD-CP: SIMD with Redundant Bits Compression and Mixed-Precision Packing for Quantized DNNs","publication_year":2025,"publication_date":"2025-10-15","ids":{"openalex":"https://openalex.org/W4415207016","doi":"https://doi.org/10.1145/3771939"},"language":"en","primary_location":{"id":"doi:10.1145/3771939","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3771939","pdf_url":null,"source":{"id":"https://openalex.org/S136160450","display_name":"ACM Transactions on Embedded Computing Systems","issn_l":"1539-9087","issn":["1539-9087","1558-3465"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Embedded Computing Systems","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://doi.org/10.1145/3771939","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5050351696","display_name":"Hayata Kaneko","orcid":null},"institutions":[{"id":"https://openalex.org/I135768898","display_name":"Ritsumeikan University","ror":"https://ror.org/0197nmd03","country_code":"JP","type":"education","lineage":["https://openalex.org/I135768898","https://openalex.org/I4390039241"]}],"countries":["JP"],"is_corresponding":true,"raw_author_name":"Hayata Kaneko","raw_affiliation_strings":["Ritsumeikan University College of Science and Engineering Graduate School of Science and Engineering"],"affiliations":[{"raw_affiliation_string":"Ritsumeikan University College of Science and Engineering Graduate School of Science and Engineering","institution_ids":["https://openalex.org/I135768898"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5053005436","display_name":"Ryuto Ishibashi","orcid":"https://orcid.org/0009-0000-3161-2200"},"institutions":[{"id":"https://openalex.org/I135768898","display_name":"Ritsumeikan University","ror":"https://ror.org/0197nmd03","country_code":"JP","type":"education","lineage":["https://openalex.org/I135768898","https://openalex.org/I4390039241"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Ryuto Ishibashi","raw_affiliation_strings":["Ritsumeikan University College of Science and Engineering Graduate School of Science and Engineering"],"affiliations":[{"raw_affiliation_string":"Ritsumeikan University College of Science and Engineering Graduate School of Science and Engineering","institution_ids":["https://openalex.org/I135768898"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5076579498","display_name":"Lin Meng","orcid":"https://orcid.org/0000-0003-4351-6923"},"institutions":[{"id":"https://openalex.org/I135768898","display_name":"Ritsumeikan University","ror":"https://ror.org/0197nmd03","country_code":"JP","type":"education","lineage":["https://openalex.org/I135768898","https://openalex.org/I4390039241"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Lin Meng","raw_affiliation_strings":["Ritsumeikan University College of Science and Engineering Graduate School of Science and Engineering"],"affiliations":[{"raw_affiliation_string":"Ritsumeikan University College of Science and Engineering Graduate School of Science and Engineering","institution_ids":["https://openalex.org/I135768898"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5050351696"],"corresponding_institution_ids":["https://openalex.org/I135768898"],"apc_list":null,"apc_paid":null,"fwci":1.2134,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.84178526,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":98},"biblio":{"volume":"25","issue":"1","first_page":"1","last_page":"20"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10522","display_name":"Medical Imaging Techniques and Applications","score":0.993399977684021,"subfield":{"id":"https://openalex.org/subfields/2741","display_name":"Radiology, Nuclear Medicine and Imaging"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.9930999875068665,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/simd","display_name":"SIMD","score":0.8271999955177307},{"id":"https://openalex.org/keywords/data-compression","display_name":"Data compression","score":0.5304999947547913},{"id":"https://openalex.org/keywords/lookup-table","display_name":"Lookup table","score":0.5304999947547913},{"id":"https://openalex.org/keywords/high-memory","display_name":"High memory","score":0.5169000029563904},{"id":"https://openalex.org/keywords/compression-ratio","display_name":"Compression ratio","score":0.5042999982833862},{"id":"https://openalex.org/keywords/edge-device","display_name":"Edge device","score":0.4377000033855438},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.4156000018119812},{"id":"https://openalex.org/keywords/compression","display_name":"Compression (physics)","score":0.39640000462532043},{"id":"https://openalex.org/keywords/memory-bandwidth","display_name":"Memory bandwidth","score":0.37450000643730164},{"id":"https://openalex.org/keywords/encoding","display_name":"Encoding (memory)","score":0.37220001220703125}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8507999777793884},{"id":"https://openalex.org/C150552126","wikidata":"https://www.wikidata.org/wiki/Q339387","display_name":"SIMD","level":2,"score":0.8271999955177307},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.5842999815940857},{"id":"https://openalex.org/C78548338","wikidata":"https://www.wikidata.org/wiki/Q2493","display_name":"Data compression","level":2,"score":0.5304999947547913},{"id":"https://openalex.org/C134835016","wikidata":"https://www.wikidata.org/wiki/Q690265","display_name":"Lookup table","level":2,"score":0.5304999947547913},{"id":"https://openalex.org/C2781357197","wikidata":"https://www.wikidata.org/wiki/Q5757597","display_name":"High memory","level":2,"score":0.5169000029563904},{"id":"https://openalex.org/C25797200","wikidata":"https://www.wikidata.org/wiki/Q828137","display_name":"Compression ratio","level":3,"score":0.5042999982833862},{"id":"https://openalex.org/C138236772","wikidata":"https://www.wikidata.org/wiki/Q25098575","display_name":"Edge device","level":3,"score":0.4377000033855438},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.42080000042915344},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.4156000018119812},{"id":"https://openalex.org/C9390403","wikidata":"https://www.wikidata.org/wiki/Q3966","display_name":"Computer hardware","level":1,"score":0.4020000100135803},{"id":"https://openalex.org/C180016635","wikidata":"https://www.wikidata.org/wiki/Q2712821","display_name":"Compression (physics)","level":2,"score":0.39640000462532043},{"id":"https://openalex.org/C188045654","wikidata":"https://www.wikidata.org/wiki/Q17148339","display_name":"Memory bandwidth","level":2,"score":0.37450000643730164},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.37220001220703125},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.3709000051021576},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.3700000047683716},{"id":"https://openalex.org/C13481523","wikidata":"https://www.wikidata.org/wiki/Q412438","display_name":"Image compression","level":4,"score":0.34950000047683716},{"id":"https://openalex.org/C55526617","wikidata":"https://www.wikidata.org/wiki/Q719375","display_name":"Operand","level":2,"score":0.3366999924182892},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.33180001378059387},{"id":"https://openalex.org/C111335779","wikidata":"https://www.wikidata.org/wiki/Q3454686","display_name":"Reduction (mathematics)","level":2,"score":0.33180001378059387},{"id":"https://openalex.org/C162307627","wikidata":"https://www.wikidata.org/wiki/Q204833","display_name":"Enhanced Data Rates for GSM Evolution","level":2,"score":0.3260999917984009},{"id":"https://openalex.org/C189930140","wikidata":"https://www.wikidata.org/wiki/Q1112878","display_name":"CAS latency","level":4,"score":0.3176000118255615},{"id":"https://openalex.org/C81081738","wikidata":"https://www.wikidata.org/wiki/Q55542","display_name":"Lossless compression","level":3,"score":0.31220000982284546},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.31139999628067017},{"id":"https://openalex.org/C94835093","wikidata":"https://www.wikidata.org/wiki/Q3113333","display_name":"Data compression ratio","level":5,"score":0.3018999993801117},{"id":"https://openalex.org/C45235069","wikidata":"https://www.wikidata.org/wiki/Q278425","display_name":"Table (database)","level":2,"score":0.29339998960494995},{"id":"https://openalex.org/C117280010","wikidata":"https://www.wikidata.org/wiki/Q180944","display_name":"Register file","level":3,"score":0.2867000102996826},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.2847000062465668},{"id":"https://openalex.org/C2776257435","wikidata":"https://www.wikidata.org/wiki/Q1576430","display_name":"Bandwidth (computing)","level":2,"score":0.27889999747276306},{"id":"https://openalex.org/C2778192920","wikidata":"https://www.wikidata.org/wiki/Q16874989","display_name":"Signal compression","level":4,"score":0.2630999982357025},{"id":"https://openalex.org/C84211073","wikidata":"https://www.wikidata.org/wiki/Q117879","display_name":"Floating point","level":2,"score":0.26269999146461487},{"id":"https://openalex.org/C68043766","wikidata":"https://www.wikidata.org/wiki/Q267416","display_name":"Static random-access memory","level":2,"score":0.257999986410141},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.25459998846054077},{"id":"https://openalex.org/C176649486","wikidata":"https://www.wikidata.org/wiki/Q2308807","display_name":"Memory management","level":3,"score":0.25099998712539673}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3771939","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3771939","pdf_url":null,"source":{"id":"https://openalex.org/S136160450","display_name":"ACM Transactions on Embedded Computing Systems","issn_l":"1539-9087","issn":["1539-9087","1558-3465"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Embedded Computing Systems","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1145/3771939","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3771939","pdf_url":null,"source":{"id":"https://openalex.org/S136160450","display_name":"ACM Transactions on Embedded Computing Systems","issn_l":"1539-9087","issn":["1539-9087","1558-3465"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Embedded Computing Systems","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":30,"referenced_works":["https://openalex.org/W2233304223","https://openalex.org/W2750173518","https://openalex.org/W2770020066","https://openalex.org/W2803682856","https://openalex.org/W2889339831","https://openalex.org/W2913135501","https://openalex.org/W2963122961","https://openalex.org/W2963255460","https://openalex.org/W2980856918","https://openalex.org/W2998218113","https://openalex.org/W3037288590","https://openalex.org/W3103292482","https://openalex.org/W3158020960","https://openalex.org/W3174718738","https://openalex.org/W4280582299","https://openalex.org/W4283585133","https://openalex.org/W4312950730","https://openalex.org/W4324057834","https://openalex.org/W4381050415","https://openalex.org/W4386322953","https://openalex.org/W4389934542","https://openalex.org/W4399770864","https://openalex.org/W4402426679","https://openalex.org/W4404102023","https://openalex.org/W4404635290","https://openalex.org/W4406505715","https://openalex.org/W4408564878","https://openalex.org/W4412889927","https://openalex.org/W4413278534","https://openalex.org/W4414828535"],"related_works":[],"abstract_inverted_index":{"Deploying":[0],"deep":[1],"neural":[2],"networks":[3],"(DNNs)":[4],"on":[5,42,164],"edge":[6,43],"devices":[7],"presents":[8],"notable":[9],"challenges,":[10],"including":[11,218],"execution":[12],"time,":[13],"power":[14],"consumption,":[15],"and":[16,30,55,76,93,131,147,168,178,189,221],"memory":[17,124],"footprint.":[18],"To":[19,79],"address":[20],"these":[21,81],"limitations,":[22],"the":[23,37,46,62,66,142,200,236,239,250,254],"co-design":[24],"of":[25,40,65,145,174,238],"software-based":[26],"model":[27,52,206],"compression":[28,53,57,101,207],"techniques":[29],"dedicated":[31],"hardware":[32,47],"has":[33],"become":[34],"crucial":[35],"for":[36,196,215],"efficient":[38],"deployment":[39],"DNNs":[41],"devices.":[44],"However,":[45],"needs":[48],"to":[49,61],"support":[50],"various":[51],"techniques,":[54],"specific":[56],"formats":[58,170],"introduce":[59,105],"limitations":[60],"effective":[63],"use":[64],"conventional":[67,255],"SIMD,":[68],"such":[69],"as":[70,100,129],"low-bit-width":[71],"precision,":[72,75],"fine-grained":[73,166],"mixed":[74],"sparse":[77],"matrices.":[78],"overcome":[80],"issues,":[82],"we":[83,104],"propose":[84],"SIMD-CP,":[85],"a":[86,111,133,160,172,184,190,244],"SIMD":[87,108],"architecture":[88],"featuring":[89],"tag-based":[90],"precision":[91,143],"detection":[92],"redundant":[94],"bit-width":[95,127],"compression,":[96],"which":[97,119,140],"is":[98],"represented":[99],"packing.":[102],"Specifically,":[103],"two":[106],"novel":[107],"instructions:":[109],"(i)":[110],"tagged":[112],"vector":[113],"load":[114],"instruction":[115,136],"(":[116,137],"tvl":[117,177],"),":[118,139],"fetches":[120],"quantized":[121],"vectors":[122],"from":[123],"while":[125],"appending":[126],"metadata":[128],"tags,":[130],"(ii)":[132],"packing":[134],"dot-product":[135],"pdotp":[138,179],"detects":[141],"levels":[144],"elements":[146],"packs":[148],"them":[149],"into":[150],"suitable":[151],"multipliers.":[152],"Experimental":[153],"evaluations":[154],"show":[155],"that":[156],"our":[157],"approach":[158],"achieves":[159],"2.0\u00d7":[161],"MAC/cycle":[162],"gain":[163],"both":[165],"mixed-precision":[167,197,202,256],"sparse-matrix":[169],"by":[171],"series":[173],"instructions,":[175],"i.e.,":[176],".":[180],"Furthermore,":[181],"SIMD-CP":[182,240],"obtains":[183],"2.70":[185],"\u223c":[186,192,211,231],"3.40\u00d7":[187],"GOPs/W":[188],"2.31":[191],"2.42\u00d7":[193],"OPs/LUT":[194],"improvement":[195],"convolution,":[198],"outperforming":[199],"cutting-edge":[201],"SIMD.":[203,257],"These":[204],"diverse":[205],"supports":[208],"allow":[209],"28.8":[210],"45.5%":[212],"latency":[213],"reduction":[214],"DNN":[216],"applications,":[217],"tiny":[219],"CNN":[220],"edge-aware":[222],"Vision":[223],"Transformer,":[224],"with":[225,253],"mitigating":[226],"accuracy":[227],"degradation":[228],"within":[229],"1.2":[230],"2.1%.":[232],"We":[233],"also":[234],"provide":[235],"scaling":[237],"architecture,":[241],"resulting":[242],"in":[243,249],"1.8%":[245],"LUT":[246],"utilization":[247],"increase":[248],"small-scale":[251],"compared":[252]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-04-10T15:06:20.359241","created_date":"2025-10-16T00:00:00"}
