{"id":"https://openalex.org/W7147567968","doi":"https://doi.org/10.48550/arxiv.2603.27462","title":"RSR-core: A High-Performance Engine for Low-Bit Matrix-Vector Multiplication","display_name":"RSR-core: A High-Performance Engine for Low-Bit Matrix-Vector Multiplication","publication_year":2026,"publication_date":"2026-03-29","ids":{"openalex":"https://openalex.org/W7147567968","doi":"https://doi.org/10.48550/arxiv.2603.27462"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.27462","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.27462","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.27462","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5132630030","display_name":"Mohsen Dehghankar","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dehghankar, Mohsen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5027319416","display_name":"Abolfazl Asudeh","orcid":"https://orcid.org/0000-0002-5251-6186"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Asudeh, Abolfazl","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":2,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.34279999136924744,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.34279999136924744,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11697","display_name":"Numerical Methods and Algorithms","score":0.09189999848604202,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11612","display_name":"Stochastic Gradient Optimization Techniques","score":0.07339999824762344,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.8174999952316284},{"id":"https://openalex.org/keywords/multiplication","display_name":"Multiplication (music)","score":0.5831999778747559},{"id":"https://openalex.org/keywords/cuda","display_name":"CUDA","score":0.5522000193595886},{"id":"https://openalex.org/keywords/block","display_name":"Block (permutation group theory)","score":0.43549999594688416},{"id":"https://openalex.org/keywords/matrix-multiplication","display_name":"Matrix multiplication","score":0.39910000562667847},{"id":"https://openalex.org/keywords/preprocessor","display_name":"Preprocessor","score":0.3522999882698059},{"id":"https://openalex.org/keywords/ternary-operation","display_name":"Ternary operation","score":0.34439998865127563},{"id":"https://openalex.org/keywords/reduction","display_name":"Reduction (mathematics)","score":0.33500000834465027},{"id":"https://openalex.org/keywords/assembly-language","display_name":"Assembly language","score":0.32850000262260437}],"concepts":[{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.8174999952316284},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7688999772071838},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.6485000252723694},{"id":"https://openalex.org/C2780595030","wikidata":"https://www.wikidata.org/wiki/Q3860309","display_name":"Multiplication (music)","level":2,"score":0.5831999778747559},{"id":"https://openalex.org/C2778119891","wikidata":"https://www.wikidata.org/wiki/Q477690","display_name":"CUDA","level":2,"score":0.5522000193595886},{"id":"https://openalex.org/C2777210771","wikidata":"https://www.wikidata.org/wiki/Q4927124","display_name":"Block (permutation group theory)","level":2,"score":0.43549999594688416},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.4147999882698059},{"id":"https://openalex.org/C17349429","wikidata":"https://www.wikidata.org/wiki/Q1049914","display_name":"Matrix multiplication","level":3,"score":0.39910000562667847},{"id":"https://openalex.org/C34736171","wikidata":"https://www.wikidata.org/wiki/Q918333","display_name":"Preprocessor","level":2,"score":0.3522999882698059},{"id":"https://openalex.org/C64452783","wikidata":"https://www.wikidata.org/wiki/Q1524945","display_name":"Ternary operation","level":2,"score":0.34439998865127563},{"id":"https://openalex.org/C111335779","wikidata":"https://www.wikidata.org/wiki/Q3454686","display_name":"Reduction (mathematics)","level":2,"score":0.33500000834465027},{"id":"https://openalex.org/C50831359","wikidata":"https://www.wikidata.org/wiki/Q165436","display_name":"Assembly language","level":3,"score":0.32850000262260437},{"id":"https://openalex.org/C48372109","wikidata":"https://www.wikidata.org/wiki/Q3913","display_name":"Binary number","level":2,"score":0.3001999855041504},{"id":"https://openalex.org/C459310","wikidata":"https://www.wikidata.org/wiki/Q117801","display_name":"Computational science","level":1,"score":0.2939000129699707},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.29019999504089355},{"id":"https://openalex.org/C202491316","wikidata":"https://www.wikidata.org/wiki/Q272683","display_name":"Instruction set","level":2,"score":0.2897999882698059},{"id":"https://openalex.org/C28855332","wikidata":"https://www.wikidata.org/wiki/Q198099","display_name":"Quantization (signal processing)","level":2,"score":0.27799999713897705},{"id":"https://openalex.org/C49154492","wikidata":"https://www.wikidata.org/wiki/Q5300","display_name":"Central processing unit","level":2,"score":0.2768000066280365},{"id":"https://openalex.org/C133162039","wikidata":"https://www.wikidata.org/wiki/Q1061077","display_name":"Code generation","level":3,"score":0.2768000066280365},{"id":"https://openalex.org/C164620267","wikidata":"https://www.wikidata.org/wiki/Q376953","display_name":"Adder","level":3,"score":0.2700999975204468},{"id":"https://openalex.org/C134835016","wikidata":"https://www.wikidata.org/wiki/Q690265","display_name":"Lookup table","level":2,"score":0.26989999413490295},{"id":"https://openalex.org/C86111242","wikidata":"https://www.wikidata.org/wiki/Q859595","display_name":"Coprocessor","level":2,"score":0.2687000036239624},{"id":"https://openalex.org/C42935608","wikidata":"https://www.wikidata.org/wiki/Q190411","display_name":"Field-programmable gate array","level":2,"score":0.26579999923706055},{"id":"https://openalex.org/C175291020","wikidata":"https://www.wikidata.org/wiki/Q1156822","display_name":"Offset (computer science)","level":2,"score":0.2653000056743622},{"id":"https://openalex.org/C2779851693","wikidata":"https://www.wikidata.org/wiki/Q183484","display_name":"Graphics processing unit","level":2,"score":0.2632000148296356},{"id":"https://openalex.org/C94375191","wikidata":"https://www.wikidata.org/wiki/Q11205","display_name":"Arithmetic","level":1,"score":0.2619999945163727},{"id":"https://openalex.org/C33676613","wikidata":"https://www.wikidata.org/wiki/Q13415176","display_name":"Dimension (graph theory)","level":2,"score":0.26170000433921814},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.2596000134944916},{"id":"https://openalex.org/C76970557","wikidata":"https://www.wikidata.org/wiki/Q1869750","display_name":"Loop unrolling","level":3,"score":0.25429999828338623},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.2531000077724457},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.25209999084472656},{"id":"https://openalex.org/C46743427","wikidata":"https://www.wikidata.org/wiki/Q1341685","display_name":"Inference engine","level":3,"score":0.25130000710487366}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.27462","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.27462","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.27462","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.27462","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Matrix-vector":[0],"multiplication":[1,24,130],"is":[2,54,152,204],"a":[3,20,108,155],"fundamental":[4],"building":[5],"block":[6],"in":[7,56,147],"neural":[8],"networks,":[9],"vector":[10],"databases,":[11],"and":[12,90,123,133,137,165,187],"large":[13],"language":[14],"models,":[15],"particularly":[16],"during":[17],"inference.":[18,31,168],"As":[19],"result,":[21],"efficient":[22,30,62,128],"matrix-vector":[23,80,129],"engines":[25],"directly":[26],"translate":[27],"into":[28,95],"more":[29],"Recent":[32],"work":[33],"has":[34],"explored":[35],"low-bit":[36,79,163],"quantization":[37],"of":[38,144],"model":[39],"weights,":[40],"where":[41],"matrices":[42,136],"are":[43],"represented":[44],"using":[45],"binary":[46,132],"(1-bit)":[47],"or":[48],"ternary":[49,134,199],"(1.58-bit)":[50],"values":[51],"while":[52,140],"activation":[53],"kept":[55],"higher":[57],"precision.":[58],"These":[59],"representations":[60],"enable":[61],"hardware-level":[63],"computation.":[64],"In":[65],"parallel,":[66],"algorithms":[67],"such":[68],"as":[69,116,154],"Redundant":[70],"Segment":[71],"Reduction":[72],"(RSR)":[73],"provide":[74],"theoretical":[75],"guarantees":[76],"for":[77,120,131,161,192,197],"accelerating":[78],"multiplication.":[81],"However,":[82],"existing":[83],"implementations":[84],"operate":[85],"at":[86,207],"the":[87,113],"application":[88],"level":[89],"cannot":[91],"be":[92],"efficiently":[93],"integrated":[94],"hardware":[96],"kernels,":[97],"limiting":[98],"practical":[99,142],"performance.":[100],"To":[101],"bridge":[102],"this":[103],"gap,":[104],"we":[105],"present":[106],"RSR-core,":[107],"high-performance":[109],"engine":[110,157],"that":[111],"implements":[112],"RSR":[114,145],"algorithm":[115,146],"optimized":[117],"low-level":[118],"kernels":[119],"both":[121],"CPU":[122,186],"CUDA":[124,196],"environments.":[125],"RSR-core":[126,151],"supports":[127],"weight":[135],"general":[138],"vectors":[139],"enabling":[141],"deployment":[143],"real":[148],"inference":[149],"pipelines.":[150],"provided":[153],"production-ready":[156],"with":[158],"HuggingFace":[159,177],"integration":[160],"preprocessing":[162],"models":[164],"running":[166],"accelerated":[167],"Experimental":[169],"results":[170],"demonstrate":[171],"significant":[172],"performance":[173],"improvements":[174],"over":[175],"baseline":[176],"PyTorch":[178],"multiplication,":[179],"achieving":[180],"up":[181,188],"to":[182,189],"62x":[183],"speedup":[184,191],"on":[185,195],"1.9x":[190],"token":[193],"generation":[194],"popular":[198],"LLMs.":[200],"The":[201],"source":[202],"code":[203],"publicly":[205],"available":[206],"https://github.com/UIC-InDeXLab/RSR-core.":[208]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-02T00:00:00"}
