{"id":"https://openalex.org/W7116357112","doi":"https://doi.org/10.1145/3754598.3754619","title":"Optimizing Direct Convolutions on High-Performance Multi-Core DSPs","display_name":"Optimizing Direct Convolutions on High-Performance Multi-Core DSPs","publication_year":2025,"publication_date":"2025-09-08","ids":{"openalex":"https://openalex.org/W7116357112","doi":"https://doi.org/10.1145/3754598.3754619"},"language":null,"primary_location":{"id":"doi:10.1145/3754598.3754619","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3754598.3754619","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 54th International Conference on Parallel Processing","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3754598.3754619","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Pengyu Wang","orcid":"https://orcid.org/0000-0003-2805-0862"},"institutions":[{"id":"https://openalex.org/I170215575","display_name":"National University of Defense Technology","ror":"https://ror.org/05d2yfz11","country_code":"CN","type":"education","lineage":["https://openalex.org/I170215575"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Pengyu Wang","raw_affiliation_strings":["National University of Defense Technology, Changsha, China"],"raw_orcid":"https://orcid.org/0000-0003-2805-0862","affiliations":[{"raw_affiliation_string":"National University of Defense Technology, Changsha, China","institution_ids":["https://openalex.org/I170215575"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120901582","display_name":"Xiaotian Chen","orcid":null},"institutions":[{"id":"https://openalex.org/I170215575","display_name":"National University of Defense Technology","ror":"https://ror.org/05d2yfz11","country_code":"CN","type":"education","lineage":["https://openalex.org/I170215575"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaotian Chen","raw_affiliation_strings":["National University of Defense Technology, Changsha, China"],"raw_orcid":"https://orcid.org/0009-0001-5996-0087","affiliations":[{"raw_affiliation_string":"National University of Defense Technology, Changsha, China","institution_ids":["https://openalex.org/I170215575"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5083171604","display_name":"Jianbin Fang","orcid":"https://orcid.org/0000-0003-3542-4869"},"institutions":[{"id":"https://openalex.org/I170215575","display_name":"National University of Defense Technology","ror":"https://ror.org/05d2yfz11","country_code":"CN","type":"education","lineage":["https://openalex.org/I170215575"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jianbin Fang","raw_affiliation_strings":["National University of Defense Technology, Changsha, China"],"raw_orcid":"https://orcid.org/0000-0003-3542-4869","affiliations":[{"raw_affiliation_string":"National University of Defense Technology, Changsha, China","institution_ids":["https://openalex.org/I170215575"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5050112008","display_name":"P. L. Zhang","orcid":null},"institutions":[{"id":"https://openalex.org/I170215575","display_name":"National University of Defense Technology","ror":"https://ror.org/05d2yfz11","country_code":"CN","type":"education","lineage":["https://openalex.org/I170215575"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Peng Zhang","raw_affiliation_strings":["National University of Defense Technology, Changsha, China"],"raw_orcid":"https://orcid.org/0000-0001-8364-9793","affiliations":[{"raw_affiliation_string":"National University of Defense Technology, Changsha, China","institution_ids":["https://openalex.org/I170215575"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5008369819","display_name":"Yonggang Che","orcid":"https://orcid.org/0000-0001-6906-4940"},"institutions":[{"id":"https://openalex.org/I170215575","display_name":"National University of Defense Technology","ror":"https://ror.org/05d2yfz11","country_code":"CN","type":"education","lineage":["https://openalex.org/I170215575"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yonggang Che","raw_affiliation_strings":["National University of Defense Technology, Changsha, China"],"raw_orcid":"https://orcid.org/0000-0001-6906-4940","affiliations":[{"raw_affiliation_string":"National University of Defense Technology, Changsha, China","institution_ids":["https://openalex.org/I170215575"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103712557","display_name":"C. L. Huang","orcid":null},"institutions":[{"id":"https://openalex.org/I170215575","display_name":"National University of Defense Technology","ror":"https://ror.org/05d2yfz11","country_code":"CN","type":"education","lineage":["https://openalex.org/I170215575"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chun Huang","raw_affiliation_strings":["National University of Defense Technology, Changsha, China"],"raw_orcid":"https://orcid.org/0000-0002-0317-8192","affiliations":[{"raw_affiliation_string":"National University of Defense Technology, Changsha, China","institution_ids":["https://openalex.org/I170215575"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5023610301","display_name":"Jie Ren","orcid":"https://orcid.org/0000-0003-3183-7228"},"institutions":[{"id":"https://openalex.org/I88830068","display_name":"Shaanxi Normal University","ror":"https://ror.org/0170z8493","country_code":"CN","type":"education","lineage":["https://openalex.org/I88830068"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jie Ren","raw_affiliation_strings":["Shaanxi Normal University, Xi'an, China"],"raw_orcid":"https://orcid.org/0000-0003-3183-7228","affiliations":[{"raw_affiliation_string":"Shaanxi Normal University, Xi'an, China","institution_ids":["https://openalex.org/I88830068"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I170215575"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.6394753,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"146","last_page":"156"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.5194000005722046,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.5194000005722046,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10904","display_name":"Embedded Systems Design Techniques","score":0.17249999940395355,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.061000000685453415,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/convolution","display_name":"Convolution (computer science)","score":0.7390999794006348},{"id":"https://openalex.org/keywords/digital-signal-processing","display_name":"Digital signal processing","score":0.6381999850273132},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.5776000022888184},{"id":"https://openalex.org/keywords/kernel","display_name":"Kernel (algebra)","score":0.44369998574256897},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.4106999933719635},{"id":"https://openalex.org/keywords/digital-signal-processor","display_name":"Digital signal processor","score":0.36959999799728394},{"id":"https://openalex.org/keywords/signal-processing","display_name":"Signal processing","score":0.34200000762939453}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8079000115394592},{"id":"https://openalex.org/C45347329","wikidata":"https://www.wikidata.org/wiki/Q5166604","display_name":"Convolution (computer science)","level":3,"score":0.7390999794006348},{"id":"https://openalex.org/C84462506","wikidata":"https://www.wikidata.org/wiki/Q173142","display_name":"Digital signal processing","level":2,"score":0.6381999850273132},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.5776000022888184},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.571399986743927},{"id":"https://openalex.org/C74193536","wikidata":"https://www.wikidata.org/wiki/Q574844","display_name":"Kernel (algebra)","level":2,"score":0.44369998574256897},{"id":"https://openalex.org/C9390403","wikidata":"https://www.wikidata.org/wiki/Q3966","display_name":"Computer hardware","level":1,"score":0.4307999908924103},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.41179999709129333},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.4106999933719635},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.4058000147342682},{"id":"https://openalex.org/C161611012","wikidata":"https://www.wikidata.org/wiki/Q106370","display_name":"Digital signal processor","level":3,"score":0.36959999799728394},{"id":"https://openalex.org/C104267543","wikidata":"https://www.wikidata.org/wiki/Q208163","display_name":"Signal processing","level":3,"score":0.34200000762939453},{"id":"https://openalex.org/C557945733","wikidata":"https://www.wikidata.org/wiki/Q389772","display_name":"Data transmission","level":2,"score":0.33059999346733093},{"id":"https://openalex.org/C2776175482","wikidata":"https://www.wikidata.org/wiki/Q1195816","display_name":"Transfer (computing)","level":2,"score":0.3142000138759613},{"id":"https://openalex.org/C459310","wikidata":"https://www.wikidata.org/wiki/Q117801","display_name":"Computational science","level":1,"score":0.3050000071525574},{"id":"https://openalex.org/C2164484","wikidata":"https://www.wikidata.org/wiki/Q5170150","display_name":"Core (optical fiber)","level":2,"score":0.29809999465942383},{"id":"https://openalex.org/C78766204","wikidata":"https://www.wikidata.org/wiki/Q555032","display_name":"Multi-core processor","level":2,"score":0.29330000281333923},{"id":"https://openalex.org/C176649486","wikidata":"https://www.wikidata.org/wiki/Q2308807","display_name":"Memory management","level":3,"score":0.28929999470710754},{"id":"https://openalex.org/C111335779","wikidata":"https://www.wikidata.org/wiki/Q3454686","display_name":"Reduction (mathematics)","level":2,"score":0.2831000089645386},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.2660999894142151},{"id":"https://openalex.org/C2779843651","wikidata":"https://www.wikidata.org/wiki/Q7390335","display_name":"SIGNAL (programming language)","level":2,"score":0.2639000117778778},{"id":"https://openalex.org/C24326235","wikidata":"https://www.wikidata.org/wiki/Q126095","display_name":"Electronic engineering","level":1,"score":0.25529998540878296},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.25440001487731934}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3754598.3754619","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3754598.3754619","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 54th International Conference on Parallel Processing","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3754598.3754619","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3754598.3754619","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 54th International Conference on Parallel Processing","raw_type":"proceedings-article"},"sustainable_development_goals":[{"score":0.8696066737174988,"display_name":"Affordable and clean energy","id":"https://metadata.un.org/sdg/7"}],"awards":[{"id":"https://openalex.org/G7955248992","display_name":null,"funder_award_id":"2023YFB3001503","funder_id":"https://openalex.org/F4320335777","funder_display_name":"National Key Research and Development Program of China"},{"id":"https://openalex.org/G8867034476","display_name":null,"funder_award_id":"62272474","funder_id":"https://openalex.org/F4320335777","funder_display_name":"National Key Research and Development Program of China"}],"funders":[{"id":"https://openalex.org/F4320335777","display_name":"National Key Research and Development Program of China","ror":null}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":31,"referenced_works":["https://openalex.org/W639708223","https://openalex.org/W2043275593","https://openalex.org/W2073061372","https://openalex.org/W2154790323","https://openalex.org/W2172654076","https://openalex.org/W2194775991","https://openalex.org/W2395611524","https://openalex.org/W2560023338","https://openalex.org/W2605019159","https://openalex.org/W2898890919","https://openalex.org/W2906043559","https://openalex.org/W2921704024","https://openalex.org/W2973166032","https://openalex.org/W2995113061","https://openalex.org/W3016542674","https://openalex.org/W3123054690","https://openalex.org/W3126707779","https://openalex.org/W3147109109","https://openalex.org/W3174529902","https://openalex.org/W3176085048","https://openalex.org/W3184382748","https://openalex.org/W4220822006","https://openalex.org/W4242759096","https://openalex.org/W4282959697","https://openalex.org/W4311119077","https://openalex.org/W4312947129","https://openalex.org/W4327694885","https://openalex.org/W4378227035","https://openalex.org/W4386106891","https://openalex.org/W4386570406","https://openalex.org/W4391958657"],"related_works":[],"abstract_inverted_index":{"Convolution":[0],"operations":[1],"form":[2],"the":[3,79,114,127],"computational":[4],"backbone":[5],"of":[6,126,143],"deep":[7],"learning":[8],"inference":[9],"but":[10],"often":[11],"become":[12],"performance":[13,65,117,130],"bottlenecks":[14],"on":[15,106,131],"conventional":[16],"architectures.":[17],"While":[18],"multi-core":[19,60],"Digital":[20],"Signal":[21],"Processors":[22],"(DSPs)":[23],"offer":[24],"energy-efficient":[25],"alternatives":[26],"through":[27],"long":[28,92],"vector":[29,93],"units":[30,94],"and":[31,49,73,83,95,136],"software-managed":[32],"memory":[33,47],"hierarchies,":[34],"existing":[35],"convolution":[36,57,104,120],"optimizations":[37,105],"designed":[38],"for":[39,59],"CPUs/GPUs":[40],"underperform":[41],"due":[42],"to":[43,85,124],"architectural":[44],"mismatches":[45],"in":[46],"systems":[48],"execution":[50],"pipelines.":[51],"We":[52,99],"present":[53],"mtConv,":[54],"an":[55],"optimizing":[56],"method":[58],"DSPs.":[61,107],"mtConv":[62,101,112],"achieves":[63],"high":[64],"by":[66],"exploiting":[67],"data":[68,87],"reuse,":[69],"managing":[70],"on-chip":[71],"memory,":[72],"designing":[74],"efficient":[75],"micro-kernels.":[76],"It":[77],"maximizes":[78],"overlap":[80],"between":[81],"computation":[82],"communication":[84],"hide":[86],"transfer":[88],"latency,":[89],"leveraging":[90],"DSPs\u2019":[91],"hierarchical":[96],"scratchpad":[97],"memories.":[98],"evaluate":[100],"against":[102],"state-of-the-art":[103],"Experimental":[108],"results":[109],"show":[110],"that":[111],"delivers":[113],"best":[115],"overall":[116],"across":[118],"various":[119],"layers,":[121],"achieving":[122],"up":[123],"93.25%":[125],"hardware\u2019s":[128],"peak":[129],"a":[132,144],"single":[133,145],"DSP":[134,146],"core":[135],"92.31%":[137],"when":[138],"using":[139],"all":[140],"8":[141],"cores":[142],"cluster.":[147]},"counts_by_year":[],"updated_date":"2026-05-05T08:41:31.759640","created_date":"2025-12-21T00:00:00"}
