{"id":"https://openalex.org/W3200075413","doi":"https://doi.org/10.1109/hpec49654.2021.9622851","title":"Reconfigurable Low-latency Memory System for Sparse Matricized Tensor Times Khatri-Rao Product on FPGA","display_name":"Reconfigurable Low-latency Memory System for Sparse Matricized Tensor Times Khatri-Rao Product on FPGA","publication_year":2021,"publication_date":"2021-09-20","ids":{"openalex":"https://openalex.org/W3200075413","doi":"https://doi.org/10.1109/hpec49654.2021.9622851","mag":"3200075413"},"language":"en","primary_location":{"id":"doi:10.1109/hpec49654.2021.9622851","is_oa":false,"landing_page_url":"https://doi.org/10.1109/hpec49654.2021.9622851","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 IEEE High Performance Extreme Computing Conference (HPEC)","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5036985229","display_name":"Sasindu Wijeratne","orcid":"https://orcid.org/0000-0002-5538-2988"},"institutions":[{"id":"https://openalex.org/I1174212","display_name":"University of Southern California","ror":"https://ror.org/03taz7m60","country_code":"US","type":"education","lineage":["https://openalex.org/I1174212"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Sasindu Wijeratne","raw_affiliation_strings":["University of Southern California,Department of Electrical and Computer Engineering,Los Angeles,USA"],"affiliations":[{"raw_affiliation_string":"University of Southern California,Department of Electrical and Computer Engineering,Los Angeles,USA","institution_ids":["https://openalex.org/I1174212"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5042560222","display_name":"Rajgopal Kannan","orcid":"https://orcid.org/0000-0001-8736-3012"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rajgopal Kannan","raw_affiliation_strings":["US Army Research Lab,Los Angeles,USA","US Army Research Lab, Los Angeles, USA"],"affiliations":[{"raw_affiliation_string":"US Army Research Lab,Los Angeles,USA","institution_ids":[]},{"raw_affiliation_string":"US Army Research Lab, Los Angeles, USA","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5033166029","display_name":"Viktor K. Prasanna","orcid":"https://orcid.org/0000-0002-1609-8589"},"institutions":[{"id":"https://openalex.org/I1174212","display_name":"University of Southern California","ror":"https://ror.org/03taz7m60","country_code":"US","type":"education","lineage":["https://openalex.org/I1174212"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Viktor Prasanna","raw_affiliation_strings":["University of Southern California,Department of Electrical and Computer Engineering,Los Angeles,USA"],"affiliations":[{"raw_affiliation_string":"University of Southern California,Department of Electrical and Computer Engineering,Los Angeles,USA","institution_ids":["https://openalex.org/I1174212"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5036985229"],"corresponding_institution_ids":["https://openalex.org/I1174212"],"apc_list":null,"apc_paid":null,"fwci":0.63012027,"has_fulltext":false,"cited_by_count":7,"citation_normalized_percentile":{"value":0.63647343,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"7"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12303","display_name":"Tensor decomposition and applications","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/2605","display_name":"Computational Mathematics"},"field":{"id":"https://openalex.org/fields/26","display_name":"Mathematics"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12303","display_name":"Tensor decomposition and applications","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/2605","display_name":"Computational Mathematics"},"field":{"id":"https://openalex.org/fields/26","display_name":"Mathematics"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9977999925613403,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12326","display_name":"Network Packet Processing and Optimization","score":0.9531999826431274,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8138810992240906},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.7075908184051514},{"id":"https://openalex.org/keywords/field-programmable-gate-array","display_name":"Field-programmable gate array","score":0.565100908279419},{"id":"https://openalex.org/keywords/cas-latency","display_name":"CAS latency","score":0.5626459121704102},{"id":"https://openalex.org/keywords/uniform-memory-access","display_name":"Uniform memory access","score":0.528198778629303},{"id":"https://openalex.org/keywords/cache","display_name":"Cache","score":0.5276395678520203},{"id":"https://openalex.org/keywords/kernel","display_name":"Kernel (algebra)","score":0.5249000787734985},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.48657500743865967},{"id":"https://openalex.org/keywords/memory-controller","display_name":"Memory controller","score":0.4854155480861664},{"id":"https://openalex.org/keywords/locality","display_name":"Locality","score":0.4392220675945282},{"id":"https://openalex.org/keywords/distributed-memory","display_name":"Distributed memory","score":0.430306077003479},{"id":"https://openalex.org/keywords/embedded-system","display_name":"Embedded system","score":0.37533411383628845},{"id":"https://openalex.org/keywords/shared-memory","display_name":"Shared memory","score":0.3683364987373352},{"id":"https://openalex.org/keywords/memory-management","display_name":"Memory management","score":0.3560335338115692},{"id":"https://openalex.org/keywords/computer-hardware","display_name":"Computer hardware","score":0.3493103086948395},{"id":"https://openalex.org/keywords/semiconductor-memory","display_name":"Semiconductor memory","score":0.1585349440574646},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.10121974349021912}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8138810992240906},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.7075908184051514},{"id":"https://openalex.org/C42935608","wikidata":"https://www.wikidata.org/wiki/Q190411","display_name":"Field-programmable gate array","level":2,"score":0.565100908279419},{"id":"https://openalex.org/C189930140","wikidata":"https://www.wikidata.org/wiki/Q1112878","display_name":"CAS latency","level":4,"score":0.5626459121704102},{"id":"https://openalex.org/C51290061","wikidata":"https://www.wikidata.org/wiki/Q1936765","display_name":"Uniform memory access","level":4,"score":0.528198778629303},{"id":"https://openalex.org/C115537543","wikidata":"https://www.wikidata.org/wiki/Q165596","display_name":"Cache","level":2,"score":0.5276395678520203},{"id":"https://openalex.org/C74193536","wikidata":"https://www.wikidata.org/wiki/Q574844","display_name":"Kernel (algebra)","level":2,"score":0.5249000787734985},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.48657500743865967},{"id":"https://openalex.org/C100800780","wikidata":"https://www.wikidata.org/wiki/Q1175867","display_name":"Memory controller","level":3,"score":0.4854155480861664},{"id":"https://openalex.org/C2779808786","wikidata":"https://www.wikidata.org/wiki/Q6664603","display_name":"Locality","level":2,"score":0.4392220675945282},{"id":"https://openalex.org/C91481028","wikidata":"https://www.wikidata.org/wiki/Q1054686","display_name":"Distributed memory","level":3,"score":0.430306077003479},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.37533411383628845},{"id":"https://openalex.org/C133875982","wikidata":"https://www.wikidata.org/wiki/Q764810","display_name":"Shared memory","level":2,"score":0.3683364987373352},{"id":"https://openalex.org/C176649486","wikidata":"https://www.wikidata.org/wiki/Q2308807","display_name":"Memory management","level":3,"score":0.3560335338115692},{"id":"https://openalex.org/C9390403","wikidata":"https://www.wikidata.org/wiki/Q3966","display_name":"Computer hardware","level":1,"score":0.3493103086948395},{"id":"https://openalex.org/C98986596","wikidata":"https://www.wikidata.org/wiki/Q1143031","display_name":"Semiconductor memory","level":2,"score":0.1585349440574646},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.10121974349021912},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0},{"id":"https://openalex.org/C114614502","wikidata":"https://www.wikidata.org/wiki/Q76592","display_name":"Combinatorics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/hpec49654.2021.9622851","is_oa":false,"landing_page_url":"https://doi.org/10.1109/hpec49654.2021.9622851","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 IEEE High Performance Extreme Computing Conference (HPEC)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":35,"referenced_works":["https://openalex.org/W1512387364","https://openalex.org/W2037594831","https://openalex.org/W2092608522","https://openalex.org/W2098220211","https://openalex.org/W2108138101","https://openalex.org/W2119477966","https://openalex.org/W2462521798","https://openalex.org/W2469230926","https://openalex.org/W2583784797","https://openalex.org/W2913422047","https://openalex.org/W2917336187","https://openalex.org/W2921973771","https://openalex.org/W2948703217","https://openalex.org/W2950001020","https://openalex.org/W2962796306","https://openalex.org/W2971508019","https://openalex.org/W2983890558","https://openalex.org/W3005990752","https://openalex.org/W3015509582","https://openalex.org/W3015641590","https://openalex.org/W3016735325","https://openalex.org/W3090480494","https://openalex.org/W3093099168","https://openalex.org/W3096024647","https://openalex.org/W3103400026","https://openalex.org/W3113725485","https://openalex.org/W3176915718","https://openalex.org/W3193014684","https://openalex.org/W3204626696","https://openalex.org/W4255450819","https://openalex.org/W6718787165","https://openalex.org/W6732831128","https://openalex.org/W6765873015","https://openalex.org/W6783795754","https://openalex.org/W6783938941"],"related_works":["https://openalex.org/W2026512611","https://openalex.org/W4245497162","https://openalex.org/W1985165680","https://openalex.org/W2604972926","https://openalex.org/W2353073543","https://openalex.org/W2353146130","https://openalex.org/W1933089384","https://openalex.org/W3151393245","https://openalex.org/W2150064838","https://openalex.org/W233533876"],"abstract_inverted_index":{"Tensor":[0,18],"decomposition":[1],"has":[2],"become":[3],"an":[4],"essential":[5],"tool":[6],"in":[7,10,31,89],"many":[8],"applications":[9],"various":[11],"domains,":[12],"including":[13],"machine":[14],"learning.":[15],"Sparse":[16],"Matricized":[17],"Times":[19],"Khatri-Rao":[20],"Product":[21],"(MTTKRP)":[22],"is":[23,40],"one":[24],"of":[25,68,72,84],"the":[26,63,69,82,85,90,98,104,123],"most":[27],"computationally":[28],"expensive":[29],"kernels":[30],"tensor":[32],"computations.":[33],"Despite":[34],"having":[35],"significant":[36],"computational":[37],"parallelism,":[38],"MTTKRP":[39,99],"a":[41,57,110],"challenging":[42],"kernel":[43],"to":[44,47],"optimize":[45],"due":[46],"its":[48],"irregular":[49],"memory":[50,59,106,124,132,148],"access":[51,107,125],"characteristics.":[52],"This":[53],"paper":[54],"focuses":[55],"on":[56,81],"multi-faceted":[58],"system,":[60],"which":[61],"explores":[62],"spatial":[64],"and":[65,113,140,146],"temporal":[66],"locality":[67],"data":[70,100],"structures":[71,101],"MTTKRP.":[73],"Further,":[74],"users":[75],"can":[76],"reconfigure":[77],"our":[78,120,136],"design":[79],"depending":[80],"behavior":[83],"compute":[86],"units":[87],"used":[88],"FPGA":[91],"accelerator.":[92],"Our":[93],"system":[94,137],"efficiently":[95],"accesses":[96],"all":[97],"while":[102],"reducing":[103],"total":[105],"time,":[108],"using":[109],"distributed":[111],"cache":[112],"Direct":[114],"Memory":[115],"Access":[116],"(DMA)":[117],"subsystem.":[118],"Moreover,":[119],"work":[121],"improves":[122],"time":[126],"by":[127],"3.5\u00d7":[128],"compared":[129,143],"with":[130,144],"commercial":[131],"controller":[133],"IPs.":[134],"Also,":[135],"shows":[138],"2\u00d7":[139],"1.26\u00d7":[141],"speedups":[142],"cache-only":[145],"DMA-only":[147],"systems,":[149],"respectively.":[150]},"counts_by_year":[{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":4}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
