{"id":"https://openalex.org/W2949457204","doi":"https://doi.org/10.1109/fccm.2019.00071","title":"OpenCL Kernel Vectorization on the CPU, GPU, and FPGA: A Case Study with Frequent Pattern Compression","display_name":"OpenCL Kernel Vectorization on the CPU, GPU, and FPGA: A Case Study with Frequent Pattern Compression","publication_year":2019,"publication_date":"2019-04-01","ids":{"openalex":"https://openalex.org/W2949457204","doi":"https://doi.org/10.1109/fccm.2019.00071","mag":"2949457204"},"language":"en","primary_location":{"id":"doi:10.1109/fccm.2019.00071","is_oa":false,"landing_page_url":"https://doi.org/10.1109/fccm.2019.00071","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2019 IEEE 27th Annual International Symposium on Field-Programmable Custom Computing Machines (FCCM)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101790023","display_name":"Zheming Jin","orcid":"https://orcid.org/0000-0002-7197-780X"},"institutions":[{"id":"https://openalex.org/I1282105669","display_name":"Argonne National Laboratory","ror":"https://ror.org/05gvnxz63","country_code":"US","type":"facility","lineage":["https://openalex.org/I1282105669","https://openalex.org/I1330989302","https://openalex.org/I39565521","https://openalex.org/I40347166"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Zheming Jin","raw_affiliation_strings":["Argonne National Laboratory"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Argonne National Laboratory","institution_ids":["https://openalex.org/I1282105669"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5050512119","display_name":"Hal Finkel","orcid":"https://orcid.org/0000-0002-7551-7122"},"institutions":[{"id":"https://openalex.org/I1282105669","display_name":"Argonne National Laboratory","ror":"https://ror.org/05gvnxz63","country_code":"US","type":"facility","lineage":["https://openalex.org/I1282105669","https://openalex.org/I1330989302","https://openalex.org/I39565521","https://openalex.org/I40347166"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Hal Finkel","raw_affiliation_strings":["Argonne National Laboratory"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Argonne National Laboratory","institution_ids":["https://openalex.org/I1282105669"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.05363146,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"330","last_page":"330"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11269","display_name":"Algorithms and Data Compression","score":0.9991999864578247,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11269","display_name":"Algorithms and Data Compression","score":0.9991999864578247,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9987000226974487,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9958000183105469,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8540663719177246},{"id":"https://openalex.org/keywords/kernel","display_name":"Kernel (algebra)","score":0.788710355758667},{"id":"https://openalex.org/keywords/software-portability","display_name":"Software portability","score":0.725716769695282},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.6882259845733643},{"id":"https://openalex.org/keywords/xeon-phi","display_name":"Xeon Phi","score":0.6184697151184082},{"id":"https://openalex.org/keywords/field-programmable-gate-array","display_name":"Field-programmable gate array","score":0.5330725312232971},{"id":"https://openalex.org/keywords/vectorization","display_name":"Vectorization (mathematics)","score":0.5188297033309937},{"id":"https://openalex.org/keywords/cuda","display_name":"CUDA","score":0.4636382460594177},{"id":"https://openalex.org/keywords/coprocessor","display_name":"Coprocessor","score":0.44557714462280273},{"id":"https://openalex.org/keywords/embedded-system","display_name":"Embedded system","score":0.2581809461116791},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.24053847789764404}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8540663719177246},{"id":"https://openalex.org/C74193536","wikidata":"https://www.wikidata.org/wiki/Q574844","display_name":"Kernel (algebra)","level":2,"score":0.788710355758667},{"id":"https://openalex.org/C63000827","wikidata":"https://www.wikidata.org/wiki/Q3080428","display_name":"Software portability","level":2,"score":0.725716769695282},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.6882259845733643},{"id":"https://openalex.org/C96972482","wikidata":"https://www.wikidata.org/wiki/Q1049168","display_name":"Xeon Phi","level":2,"score":0.6184697151184082},{"id":"https://openalex.org/C42935608","wikidata":"https://www.wikidata.org/wiki/Q190411","display_name":"Field-programmable gate array","level":2,"score":0.5330725312232971},{"id":"https://openalex.org/C41681595","wikidata":"https://www.wikidata.org/wiki/Q7917855","display_name":"Vectorization (mathematics)","level":2,"score":0.5188297033309937},{"id":"https://openalex.org/C2778119891","wikidata":"https://www.wikidata.org/wiki/Q477690","display_name":"CUDA","level":2,"score":0.4636382460594177},{"id":"https://openalex.org/C86111242","wikidata":"https://www.wikidata.org/wiki/Q859595","display_name":"Coprocessor","level":2,"score":0.44557714462280273},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.2581809461116791},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.24053847789764404},{"id":"https://openalex.org/C114614502","wikidata":"https://www.wikidata.org/wiki/Q76592","display_name":"Combinatorics","level":1,"score":0.0},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/fccm.2019.00071","is_oa":false,"landing_page_url":"https://doi.org/10.1109/fccm.2019.00071","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2019 IEEE 27th Annual International Symposium on Field-Programmable Custom Computing Machines (FCCM)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/7","score":0.5899999737739563,"display_name":"Affordable and clean energy"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":1,"referenced_works":["https://openalex.org/W2162752393"],"related_works":["https://openalex.org/W3020739840","https://openalex.org/W2913998709","https://openalex.org/W1908180445","https://openalex.org/W2213533160","https://openalex.org/W4386875822","https://openalex.org/W3177128669","https://openalex.org/W2467043670","https://openalex.org/W2748015092","https://openalex.org/W2952876933","https://openalex.org/W2475036044"],"abstract_inverted_index":{"OpenCL":[0,51,72],"promotes":[1],"code":[2],"portability,":[3],"and":[4,24,68,96,144],"natively":[5],"supports":[6],"vectorized":[7],"data":[8],"types,":[9],"which":[10],"allows":[11],"developers":[12],"to":[13,80,110,139,147],"potentially":[14],"take":[15],"advantage":[16],"of":[17,50,128,154],"the":[18,55,66,71,84,111,122,131,142,150,159],"single-instruction-multiple-data":[19],"instructions":[20],"on":[21,54,86,130,141,149,158],"CPUs,":[22],"GPUs,":[23],"FPGAs.":[25],"FPGAs":[26],"are":[27],"becoming":[28],"a":[29,39,47,75,97,126],"promising":[30],"heterogeneous":[31],"computing":[32,57],"component.":[33],"In":[34],"our":[35,118],"study,":[36],"we":[37],"choose":[38],"kernel":[40,52,73,85,113,123,155],"used":[41],"in":[42],"frequent":[43],"pattern":[44,62],"compression":[45],"as":[46],"case":[48],"study":[49],"vectorizations":[53],"three":[56],"platforms.":[58],"We":[59,82],"describe":[60],"different":[61],"matching":[63],"approaches":[64],"for":[65],"kernel,":[67],"manually":[69],"vectorize":[70],"by":[74,125],"factor":[76,127],"ranging":[77],"from":[78,137,145],"2":[79],"16.":[81],"evaluate":[83],"an":[87,92,103],"Intel":[88,104],"Xeon":[89],"16-core":[90],"CPU,":[91,143],"NVIDIA":[93],"P100":[94],"GPU,":[95],"Nallatech":[98],"385A":[99],"FPGA":[100],"card":[101],"featuring":[102],"Arria":[105],"10":[106],"GX1150":[107],"FPGA.":[108,132],"Compared":[109],"optimized":[112],"that":[114],"is":[115],"not":[116],"vectorized,":[117],"vectorization":[119,156],"can":[120],"improve":[121],"performance":[124,134],"16":[129],"The":[133,152],"improvement":[135],"ranges":[136],"1":[138],"11.4":[140],"1.02":[146],"9.3":[148],"GPU.":[151],"effectiveness":[153],"depends":[157],"work-group":[160],"size.":[161]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
