{"id":"https://openalex.org/W3007947130","doi":"https://doi.org/10.1109/bigdata47090.2019.9006494","title":"Exploration of OpenCL 2D Convolution Kernels on Intel FPGA, CPU, and GPU Platforms","display_name":"Exploration of OpenCL 2D Convolution Kernels on Intel FPGA, CPU, and GPU Platforms","publication_year":2019,"publication_date":"2019-12-01","ids":{"openalex":"https://openalex.org/W3007947130","doi":"https://doi.org/10.1109/bigdata47090.2019.9006494","mag":"3007947130"},"language":"en","primary_location":{"id":"doi:10.1109/bigdata47090.2019.9006494","is_oa":false,"landing_page_url":"https://doi.org/10.1109/bigdata47090.2019.9006494","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2019 IEEE International Conference on Big Data (Big Data)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101790023","display_name":"Zheming Jin","orcid":"https://orcid.org/0000-0002-7197-780X"},"institutions":[{"id":"https://openalex.org/I1282105669","display_name":"Argonne National Laboratory","ror":"https://ror.org/05gvnxz63","country_code":"US","type":"facility","lineage":["https://openalex.org/I1282105669","https://openalex.org/I1330989302","https://openalex.org/I39565521","https://openalex.org/I40347166"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Zheming Jin","raw_affiliation_strings":["Leadership Computing Facility Argonne National Laboratory, Lemont, IL, USA"],"affiliations":[{"raw_affiliation_string":"Leadership Computing Facility Argonne National Laboratory, Lemont, IL, USA","institution_ids":["https://openalex.org/I1282105669"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5050512119","display_name":"Hal Finkel","orcid":"https://orcid.org/0000-0002-7551-7122"},"institutions":[{"id":"https://openalex.org/I1282105669","display_name":"Argonne National Laboratory","ror":"https://ror.org/05gvnxz63","country_code":"US","type":"facility","lineage":["https://openalex.org/I1282105669","https://openalex.org/I1330989302","https://openalex.org/I39565521","https://openalex.org/I40347166"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Hal Finkel","raw_affiliation_strings":["Leadership Computing Facility Argonne National Laboratory, Lemont, IL, USA"],"affiliations":[{"raw_affiliation_string":"Leadership Computing Facility Argonne National Laboratory, Lemont, IL, USA","institution_ids":["https://openalex.org/I1282105669"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5101790023"],"corresponding_institution_ids":["https://openalex.org/I1282105669"],"apc_list":null,"apc_paid":null,"fwci":0.4815,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.64244717,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":93,"max":95},"biblio":{"volume":"29","issue":null,"first_page":"4460","last_page":"4465"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9994000196456909,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10904","display_name":"Embedded Systems Design Techniques","score":0.9988999962806702,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11697","display_name":"Numerical Methods and Algorithms","score":0.9951000213623047,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8095260858535767},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.779396116733551},{"id":"https://openalex.org/keywords/field-programmable-gate-array","display_name":"Field-programmable gate array","score":0.6807824373245239},{"id":"https://openalex.org/keywords/convolution","display_name":"Convolution (computer science)","score":0.6631648540496826},{"id":"https://openalex.org/keywords/kernel","display_name":"Kernel (algebra)","score":0.5931493043899536},{"id":"https://openalex.org/keywords/coprocessor","display_name":"Coprocessor","score":0.41650390625},{"id":"https://openalex.org/keywords/computational-science","display_name":"Computational science","score":0.38555586338043213},{"id":"https://openalex.org/keywords/embedded-system","display_name":"Embedded system","score":0.3272392153739929},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.08706575632095337},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.07146048545837402}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8095260858535767},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.779396116733551},{"id":"https://openalex.org/C42935608","wikidata":"https://www.wikidata.org/wiki/Q190411","display_name":"Field-programmable gate array","level":2,"score":0.6807824373245239},{"id":"https://openalex.org/C45347329","wikidata":"https://www.wikidata.org/wiki/Q5166604","display_name":"Convolution (computer science)","level":3,"score":0.6631648540496826},{"id":"https://openalex.org/C74193536","wikidata":"https://www.wikidata.org/wiki/Q574844","display_name":"Kernel (algebra)","level":2,"score":0.5931493043899536},{"id":"https://openalex.org/C86111242","wikidata":"https://www.wikidata.org/wiki/Q859595","display_name":"Coprocessor","level":2,"score":0.41650390625},{"id":"https://openalex.org/C459310","wikidata":"https://www.wikidata.org/wiki/Q117801","display_name":"Computational science","level":1,"score":0.38555586338043213},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.3272392153739929},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.08706575632095337},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.07146048545837402},{"id":"https://openalex.org/C114614502","wikidata":"https://www.wikidata.org/wiki/Q76592","display_name":"Combinatorics","level":1,"score":0.0},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/bigdata47090.2019.9006494","is_oa":false,"landing_page_url":"https://doi.org/10.1109/bigdata47090.2019.9006494","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2019 IEEE International Conference on Big Data (Big Data)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":7,"referenced_works":["https://openalex.org/W1984222112","https://openalex.org/W2068395868","https://openalex.org/W2135252045","https://openalex.org/W2475663704","https://openalex.org/W2523475814","https://openalex.org/W2746871167","https://openalex.org/W2765910623"],"related_works":["https://openalex.org/W2160342394","https://openalex.org/W2987062793","https://openalex.org/W2141458065","https://openalex.org/W2154961667","https://openalex.org/W2121150672","https://openalex.org/W2335592298","https://openalex.org/W1789336918","https://openalex.org/W2794923745","https://openalex.org/W2128283661","https://openalex.org/W1996820070"],"abstract_inverted_index":{"There":[0],"is":[1,134,140],"a":[2,20,71,80,149,213],"need":[3],"to":[4,84,91,125,155,165],"evaluate":[5,85],"the":[6,11,14,55,77,94,99,113,120,126,131,137,146,158,169,175,180,205,209,218,224,228],"resource":[7,87],"usage":[8,88],"and":[9,34,60,64,89,136,161,177,193,221],"optimize":[10],"performance":[12,90,147,176,211,230],"of":[13,27,58,73,76,179],"floating-point":[15,29],"2D":[16,47],"convolution":[17,48],"kernels":[18,114,181],"on":[19,98,102,182],"recent":[21],"FPGA":[22,107,206],"which":[23,130],"features":[24],"large":[25,37],"numbers":[26],"hardened":[28],"digital":[30],"signal":[31],"processing":[32],"blocks":[33],"an":[35,45,103,183,194],"increasingly":[36],"on-chip":[38],"memory.":[39],"In":[40],"this":[41],"paper,":[42],"we":[43,69,111,173],"presented":[44],"OpenCL":[46],"kernel":[49,78],"with":[50,79,115],"configurable":[51],"parameters":[52],"for":[53,157,168,212,231],"specifying":[54],"precision,":[56],"sizes":[57,118],"filter":[59,82,117,216,233],"block,":[61],"vectorization":[62,132],"width,":[63],"compute-unit":[65],"duplication":[66],"factor.":[67],"Then,":[68],"instantiated":[70],"set":[72],"specific":[74],"instances":[75],"fixed":[81],"size":[83,139],"their":[86],"narrow":[92],"down":[93],"exploration":[95,122],"space.":[96,123],"Based":[97],"evaluation":[100],"results":[101],"Intelo":[104],"Arria":[105],"10":[106],"using":[108],"high-level":[109],"synthesis,":[110],"evaluated":[112,174],"different":[116],"within":[119],"pruned":[121],"Compared":[124],"baseline":[127],"implementation":[128],"in":[129],"width":[133],"two":[135],"block":[138],"32$\\times$":[141],"32,":[142],"our":[143],"optimizations":[144],"improve":[145],"by":[148],"factor":[150],"ranging":[151],"from":[152,162],"1.":[153],"9X":[154],"3X":[156],"single-precision":[159],"kernels,":[160],"2.":[163],"2X":[164],"3.":[166],"37X":[167],"half-precision":[170],"kernels.":[171],"Furthermore,":[172],"power":[178],"Intel":[184],"<sup":[185,189,196],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[186,190,197],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">\u00ae</sup>":[187,191],"Xeon":[188],"CPU":[192],"Iris":[195],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">TM</sup>":[198],"Pro":[199],"integrated":[200],"GPU.":[201],"We":[202],"found":[203],"that":[204],"could":[207],"achieve":[208,227],"highest":[210,229],"9$\\times$":[214],"9":[215],"among":[217],"CPU,":[219],"GPU,":[220],"FPGA,":[222],"but":[223],"GPU":[225],"can":[226],"other":[232],"sizes.":[234]},"counts_by_year":[{"year":2021,"cited_by_count":2}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
