{"id":"https://openalex.org/W3028780355","doi":"https://doi.org/10.1145/3378678.3391880","title":"Programming tensor cores from an image processing DSL","display_name":"Programming tensor cores from an image processing DSL","publication_year":2020,"publication_date":"2020-05-25","ids":{"openalex":"https://openalex.org/W3028780355","doi":"https://doi.org/10.1145/3378678.3391880","mag":"3028780355"},"language":"en","primary_location":{"id":"doi:10.1145/3378678.3391880","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3378678.3391880","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 23th International Workshop on Software and Compilers for Embedded Systems","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5062737440","display_name":"Savvas Sioutas","orcid":null},"institutions":[{"id":"https://openalex.org/I83019370","display_name":"Eindhoven University of Technology","ror":"https://ror.org/02c2kyt77","country_code":"NL","type":"education","lineage":["https://openalex.org/I83019370"]}],"countries":["NL"],"is_corresponding":true,"raw_author_name":"Savvas Sioutas","raw_affiliation_strings":["Eindhoven University of Technology, Eindhoven, Netherlands"],"affiliations":[{"raw_affiliation_string":"Eindhoven University of Technology, Eindhoven, Netherlands","institution_ids":["https://openalex.org/I83019370"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5032436846","display_name":"Sander Stuijk","orcid":"https://orcid.org/0000-0002-2518-6847"},"institutions":[{"id":"https://openalex.org/I83019370","display_name":"Eindhoven University of Technology","ror":"https://ror.org/02c2kyt77","country_code":"NL","type":"education","lineage":["https://openalex.org/I83019370"]}],"countries":["NL"],"is_corresponding":false,"raw_author_name":"Sander Stuijk","raw_affiliation_strings":["Eindhoven University of Technology, Eindhoven, Netherlands"],"affiliations":[{"raw_affiliation_string":"Eindhoven University of Technology, Eindhoven, Netherlands","institution_ids":["https://openalex.org/I83019370"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5026760188","display_name":"Twan Basten","orcid":"https://orcid.org/0000-0002-2274-7274"},"institutions":[{"id":"https://openalex.org/I83019370","display_name":"Eindhoven University of Technology","ror":"https://ror.org/02c2kyt77","country_code":"NL","type":"education","lineage":["https://openalex.org/I83019370"]}],"countries":["NL"],"is_corresponding":false,"raw_author_name":"Twan Basten","raw_affiliation_strings":["Eindhoven University of Technology, Eindhoven, Netherlands"],"affiliations":[{"raw_affiliation_string":"Eindhoven University of Technology, Eindhoven, Netherlands","institution_ids":["https://openalex.org/I83019370"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112241649","display_name":"Lou Somers","orcid":null},"institutions":[{"id":"https://openalex.org/I83019370","display_name":"Eindhoven University of Technology","ror":"https://ror.org/02c2kyt77","country_code":"NL","type":"education","lineage":["https://openalex.org/I83019370"]}],"countries":["NL"],"is_corresponding":false,"raw_author_name":"Lou Somers","raw_affiliation_strings":["Eindhoven University of Technology, Eindhoven, Netherlands"],"affiliations":[{"raw_affiliation_string":"Eindhoven University of Technology, Eindhoven, Netherlands","institution_ids":["https://openalex.org/I83019370"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5081768631","display_name":"Henk Corporaal","orcid":"https://orcid.org/0000-0003-4506-5732"},"institutions":[{"id":"https://openalex.org/I83019370","display_name":"Eindhoven University of Technology","ror":"https://ror.org/02c2kyt77","country_code":"NL","type":"education","lineage":["https://openalex.org/I83019370"]}],"countries":["NL"],"is_corresponding":false,"raw_author_name":"Henk Corporaal","raw_affiliation_strings":["Eindhoven University of Technology, Eindhoven, Netherlands"],"affiliations":[{"raw_affiliation_string":"Eindhoven University of Technology, Eindhoven, Netherlands","institution_ids":["https://openalex.org/I83019370"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5062737440"],"corresponding_institution_ids":["https://openalex.org/I83019370"],"apc_list":null,"apc_paid":null,"fwci":1.6495,"has_fulltext":false,"cited_by_count":8,"citation_normalized_percentile":{"value":0.83377978,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"36","last_page":"41"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10363","display_name":"Low-power high-performance VLSI design","score":0.996399998664856,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10904","display_name":"Embedded Systems Design Techniques","score":0.9948999881744385,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/intrinsics","display_name":"Intrinsics","score":0.8422541618347168},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8301844596862793},{"id":"https://openalex.org/keywords/cuda","display_name":"CUDA","score":0.7784707546234131},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.7591502666473389},{"id":"https://openalex.org/keywords/digital-subscriber-line","display_name":"Digital subscriber line","score":0.7117404937744141},{"id":"https://openalex.org/keywords/compiler","display_name":"Compiler","score":0.6537782549858093},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.5980148911476135},{"id":"https://openalex.org/keywords/stencil","display_name":"Stencil","score":0.511025071144104},{"id":"https://openalex.org/keywords/code-generation","display_name":"Code generation","score":0.471781462430954},{"id":"https://openalex.org/keywords/microarchitecture","display_name":"Microarchitecture","score":0.4416589140892029},{"id":"https://openalex.org/keywords/general-purpose-computing-on-graphics-processing-units","display_name":"General-purpose computing on graphics processing units","score":0.42694517970085144},{"id":"https://openalex.org/keywords/computational-science","display_name":"Computational science","score":0.20506694912910461},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.17518046498298645},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.12766686081886292}],"concepts":[{"id":"https://openalex.org/C2908650547","wikidata":"https://www.wikidata.org/wiki/Q20999234","display_name":"Intrinsics","level":2,"score":0.8422541618347168},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8301844596862793},{"id":"https://openalex.org/C2778119891","wikidata":"https://www.wikidata.org/wiki/Q477690","display_name":"CUDA","level":2,"score":0.7784707546234131},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.7591502666473389},{"id":"https://openalex.org/C201374245","wikidata":"https://www.wikidata.org/wiki/Q104534","display_name":"Digital subscriber line","level":2,"score":0.7117404937744141},{"id":"https://openalex.org/C169590947","wikidata":"https://www.wikidata.org/wiki/Q47506","display_name":"Compiler","level":2,"score":0.6537782549858093},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.5980148911476135},{"id":"https://openalex.org/C76752949","wikidata":"https://www.wikidata.org/wiki/Q7607499","display_name":"Stencil","level":2,"score":0.511025071144104},{"id":"https://openalex.org/C133162039","wikidata":"https://www.wikidata.org/wiki/Q1061077","display_name":"Code generation","level":3,"score":0.471781462430954},{"id":"https://openalex.org/C107598950","wikidata":"https://www.wikidata.org/wiki/Q259864","display_name":"Microarchitecture","level":2,"score":0.4416589140892029},{"id":"https://openalex.org/C50630238","wikidata":"https://www.wikidata.org/wiki/Q971505","display_name":"General-purpose computing on graphics processing units","level":3,"score":0.42694517970085144},{"id":"https://openalex.org/C459310","wikidata":"https://www.wikidata.org/wiki/Q117801","display_name":"Computational science","level":1,"score":0.20506694912910461},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.17518046498298645},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.12766686081886292},{"id":"https://openalex.org/C21442007","wikidata":"https://www.wikidata.org/wiki/Q1027879","display_name":"Graphics","level":2,"score":0.0},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.0}],"mesh":[],"locations_count":6,"locations":[{"id":"doi:10.1145/3378678.3391880","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3378678.3391880","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 23th International Workshop on Software and Compilers for Embedded Systems","raw_type":"proceedings-article"},{"id":"pmh:oai:pure.tue.nl:openaire_cris_publications/a51d6944-0ae0-435f-8501-fde763b20000","is_oa":false,"landing_page_url":"https://research.tue.nl/en/publications/a51d6944-0ae0-435f-8501-fde763b20000","pdf_url":null,"source":{"id":"https://openalex.org/S4406922641","display_name":"TU/e Research Portal","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Sioutas, S, Stuijk, S, Basten, T, Somers, L & Corporaal, H 2020, Programming tensor cores from an image processing DSL. in S Stuijk (ed.), Proceedings of the 23rd International Workshop on Software and Compilers for Embedded Systems, SCOPES 2020. Association for Computing Machinery, Inc., pp. 36-41, 23rd International Workshop on Software and Compilers for Embedded Systems (SCOPES 2020), St. Goar, Germany, 25/05/20. https://doi.org/10.1145/3378678.3391880","raw_type":"info:eu-repo/semantics/publishedVersion"},{"id":"pmh:oai:oai-pmh.tno.nl:51431","is_oa":false,"landing_page_url":"https://resolver.tno.nl/uuid:d4523a67-6acb-4fcf-8db8-2466a66fbabe","pdf_url":null,"source":{"id":"https://openalex.org/S7407055233","display_name":"TNO Repository","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"info:eu-repo/semantics/conferencePaper"},{"id":"pmh:oai:pure.tue.nl:publications/a51d6944-0ae0-435f-8501-fde763b20000","is_oa":false,"landing_page_url":"http://www.scopus.com/inward/record.url?scp=85086725541&partnerID=8YFLogxK","pdf_url":null,"source":{"id":"https://openalex.org/S4406922641","display_name":"TU/e Research Portal","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Sioutas, S, Stuijk, S, Basten, T, Somers, L & Corporaal, H 2020, Programming tensor cores from an image processing DSL. in S Stuijk (ed.), Proceedings of the 23rd International Workshop on Software and Compilers for Embedded Systems, SCOPES 2020. Association for Computing Machinery, Inc., pp. 36-41, 23rd International Workshop on Software and Compilers for Embedded Systems (SCOPES 2020), St. Goar, Germany, 25/05/20. https://doi.org/10.1145/3378678.3391880","raw_type":"info:eu-repo/semantics/publishedVersion"},{"id":"pmh:oai:tudelft.nl:uuid:d4523a67-6acb-4fcf-8db8-2466a66fbabe","is_oa":false,"landing_page_url":"http://resolver.tudelft.nl/uuid:d4523a67-6acb-4fcf-8db8-2466a66fbabe","pdf_url":null,"source":{"id":"https://openalex.org/S4306402238","display_name":"Repository hosted by TU Delft Library (TU Delft)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I98358874","host_organization_name":"Delft University of Technology","host_organization_lineage":["https://openalex.org/I98358874"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Proceedings of the 23rd International Workshop on Software and Compilers for Embedded Systems, SCOPES 2020, 25 May 2020, 36-41","raw_type":"article"},{"id":"pmh:tue:oai:pure.tue.nl:publications/a51d6944-0ae0-435f-8501-fde763b20000","is_oa":false,"landing_page_url":"https://research.tue.nl/nl/publications/a51d6944-0ae0-435f-8501-fde763b20000","pdf_url":null,"source":{"id":"https://openalex.org/S4306401843","display_name":"Data Archiving and Networked Services (DANS)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I1322597698","host_organization_name":"Royal Netherlands Academy of Arts and Sciences","host_organization_lineage":["https://openalex.org/I1322597698"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Proceedings of the 23rd International Workshop on Software and Compilers for Embedded Systems, SCOPES 2020, 36 - 41","raw_type":"info:eu-repo/semantics/conferencepaper"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":9,"referenced_works":["https://openalex.org/W1667652561","https://openalex.org/W2055312318","https://openalex.org/W2075745057","https://openalex.org/W2133352531","https://openalex.org/W2153185479","https://openalex.org/W2544002786","https://openalex.org/W2557283755","https://openalex.org/W2582073145","https://openalex.org/W2606722458"],"related_works":["https://openalex.org/W3105129168","https://openalex.org/W2804920739","https://openalex.org/W1987577928","https://openalex.org/W4316371992","https://openalex.org/W2186216222","https://openalex.org/W4243297508","https://openalex.org/W1987692736","https://openalex.org/W307712065","https://openalex.org/W4321496380","https://openalex.org/W4205911102"],"abstract_inverted_index":{"Tensor":[0],"Cores":[1],"(TCUs)":[2],"are":[3],"specialized":[4],"units":[5,29,66],"first":[6],"introduced":[7],"by":[8],"NVIDIA":[9,74,136],"in":[10,14,96],"the":[11,47,55,61,105,109,135],"Volta":[12],"microarchitecture":[13],"order":[15,97],"to":[16,32,63,98,101,122],"accelerate":[17],"matrix":[18],"multiplications":[19],"for":[20,39,46,70,104,139],"deep":[21],"learning":[22],"and":[23,58,112,143],"linear":[24],"algebra":[25],"workloads.":[26],"While":[27],"these":[28,65],"have":[30],"proved":[31],"be":[33,99],"capable":[34],"of":[35,134,146],"providing":[36],"significant":[37],"speedups":[38],"specific":[40],"applications,":[41],"their":[42],"programmability":[43],"remains":[44,131],"difficult":[45],"average":[48],"user.":[49],"In":[50],"this":[51,77],"paper,":[52],"we":[53,79],"extend":[54],"Halide":[56,94,123],"DSL":[57],"compiler":[59],"with":[60,86,150],"ability":[62],"utilize":[64],"when":[67],"generating":[68],"code":[69,103,111],"a":[71,81,93],"CUDA":[72,148],"based":[73],"GPGPU.":[75],"To":[76],"end,":[78],"introduce":[80],"new":[82],"scheduling":[83],"directive":[84],"along":[85],"custom":[87],"lowering":[88],"passes":[89],"that":[90,114],"automatically":[91],"transform":[92],"AST":[95],"able":[100],"generate":[102],"TCUs.":[106],"We":[107],"evaluate":[108],"generated":[110],"show":[113],"it":[115,130],"can":[116],"achieve":[117],"over":[118],"5X":[119],"speedup":[120],"compared":[121],"manual":[124,147],"schedules":[125],"without":[126],"TCU":[127],"support,":[128],"while":[129],"within":[132,144],"20%":[133],"cuBLAS":[137],"implementations":[138,149],"mixed":[140],"precision":[141],"GEMM":[142],"10%":[145],"WMMA":[151],"intrinsics.":[152]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2023,"cited_by_count":1},{"year":2022,"cited_by_count":2},{"year":2021,"cited_by_count":4}],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2025-10-10T00:00:00"}
