{"id":"https://openalex.org/W4387445624","doi":"https://doi.org/10.1109/fdl59689.2023.10272088","title":"Hybrid PTX Analysis for GPU accelerated CNN inferencing aiding Computer Architecture Design","display_name":"Hybrid PTX Analysis for GPU accelerated CNN inferencing aiding Computer Architecture Design","publication_year":2023,"publication_date":"2023-09-13","ids":{"openalex":"https://openalex.org/W4387445624","doi":"https://doi.org/10.1109/fdl59689.2023.10272088"},"language":"en","primary_location":{"id":"doi:10.1109/fdl59689.2023.10272088","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/fdl59689.2023.10272088","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2023 Forum on Specification &amp; Design Languages (FDL)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5088510932","display_name":"Christopher A. Metz","orcid":"https://orcid.org/0000-0003-4975-9483"},"institutions":[{"id":"https://openalex.org/I180437899","display_name":"University of Bremen","ror":"https://ror.org/04ers2y35","country_code":"DE","type":"education","lineage":["https://openalex.org/I180437899"]}],"countries":["DE"],"is_corresponding":true,"raw_author_name":"Christopher A. Metz","raw_affiliation_strings":["Institute of Computer Science, University of Bremen,Bremen,Germany","Institute of Computer Science, University of Bremen, Bremen, Germany"],"affiliations":[{"raw_affiliation_string":"Institute of Computer Science, University of Bremen,Bremen,Germany","institution_ids":["https://openalex.org/I180437899"]},{"raw_affiliation_string":"Institute of Computer Science, University of Bremen, Bremen, Germany","institution_ids":["https://openalex.org/I180437899"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5043224413","display_name":"Christina Plump","orcid":"https://orcid.org/0000-0003-0392-6397"},"institutions":[{"id":"https://openalex.org/I33256026","display_name":"German Research Centre for Artificial Intelligence","ror":"https://ror.org/01ayc5b57","country_code":"DE","type":"funder","lineage":["https://openalex.org/I33256026"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Christina Plump","raw_affiliation_strings":["Cyber-Physical Systems, DFKI GmbH,Bremen,Germany","Cyber-Physical Systems, DFKI GmbH, Bremen, Germany"],"affiliations":[{"raw_affiliation_string":"Cyber-Physical Systems, DFKI GmbH,Bremen,Germany","institution_ids":["https://openalex.org/I33256026"]},{"raw_affiliation_string":"Cyber-Physical Systems, DFKI GmbH, Bremen, Germany","institution_ids":["https://openalex.org/I33256026"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5074604321","display_name":"Bernhard Berger","orcid":"https://orcid.org/0000-0001-6093-9229"},"institutions":[{"id":"https://openalex.org/I159176309","display_name":"Universit\u00e4t Hamburg","ror":"https://ror.org/00g30e956","country_code":"DE","type":"education","lineage":["https://openalex.org/I159176309"]},{"id":"https://openalex.org/I884043246","display_name":"Hamburg University of Technology","ror":"https://ror.org/04bs1pb34","country_code":"DE","type":"education","lineage":["https://openalex.org/I884043246"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Bernhard J. Berger","raw_affiliation_strings":["Institute of Embedded Systems, Hamburg University of Technology,Hamburg,Germany","Institute of Embedded Systems, Hamburg University of Technology, Hamburg, Germany"],"affiliations":[{"raw_affiliation_string":"Institute of Embedded Systems, Hamburg University of Technology,Hamburg,Germany","institution_ids":["https://openalex.org/I159176309","https://openalex.org/I884043246"]},{"raw_affiliation_string":"Institute of Embedded Systems, Hamburg University of Technology, Hamburg, Germany","institution_ids":["https://openalex.org/I159176309","https://openalex.org/I884043246"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5071742136","display_name":"Rolf Drechsler","orcid":"https://orcid.org/0000-0002-9872-1740"},"institutions":[{"id":"https://openalex.org/I180437899","display_name":"University of Bremen","ror":"https://ror.org/04ers2y35","country_code":"DE","type":"education","lineage":["https://openalex.org/I180437899"]},{"id":"https://openalex.org/I33256026","display_name":"German Research Centre for Artificial Intelligence","ror":"https://ror.org/01ayc5b57","country_code":"DE","type":"funder","lineage":["https://openalex.org/I33256026"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Rolf Drechsler","raw_affiliation_strings":["Institute of Computer Science, University of Bremen,Bremen,Germany","Institute of Computer Science, University of Bremen, Bremen, Germany","Cyber-Physical Systems, DFKI GmbH, Bremen, Germany"],"affiliations":[{"raw_affiliation_string":"Institute of Computer Science, University of Bremen,Bremen,Germany","institution_ids":["https://openalex.org/I180437899"]},{"raw_affiliation_string":"Institute of Computer Science, University of Bremen, Bremen, Germany","institution_ids":["https://openalex.org/I180437899"]},{"raw_affiliation_string":"Cyber-Physical Systems, DFKI GmbH, Bremen, Germany","institution_ids":["https://openalex.org/I33256026"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5088510932"],"corresponding_institution_ids":["https://openalex.org/I180437899"],"apc_list":null,"apc_paid":null,"fwci":0.3031,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.51966463,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":94},"biblio":{"volume":"abs 1704 4861","issue":null,"first_page":"1","last_page":"8"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10904","display_name":"Embedded Systems Design Techniques","score":0.9955999851226807,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9955000281333923,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8845726847648621},{"id":"https://openalex.org/keywords/compiler","display_name":"Compiler","score":0.7881911993026733},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.7058113813400269},{"id":"https://openalex.org/keywords/cuda","display_name":"CUDA","score":0.685119092464447},{"id":"https://openalex.org/keywords/profiling","display_name":"Profiling (computer programming)","score":0.6336070895195007},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.6247115135192871},{"id":"https://openalex.org/keywords/general-purpose-computing-on-graphics-processing-units","display_name":"General-purpose computing on graphics processing units","score":0.5439233779907227},{"id":"https://openalex.org/keywords/computer-architecture","display_name":"Computer architecture","score":0.5206805467605591},{"id":"https://openalex.org/keywords/source-code","display_name":"Source code","score":0.4438937306404114},{"id":"https://openalex.org/keywords/graphics","display_name":"Graphics","score":0.4403691291809082},{"id":"https://openalex.org/keywords/embedded-system","display_name":"Embedded system","score":0.3486135005950928},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.2941763997077942}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8845726847648621},{"id":"https://openalex.org/C169590947","wikidata":"https://www.wikidata.org/wiki/Q47506","display_name":"Compiler","level":2,"score":0.7881911993026733},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.7058113813400269},{"id":"https://openalex.org/C2778119891","wikidata":"https://www.wikidata.org/wiki/Q477690","display_name":"CUDA","level":2,"score":0.685119092464447},{"id":"https://openalex.org/C187191949","wikidata":"https://www.wikidata.org/wiki/Q1138496","display_name":"Profiling (computer programming)","level":2,"score":0.6336070895195007},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.6247115135192871},{"id":"https://openalex.org/C50630238","wikidata":"https://www.wikidata.org/wiki/Q971505","display_name":"General-purpose computing on graphics processing units","level":3,"score":0.5439233779907227},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.5206805467605591},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.4438937306404114},{"id":"https://openalex.org/C21442007","wikidata":"https://www.wikidata.org/wiki/Q1027879","display_name":"Graphics","level":2,"score":0.4403691291809082},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.3486135005950928},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.2941763997077942},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.0},{"id":"https://openalex.org/C13280743","wikidata":"https://www.wikidata.org/wiki/Q131089","display_name":"Geodesy","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/fdl59689.2023.10272088","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/fdl59689.2023.10272088","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2023 Forum on Specification &amp; Design Languages (FDL)","raw_type":"proceedings-article"},{"id":"pmh:oai:tore.tuhh.de:11420/44189","is_oa":false,"landing_page_url":"https://hdl.handle.net/11420/44189","pdf_url":null,"source":{"id":"https://openalex.org/S4306401751","display_name":"tub.dok (Hamburg University of Technology)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I884043246","host_organization_name":"Hamburg University of Technology","host_organization_lineage":["https://openalex.org/I884043246"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"Conference Paper"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.5299999713897705,"display_name":"Industry, innovation and infrastructure","id":"https://metadata.un.org/sdg/9"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":38,"referenced_works":["https://openalex.org/W645173616","https://openalex.org/W1979527452","https://openalex.org/W2038666141","https://openalex.org/W2043555680","https://openalex.org/W2076090418","https://openalex.org/W2098290747","https://openalex.org/W2108598243","https://openalex.org/W2149127686","https://openalex.org/W2149693551","https://openalex.org/W2151195685","https://openalex.org/W2194775991","https://openalex.org/W2522820499","https://openalex.org/W2618530766","https://openalex.org/W2622263826","https://openalex.org/W2763068163","https://openalex.org/W2769856846","https://openalex.org/W2935389012","https://openalex.org/W2963446712","https://openalex.org/W2964081807","https://openalex.org/W2984189761","https://openalex.org/W2997701623","https://openalex.org/W3006997840","https://openalex.org/W3013297713","https://openalex.org/W3015338905","https://openalex.org/W3203257953","https://openalex.org/W3204255478","https://openalex.org/W4231040899","https://openalex.org/W4285503988","https://openalex.org/W4285504014","https://openalex.org/W4297775537","https://openalex.org/W4297822300","https://openalex.org/W6687483927","https://openalex.org/W6737664043","https://openalex.org/W6739622702","https://openalex.org/W6741414320","https://openalex.org/W6746514494","https://openalex.org/W6761129924","https://openalex.org/W6843773979"],"related_works":["https://openalex.org/W1963859303","https://openalex.org/W2364044215","https://openalex.org/W2389600408","https://openalex.org/W240129890","https://openalex.org/W3048701459","https://openalex.org/W2149078538","https://openalex.org/W2080146221","https://openalex.org/W2370314112","https://openalex.org/W1912958759","https://openalex.org/W2792081825"],"abstract_inverted_index":{"General-Purpose":[0],"Computation":[1],"on":[2,123,144,218],"Graphics":[3],"Processing":[4],"Units":[5],"(GPGPUs)":[6],"are":[7,50],"becoming":[8],"crucial":[9],"in":[10,72,178,236],"accelerating":[11],"computing":[12],"capacity.":[13],"Due":[14],"to":[15,29,33,69,126,168,188,191,230],"the":[16,92,128,147,152,192,216],"massive":[17],"parallelism":[18,59],"capabilities":[19],"of":[20,27,74,130,164,186,228],"GPUs,":[21],"they":[22],"can":[23,198],"achieve":[24],"impressive":[25],"speedups":[26,185],"up":[28,187],"32":[30],"times":[31],"compared":[32,167,190],"common":[34],"CPUs.":[35,62],"However,":[36],"writing":[37],"highly":[38],"parallel":[39],"code":[40,75,108,166],"and":[41,58,64,86,110,134,150,233,239],"utilizing":[42],"a":[43,88,98,114,209],"GPU":[44],"is":[45],"challenging":[46],"for":[47,91],"programmers.":[48],"Developers":[49],"facing":[51],"new":[52],"challenges":[53],"since":[54],"GPUs":[55],"handle":[56],"threads":[57],"differently":[60],"from":[61],"Academia":[63],"industry":[65],"proposed":[66],"several":[67],"profilers":[68,78],"support":[70],"developers":[71,232],"terms":[73],"optimization.":[76],"These":[77],"often":[79],"require":[80],"an":[81,219,225],"actual":[82,220],"device":[83],"(e.g.,":[84,205],"GPU)":[85],"take":[87],"long":[89],"time":[90,212],"profiling":[93,215],"process.":[94],"We":[95],"propose":[96],"HyPA,":[97],"hybrid":[99],"Parallel":[100],"Thread":[101],"Execution":[102],"(PTX)":[103],"Analyzer":[104],"that":[105,118,121,142],"inspects":[106],"PTX":[107,132,154,165],"statically":[109],"dynamically.":[111],"HyPA":[112,183,229],"implements":[113],"partly":[115],"functional":[116,158],"emulator":[117,159],"executes":[119,138],"instructions":[120,133],"rely":[122],"runtime":[124],"dependencies":[125],"count":[127],"number":[129],"executed":[131],"divergent":[135],"branches.":[136],"HyPa":[137],"compiled":[139],"kernels\u2014the":[140],"programs":[141],"run":[143],"GPUs\u2014generated":[145],"by":[146,208,214],"CUDA":[148],"compiler":[149],"supports":[151],"full":[153],"7.7":[155],"specification.":[156],"Our":[157],"allows":[160],"significantly":[161],"faster":[162,210],"analysis":[163,204],"standard":[169],"profilers.":[170],"In":[171],"our":[172,196],"evaluation,":[173],"we":[174,223],"quantify":[175],"this":[176],"increase":[177],"performance":[179,200],"through":[180],"benchmark":[181],"runs.":[182],"achieved":[184],"536%":[189],"nvprof":[193],"profiler.":[194],"Moreover,":[195],"approach":[197],"gather":[199],"metrics":[201],"beyond":[202],"static":[203],"branch":[206],"efficiency)":[207],"execution":[211],"than":[213],"application":[217],"device.":[221],"Finally,":[222],"provide":[224],"open-source":[226],"implementation":[227],"help":[231],"system":[234],"designers":[235],"further":[237],"research":[238],"development.":[240]},"counts_by_year":[{"year":2024,"cited_by_count":1}],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2025-10-10T00:00:00"}
