{"id":"https://openalex.org/W3133174490","doi":"https://doi.org/10.1145/3446382.3448606","title":"Minimizing GPU Kernel Launch Overhead in Deep Learning Inference on Mobile GPUs","display_name":"Minimizing GPU Kernel Launch Overhead in Deep Learning Inference on Mobile GPUs","publication_year":2021,"publication_date":"2021-02-20","ids":{"openalex":"https://openalex.org/W3133174490","doi":"https://doi.org/10.1145/3446382.3448606","mag":"3133174490"},"language":"en","primary_location":{"id":"doi:10.1145/3446382.3448606","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3446382.3448606","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 22nd International Workshop on Mobile Computing Systems and Applications","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100429096","display_name":"Sumin Kim","orcid":"https://orcid.org/0000-0003-1813-1706"},"institutions":[{"id":"https://openalex.org/I124633538","display_name":"University of Seoul","ror":"https://ror.org/05en5nh73","country_code":"KR","type":"education","lineage":["https://openalex.org/I124633538"]}],"countries":["KR"],"is_corresponding":true,"raw_author_name":"Sumin Kim","raw_affiliation_strings":["University of Seoul"],"affiliations":[{"raw_affiliation_string":"University of Seoul","institution_ids":["https://openalex.org/I124633538"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102017960","display_name":"Seunghwan Oh","orcid":"https://orcid.org/0000-0002-5794-9829"},"institutions":[{"id":"https://openalex.org/I124633538","display_name":"University of Seoul","ror":"https://ror.org/05en5nh73","country_code":"KR","type":"education","lineage":["https://openalex.org/I124633538"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Seunghwan Oh","raw_affiliation_strings":["University of Seoul"],"affiliations":[{"raw_affiliation_string":"University of Seoul","institution_ids":["https://openalex.org/I124633538"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5029634385","display_name":"Youngmin Yi","orcid":"https://orcid.org/0000-0001-9802-2109"},"institutions":[{"id":"https://openalex.org/I124633538","display_name":"University of Seoul","ror":"https://ror.org/05en5nh73","country_code":"KR","type":"education","lineage":["https://openalex.org/I124633538"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Youngmin Yi","raw_affiliation_strings":["University of Seoul"],"affiliations":[{"raw_affiliation_string":"University of Seoul","institution_ids":["https://openalex.org/I124633538"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5100429096"],"corresponding_institution_ids":["https://openalex.org/I124633538"],"apc_list":null,"apc_paid":null,"fwci":1.5371,"has_fulltext":false,"cited_by_count":19,"citation_normalized_percentile":{"value":0.84576797,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":95,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"57","last_page":"63"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10502","display_name":"Advanced Memory and Neural Computing","score":0.9918000102043152,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11992","display_name":"CCD and CMOS Imaging Sensors","score":0.9901999831199646,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8242852687835693},{"id":"https://openalex.org/keywords/kernel","display_name":"Kernel (algebra)","score":0.7678889632225037},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.7603061199188232},{"id":"https://openalex.org/keywords/overhead","display_name":"Overhead (engineering)","score":0.7468976378440857},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.669536828994751},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5155191421508789},{"id":"https://openalex.org/keywords/general-purpose-computing-on-graphics-processing-units","display_name":"General-purpose computing on graphics processing units","score":0.48471736907958984},{"id":"https://openalex.org/keywords/mobile-device","display_name":"Mobile device","score":0.4501681923866272},{"id":"https://openalex.org/keywords/cuda","display_name":"CUDA","score":0.4321726858615875},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.3971296548843384},{"id":"https://openalex.org/keywords/embedded-system","display_name":"Embedded system","score":0.3528951406478882},{"id":"https://openalex.org/keywords/computer-engineering","display_name":"Computer engineering","score":0.34526997804641724},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.12462180852890015}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8242852687835693},{"id":"https://openalex.org/C74193536","wikidata":"https://www.wikidata.org/wiki/Q574844","display_name":"Kernel (algebra)","level":2,"score":0.7678889632225037},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.7603061199188232},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.7468976378440857},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.669536828994751},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5155191421508789},{"id":"https://openalex.org/C50630238","wikidata":"https://www.wikidata.org/wiki/Q971505","display_name":"General-purpose computing on graphics processing units","level":3,"score":0.48471736907958984},{"id":"https://openalex.org/C186967261","wikidata":"https://www.wikidata.org/wiki/Q5082128","display_name":"Mobile device","level":2,"score":0.4501681923866272},{"id":"https://openalex.org/C2778119891","wikidata":"https://www.wikidata.org/wiki/Q477690","display_name":"CUDA","level":2,"score":0.4321726858615875},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.3971296548843384},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.3528951406478882},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.34526997804641724},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.12462180852890015},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0},{"id":"https://openalex.org/C21442007","wikidata":"https://www.wikidata.org/wiki/Q1027879","display_name":"Graphics","level":2,"score":0.0},{"id":"https://openalex.org/C114614502","wikidata":"https://www.wikidata.org/wiki/Q76592","display_name":"Combinatorics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3446382.3448606","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3446382.3448606","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 22nd International Workshop on Mobile Computing Systems and Applications","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":8,"referenced_works":["https://openalex.org/W2279098554","https://openalex.org/W2399715892","https://openalex.org/W2612445135","https://openalex.org/W2895432151","https://openalex.org/W2963122961","https://openalex.org/W2971002981","https://openalex.org/W3032392568","https://openalex.org/W3105888187"],"related_works":["https://openalex.org/W1963859303","https://openalex.org/W2364044215","https://openalex.org/W2389600408","https://openalex.org/W240129890","https://openalex.org/W3048701459","https://openalex.org/W2149078538","https://openalex.org/W2370314112","https://openalex.org/W1912958759","https://openalex.org/W2792081825","https://openalex.org/W2893308117"],"abstract_inverted_index":{"The":[0],"need":[1],"for":[2,76,105],"on-device":[3],"real-time":[4,46],"Deep":[5],"Learning":[6],"inference":[7,47,58,133],"is":[8,29],"increasing":[9],"as":[10,17],"deep":[11],"learning":[12],"on":[13,27,65,144],"edge":[14],"devices":[15],"such":[16],"smartphones":[18],"and":[19,70,127,140,148],"robots":[20],"are":[21,37],"becoming":[22],"popular.":[23],"Although":[24],"hardware":[25],"acceleration":[26],"NPU":[28],"attracting":[30],"more":[31],"attention,":[32],"the":[33,42,57,61,66,77,88,102,106,113,132],"recent":[34,67],"mobile":[35,68],"GPUs":[36,69],"fast":[38],"enough":[39],"to":[40,44,112,125],"provide":[41],"potential":[43],"achieve":[45,123],"of":[48,60,129,134],"many":[49],"CNNs.":[50],"In":[51],"this":[52],"paper,":[53],"we":[54,82,94,121],"first":[55],"analyze":[56],"time":[59],"widely":[62],"used":[63],"CNNs":[64,136],"reveal":[71],"that":[72,86,99,109,120],"significant":[73],"overhead":[74],"exists":[75],"GPU":[78,147],"kernel":[79,89,107],"launches.":[80],"Then,":[81],"identify":[83],"various":[84,135],"factors":[85],"cause":[87],"launch":[90],"overhead,":[91],"from":[92],"which":[93],"formulate":[95],"a":[96],"performance":[97],"model":[98],"can":[100,110],"predict":[101],"optimal":[103],"period":[104],"flush":[108],"lead":[111],"minimal":[114],"overhead.":[115],"Our":[116],"experimental":[117],"results":[118],"show":[119],"could":[122],"up":[124],"64%":[126],"31%":[128],"speedups":[130],"in":[131],"with":[137],"TensorFlow":[138],"Lite":[139],"ARM":[141],"Compute":[142],"Library":[143],"Adreno":[145],"650":[146],"Mali":[149],"G76":[150],"GPU.":[151]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":3},{"year":2022,"cited_by_count":10},{"year":2021,"cited_by_count":3}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
