{"id":"https://openalex.org/W2315868086","doi":"https://doi.org/10.1109/hpca.2016.7446062","title":"Warped-preexecution: A GPU pre-execution approach for improving latency hiding","display_name":"Warped-preexecution: A GPU pre-execution approach for improving latency hiding","publication_year":2016,"publication_date":"2016-03-01","ids":{"openalex":"https://openalex.org/W2315868086","doi":"https://doi.org/10.1109/hpca.2016.7446062","mag":"2315868086"},"language":"en","primary_location":{"id":"doi:10.1109/hpca.2016.7446062","is_oa":false,"landing_page_url":"https://doi.org/10.1109/hpca.2016.7446062","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2016 IEEE International Symposium on High Performance Computer Architecture (HPCA)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5080401771","display_name":"Keun\u2010Soo Kim","orcid":"https://orcid.org/0000-0002-4901-6156"},"institutions":[{"id":"https://openalex.org/I193775966","display_name":"Yonsei University","ror":"https://ror.org/01wjejq96","country_code":"KR","type":"education","lineage":["https://openalex.org/I193775966"]}],"countries":["KR"],"is_corresponding":true,"raw_author_name":"Keunsoo Kim","raw_affiliation_strings":["School of Electrical and Electronic Engineering, Yonsei University"],"affiliations":[{"raw_affiliation_string":"School of Electrical and Electronic Engineering, Yonsei University","institution_ids":["https://openalex.org/I193775966"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5028332065","display_name":"Phil Lee","orcid":"https://orcid.org/0000-0002-2087-8887"},"institutions":[{"id":"https://openalex.org/I193775966","display_name":"Yonsei University","ror":"https://ror.org/01wjejq96","country_code":"KR","type":"education","lineage":["https://openalex.org/I193775966"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Sangpil Lee","raw_affiliation_strings":["School of Electrical and Electronic Engineering, Yonsei University"],"affiliations":[{"raw_affiliation_string":"School of Electrical and Electronic Engineering, Yonsei University","institution_ids":["https://openalex.org/I193775966"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5040253395","display_name":"Myung Kuk Yoon","orcid":"https://orcid.org/0000-0002-9332-0251"},"institutions":[{"id":"https://openalex.org/I193775966","display_name":"Yonsei University","ror":"https://ror.org/01wjejq96","country_code":"KR","type":"education","lineage":["https://openalex.org/I193775966"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Myung Kuk Yoon","raw_affiliation_strings":["School of Electrical and Electronic Engineering, Yonsei University"],"affiliations":[{"raw_affiliation_string":"School of Electrical and Electronic Engineering, Yonsei University","institution_ids":["https://openalex.org/I193775966"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5055061586","display_name":"Gunjae Koo","orcid":"https://orcid.org/0000-0003-1706-6850"},"institutions":[{"id":"https://openalex.org/I1174212","display_name":"University of Southern California","ror":"https://ror.org/03taz7m60","country_code":"US","type":"education","lineage":["https://openalex.org/I1174212"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Gunjae Koo","raw_affiliation_strings":["University of Southern California, Los Angeles, CA, US"],"affiliations":[{"raw_affiliation_string":"University of Southern California, Los Angeles, CA, US","institution_ids":["https://openalex.org/I1174212"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5017913155","display_name":"Won Woo Ro","orcid":"https://orcid.org/0000-0001-5390-6445"},"institutions":[{"id":"https://openalex.org/I193775966","display_name":"Yonsei University","ror":"https://ror.org/01wjejq96","country_code":"KR","type":"education","lineage":["https://openalex.org/I193775966"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Won Woo Ro","raw_affiliation_strings":["School of Electrical and Electronic Engineering, Yonsei University"],"affiliations":[{"raw_affiliation_string":"School of Electrical and Electronic Engineering, Yonsei University","institution_ids":["https://openalex.org/I193775966"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5018033573","display_name":"Murali Annavaram","orcid":"https://orcid.org/0000-0002-4633-6867"},"institutions":[{"id":"https://openalex.org/I1174212","display_name":"University of Southern California","ror":"https://ror.org/03taz7m60","country_code":"US","type":"education","lineage":["https://openalex.org/I1174212"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Murali Annavaram","raw_affiliation_strings":["Ming Hsieh Department of Electrical Engineering, University of Southern California"],"affiliations":[{"raw_affiliation_string":"Ming Hsieh Department of Electrical Engineering, University of Southern California","institution_ids":["https://openalex.org/I1174212"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5080401771"],"corresponding_institution_ids":["https://openalex.org/I193775966"],"apc_list":null,"apc_paid":null,"fwci":8.38815246,"has_fulltext":false,"cited_by_count":50,"citation_normalized_percentile":{"value":0.98556362,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":93,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"163","last_page":"175"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10502","display_name":"Advanced Memory and Neural Computing","score":0.9944999814033508,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8952711820602417},{"id":"https://openalex.org/keywords/thread","display_name":"Thread (computing)","score":0.8276181817054749},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.6727036237716675},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.6685255169868469},{"id":"https://openalex.org/keywords/cache","display_name":"Cache","score":0.5438935160636902},{"id":"https://openalex.org/keywords/concurrency","display_name":"Concurrency","score":0.5360301733016968},{"id":"https://openalex.org/keywords/register-file","display_name":"Register file","score":0.5018110275268555},{"id":"https://openalex.org/keywords/cache-coherence","display_name":"Cache coherence","score":0.49948596954345703},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.4876317083835602},{"id":"https://openalex.org/keywords/multithreading","display_name":"Multithreading","score":0.474817156791687},{"id":"https://openalex.org/keywords/mode","display_name":"Mode (computer interface)","score":0.43209031224250793},{"id":"https://openalex.org/keywords/embedded-system","display_name":"Embedded system","score":0.36923113465309143},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.36739641427993774},{"id":"https://openalex.org/keywords/instruction-set","display_name":"Instruction set","score":0.3520740866661072},{"id":"https://openalex.org/keywords/cpu-cache","display_name":"CPU cache","score":0.3508639335632324},{"id":"https://openalex.org/keywords/cache-algorithms","display_name":"Cache algorithms","score":0.13670819997787476}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8952711820602417},{"id":"https://openalex.org/C138101251","wikidata":"https://www.wikidata.org/wiki/Q213092","display_name":"Thread (computing)","level":2,"score":0.8276181817054749},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.6727036237716675},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.6685255169868469},{"id":"https://openalex.org/C115537543","wikidata":"https://www.wikidata.org/wiki/Q165596","display_name":"Cache","level":2,"score":0.5438935160636902},{"id":"https://openalex.org/C193702766","wikidata":"https://www.wikidata.org/wiki/Q1414548","display_name":"Concurrency","level":2,"score":0.5360301733016968},{"id":"https://openalex.org/C117280010","wikidata":"https://www.wikidata.org/wiki/Q180944","display_name":"Register file","level":3,"score":0.5018110275268555},{"id":"https://openalex.org/C141917322","wikidata":"https://www.wikidata.org/wiki/Q1025017","display_name":"Cache coherence","level":5,"score":0.49948596954345703},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.4876317083835602},{"id":"https://openalex.org/C201410400","wikidata":"https://www.wikidata.org/wiki/Q1064412","display_name":"Multithreading","level":3,"score":0.474817156791687},{"id":"https://openalex.org/C48677424","wikidata":"https://www.wikidata.org/wiki/Q6888088","display_name":"Mode (computer interface)","level":2,"score":0.43209031224250793},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.36923113465309143},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.36739641427993774},{"id":"https://openalex.org/C202491316","wikidata":"https://www.wikidata.org/wiki/Q272683","display_name":"Instruction set","level":2,"score":0.3520740866661072},{"id":"https://openalex.org/C189783530","wikidata":"https://www.wikidata.org/wiki/Q352090","display_name":"CPU cache","level":3,"score":0.3508639335632324},{"id":"https://openalex.org/C38556500","wikidata":"https://www.wikidata.org/wiki/Q13404475","display_name":"Cache algorithms","level":4,"score":0.13670819997787476},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.0},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/hpca.2016.7446062","is_oa":false,"landing_page_url":"https://doi.org/10.1109/hpca.2016.7446062","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2016 IEEE International Symposium on High Performance Computer Architecture (HPCA)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.4399999976158142,"display_name":"Peace, Justice and strong institutions","id":"https://metadata.un.org/sdg/16"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":53,"referenced_works":["https://openalex.org/W1970815868","https://openalex.org/W1975371390","https://openalex.org/W1979527452","https://openalex.org/W1982123268","https://openalex.org/W1991518265","https://openalex.org/W2000335122","https://openalex.org/W2018150881","https://openalex.org/W2023925637","https://openalex.org/W2031502045","https://openalex.org/W2034639175","https://openalex.org/W2038178246","https://openalex.org/W2047060659","https://openalex.org/W2048441570","https://openalex.org/W2053744175","https://openalex.org/W2059301531","https://openalex.org/W2079038734","https://openalex.org/W2079248286","https://openalex.org/W2080592089","https://openalex.org/W2081583983","https://openalex.org/W2090584832","https://openalex.org/W2093043622","https://openalex.org/W2096661534","https://openalex.org/W2108039095","https://openalex.org/W2110195531","https://openalex.org/W2118859527","https://openalex.org/W2123608497","https://openalex.org/W2125979435","https://openalex.org/W2139427807","https://openalex.org/W2142119745","https://openalex.org/W2142444503","https://openalex.org/W2149379863","https://openalex.org/W2152956697","https://openalex.org/W2155356535","https://openalex.org/W2156831150","https://openalex.org/W2160428323","https://openalex.org/W2161864047","https://openalex.org/W2165423885","https://openalex.org/W2166918318","https://openalex.org/W2167100385","https://openalex.org/W2168214303","https://openalex.org/W2168872232","https://openalex.org/W2169880332","https://openalex.org/W2238992335","https://openalex.org/W2273440736","https://openalex.org/W3137551601","https://openalex.org/W3138340923","https://openalex.org/W4236345830","https://openalex.org/W4236355366","https://openalex.org/W4242976792","https://openalex.org/W4243665729","https://openalex.org/W6678660376","https://openalex.org/W6694513646","https://openalex.org/W6792253866"],"related_works":["https://openalex.org/W2150450196","https://openalex.org/W1558769186","https://openalex.org/W4239584669","https://openalex.org/W2045555750","https://openalex.org/W2103261828","https://openalex.org/W4250432526","https://openalex.org/W1521414776","https://openalex.org/W2286165368","https://openalex.org/W2101536355","https://openalex.org/W2009783759"],"abstract_inverted_index":{"This":[0],"paper":[1],"presents":[2],"a":[3,16,87,92,100,153,163,191],"pre-execution":[4],"approach":[5],"for":[6,21,213],"improving":[7],"GPU":[8],"performance,":[9],"called":[10],"P-mode":[11,136,160,168,187],"(pre-execution":[12],"mode).":[13],"GPUs":[14],"utilize":[15],"number":[17],"of":[18,25,39,50],"concurrent":[19],"threads":[20,64,77],"hiding":[22],"processing":[23],"delay":[24],"operations.":[26],"However,":[27],"certain":[28],"long-latency":[29,93],"operations":[30],"such":[31],"as":[32],"off-chip":[33],"memory":[34,72,203,214],"accesses":[35],"often":[36],"take":[37],"hundreds":[38],"cycles":[40],"and":[41,53,105,132],"hence":[42],"leads":[43],"to":[44,70,103,109,141,151,157,169,200],"stalls":[45],"even":[46],"in":[47,186],"the":[48,118,147,159,179,197],"presence":[49],"thread":[51,55],"concurrency":[52],"fast":[54],"switching":[56],"capability.":[57],"It":[58],"is":[59,89,115,165,188],"unclear":[60],"if":[61],"adding":[62,75],"more":[63,76],"can":[65],"improve":[66],"latency":[67,120],"tolerance":[68],"due":[69],"increased":[71],"contention.":[73],"Further,":[74],"increases":[78],"on-chip":[79],"storage":[80],"demands.":[81],"Instead":[82],"we":[83],"propose":[84],"that":[85,114],"when":[86],"warp":[88,101,164],"stalled":[90],"on":[91,117],"operation":[94,185],"it":[95,173],"enters":[96],"P-mode.":[97],"In":[98],"P-mode,":[99],"continues":[102],"fetch":[104],"decode":[106],"successive":[107],"instructions":[108,125],"identify":[110],"any":[111],"independent":[112,124],"instruction":[113],"not":[116],"long":[119],"dependence":[121],"chain.":[122],"These":[123],"are":[126,139],"then":[127],"pre-executed.":[128],"To":[129],"tackle":[130],"write-after-write":[131],"write-after-read":[133],"hazards,":[134],"during":[135],"output":[137],"values":[138],"written":[140],"renamed":[142,180],"physical":[143],"registers.":[144,181],"We":[145],"exploit":[146],"register":[148],"file":[149],"underutilization":[150],"re-purpose":[152],"few":[154],"unused":[155],"registers":[156],"store":[158],"results.":[161],"When":[162],"switched":[166],"from":[167],"normal":[170],"execution":[171],"mode":[172],"reuses":[174],"pre-executed":[175],"results":[176,208],"by":[177],"reading":[178],"Any":[182],"global":[183],"load":[184],"transformed":[189],"into":[190,196],"pre-load":[192],"which":[193],"fetches":[194],"data":[195],"L1":[198],"cache":[199],"reduce":[201],"future":[202],"access":[204],"penalties.":[205],"Our":[206],"evaluation":[207],"show":[209],"23%":[210],"performance":[211],"improvement":[212],"intensive":[215],"applications,":[216],"without":[217],"negatively":[218],"impacting":[219],"other":[220],"application":[221],"categories.":[222]},"counts_by_year":[{"year":2025,"cited_by_count":4},{"year":2024,"cited_by_count":4},{"year":2023,"cited_by_count":8},{"year":2021,"cited_by_count":2},{"year":2020,"cited_by_count":3},{"year":2019,"cited_by_count":6},{"year":2018,"cited_by_count":7},{"year":2017,"cited_by_count":11},{"year":2016,"cited_by_count":5}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
