{"id":"https://openalex.org/W2088754683","doi":"https://doi.org/10.1145/2579617","title":"Leveraging GPUs using cooperative loop speculation","display_name":"Leveraging GPUs using cooperative loop speculation","publication_year":2014,"publication_date":"2014-02-01","ids":{"openalex":"https://openalex.org/W2088754683","doi":"https://doi.org/10.1145/2579617","mag":"2088754683"},"language":"en","primary_location":{"id":"doi:10.1145/2579617","is_oa":true,"landing_page_url":"https://doi.org/10.1145/2579617","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/2579617","source":{"id":"https://openalex.org/S26056741","display_name":"ACM Transactions on Architecture and Code Optimization","issn_l":"1544-3566","issn":["1544-3566","1544-3973"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Architecture and Code Optimization","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"bronze","oa_url":"https://dl.acm.org/doi/pdf/10.1145/2579617","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5058642630","display_name":"Mehrzad Samadi","orcid":"https://orcid.org/0000-0002-3581-1255"},"institutions":[{"id":"https://openalex.org/I27837315","display_name":"University of Michigan\u2013Ann Arbor","ror":"https://ror.org/00jmfr291","country_code":"US","type":"education","lineage":["https://openalex.org/I27837315"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Mehrzad Samadi","raw_affiliation_strings":["University of Michigan, Ann Arbor, MI"],"affiliations":[{"raw_affiliation_string":"University of Michigan, Ann Arbor, MI","institution_ids":["https://openalex.org/I27837315"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5071212745","display_name":"Amir Hormati","orcid":"https://orcid.org/0009-0002-5786-3301"},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Amir Hormati","raw_affiliation_strings":["Google Inc., Seattle, WA"],"affiliations":[{"raw_affiliation_string":"Google Inc., Seattle, WA","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5065946847","display_name":"Janghaeng Lee","orcid":"https://orcid.org/0009-0005-1915-6077"},"institutions":[{"id":"https://openalex.org/I27837315","display_name":"University of Michigan\u2013Ann Arbor","ror":"https://ror.org/00jmfr291","country_code":"US","type":"education","lineage":["https://openalex.org/I27837315"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Janghaeng Lee","raw_affiliation_strings":["University of Michigan, Ann Arbor, MI"],"affiliations":[{"raw_affiliation_string":"University of Michigan, Ann Arbor, MI","institution_ids":["https://openalex.org/I27837315"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5002075773","display_name":"Scott Mahlke","orcid":"https://orcid.org/0000-0002-0438-0616"},"institutions":[{"id":"https://openalex.org/I27837315","display_name":"University of Michigan\u2013Ann Arbor","ror":"https://ror.org/00jmfr291","country_code":"US","type":"education","lineage":["https://openalex.org/I27837315"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Scott Mahlke","raw_affiliation_strings":["University of Michigan, Ann Arbor, MI"],"affiliations":[{"raw_affiliation_string":"University of Michigan, Ann Arbor, MI","institution_ids":["https://openalex.org/I27837315"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5058642630"],"corresponding_institution_ids":["https://openalex.org/I27837315"],"apc_list":null,"apc_paid":null,"fwci":0.315,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.61878783,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":94},"biblio":{"volume":"11","issue":"1","first_page":"1","last_page":"26"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9976000189781189,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.9970999956130981,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.9177353382110596},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.8260345458984375},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.8054367303848267},{"id":"https://openalex.org/keywords/compiler","display_name":"Compiler","score":0.6595492959022522},{"id":"https://openalex.org/keywords/cuda","display_name":"CUDA","score":0.6164953112602234},{"id":"https://openalex.org/keywords/central-processing-unit","display_name":"Central processing unit","score":0.5313127040863037},{"id":"https://openalex.org/keywords/overhead","display_name":"Overhead (engineering)","score":0.5278487205505371},{"id":"https://openalex.org/keywords/speculative-execution","display_name":"Speculative execution","score":0.44657060503959656},{"id":"https://openalex.org/keywords/graphics","display_name":"Graphics","score":0.42543458938598633},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.22447702288627625}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.9177353382110596},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.8260345458984375},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.8054367303848267},{"id":"https://openalex.org/C169590947","wikidata":"https://www.wikidata.org/wiki/Q47506","display_name":"Compiler","level":2,"score":0.6595492959022522},{"id":"https://openalex.org/C2778119891","wikidata":"https://www.wikidata.org/wiki/Q477690","display_name":"CUDA","level":2,"score":0.6164953112602234},{"id":"https://openalex.org/C49154492","wikidata":"https://www.wikidata.org/wiki/Q5300","display_name":"Central processing unit","level":2,"score":0.5313127040863037},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.5278487205505371},{"id":"https://openalex.org/C141331961","wikidata":"https://www.wikidata.org/wiki/Q2164465","display_name":"Speculative execution","level":2,"score":0.44657060503959656},{"id":"https://openalex.org/C21442007","wikidata":"https://www.wikidata.org/wiki/Q1027879","display_name":"Graphics","level":2,"score":0.42543458938598633},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.22447702288627625}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1145/2579617","is_oa":true,"landing_page_url":"https://doi.org/10.1145/2579617","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/2579617","source":{"id":"https://openalex.org/S26056741","display_name":"ACM Transactions on Architecture and Code Optimization","issn_l":"1544-3566","issn":["1544-3566","1544-3973"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Architecture and Code Optimization","raw_type":"journal-article"},{"id":"pmh:oai:CiteSeerX.psu:10.1.1.648.2393","is_oa":false,"landing_page_url":"http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.648.2393","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"http://cccp.eecs.umich.edu/papers/samadi-taco14.pdf","raw_type":"text"}],"best_oa_location":{"id":"doi:10.1145/2579617","is_oa":true,"landing_page_url":"https://doi.org/10.1145/2579617","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/2579617","source":{"id":"https://openalex.org/S26056741","display_name":"ACM Transactions on Architecture and Code Optimization","issn_l":"1544-3566","issn":["1544-3566","1544-3973"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Architecture and Code Optimization","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W2088754683.pdf","grobid_xml":"https://content.openalex.org/works/W2088754683.grobid-xml"},"referenced_works_count":45,"referenced_works":["https://openalex.org/W1507552563","https://openalex.org/W1517684295","https://openalex.org/W1604019176","https://openalex.org/W1715366078","https://openalex.org/W1782174992","https://openalex.org/W1980999196","https://openalex.org/W1982123268","https://openalex.org/W1994316441","https://openalex.org/W2001784723","https://openalex.org/W2008258200","https://openalex.org/W2016888570","https://openalex.org/W2031775479","https://openalex.org/W2037462607","https://openalex.org/W2038198320","https://openalex.org/W2083056254","https://openalex.org/W2097061393","https://openalex.org/W2097447543","https://openalex.org/W2097717378","https://openalex.org/W2098426571","https://openalex.org/W2101209730","https://openalex.org/W2102922928","https://openalex.org/W2106579652","https://openalex.org/W2107497336","https://openalex.org/W2112482891","https://openalex.org/W2116170858","https://openalex.org/W2117610141","https://openalex.org/W2117689653","https://openalex.org/W2123790465","https://openalex.org/W2123845384","https://openalex.org/W2126501440","https://openalex.org/W2128329055","https://openalex.org/W2133734540","https://openalex.org/W2134616937","https://openalex.org/W2146081545","https://openalex.org/W2148255965","https://openalex.org/W2152657019","https://openalex.org/W2153492376","https://openalex.org/W2159481344","https://openalex.org/W2163229756","https://openalex.org/W2164106630","https://openalex.org/W2168921806","https://openalex.org/W2178550486","https://openalex.org/W3007272028","https://openalex.org/W3203568064","https://openalex.org/W4238982306"],"related_works":["https://openalex.org/W2058965144","https://openalex.org/W2164382479","https://openalex.org/W98480971","https://openalex.org/W2150291671","https://openalex.org/W2027972911","https://openalex.org/W2146343568","https://openalex.org/W2013643406","https://openalex.org/W2983282793","https://openalex.org/W2778498407","https://openalex.org/W3000570965"],"abstract_inverted_index":{"Graphics":[0],"processing":[1],"units,":[2],"or":[3],"GPUs,":[4,178],"provide":[5],"TFLOPs":[6],"of":[7,26,42,73,81,113,150,242],"additional":[8],"performance":[9,61],"potential":[10],"in":[11,90,132,186],"commodity":[12],"computer":[13],"systems":[14],"that":[15,64,208],"frequently":[16],"go":[17],"unused":[18],"by":[19,54],"most":[20],"applications.":[21,247],"Even":[22],"with":[23,121,224],"the":[24,45,59,79,117,122,130,142,148,154,157,162,167,181,184,194,197,200,203],"emergence":[25],"languages":[27],"such":[28,126],"as":[29,56,58],"CUDA":[30],"and":[31,49,144,164,179,214,227,231],"OpenCL,":[32],"programming":[33],"GPUs":[34,82],"remains":[35],"a":[36,40,103,138,170,188,240],"difficult":[37],"challenge":[38],"for":[39,156,177],"variety":[41],"reasons,":[43],"including":[44],"inherent":[46],"algorithmic":[47],"characteristics":[48],"data":[50],"structure":[51],"choices":[52],"used":[53],"applications":[55,88,115],"well":[57],"tedious":[60],"optimization":[62],"cycle":[63],"is":[65,76,190],"necessary":[66],"to":[67,77,85,107,146,183,216,220,233],"achieve":[68,96],"high":[69],"performance.":[70],"The":[71],"goal":[72],"this":[74,97],"work":[75],"increase":[78],"applicability":[80],"beyond":[83],"CUDA/OpenCL":[84],"implicitly":[86,245],"data-parallel":[87,111,246],"written":[89],"C/C++":[91],"using":[92,169],"speculative":[93],"parallelization.":[94],"To":[95],"goal,":[98],"we":[99],"propose":[100],"Paragon":[101,128,152,192,209],":":[102],"static/dynamic":[104],"compiler":[105],"platform":[106],"speculatively":[108,160],"run":[109],"possibly":[110],"portions":[112],"sequential":[114,237,243],"on":[116,161,166,196,212,229],"GPU":[118,131,145,163,198],"while":[119,136],"cooperating":[120],"system":[123],"CPU.":[124],"For":[125],"loops,":[127],"utilizes":[129],"an":[133],"opportunistic":[134],"way":[135],"orchestrating":[137],"cooperative":[139],"relation":[140],"between":[141],"CPU":[143,168,185,201,222],"reduce":[147],"overhead":[149],"miss-speculations.":[151],"monitors":[153],"dependencies":[155],"loops":[158],"running":[159],"nonspeculatively":[165],"lightweight":[171],"distributed":[172],"conflict":[173,189],"detection":[174],"designed":[175],"specifically":[176],"transfers":[180],"execution":[182,195,223,238],"case":[187],"detected.":[191],"resumes":[193],"after":[199],"resolves":[202],"dependency.":[204],"Our":[205],"experiments":[206],"show":[207],"achieves":[210],"4x":[211],"average":[213,230],"up":[215,232],"30x":[217],"speedup":[218,235],"compared":[219],"unsafe":[221],"four":[225],"threads":[226],"7x":[228],"64x":[234],"versus":[236],"across":[239],"set":[241],"but":[244]},"counts_by_year":[{"year":2016,"cited_by_count":1}],"updated_date":"2026-04-05T17:49:38.594831","created_date":"2025-10-10T00:00:00"}
