{"id":"https://openalex.org/W2236252626","doi":"https://doi.org/10.1145/2830772.2830796","title":"Efficient warp execution in presence of divergence with collaborative context collection","display_name":"Efficient warp execution in presence of divergence with collaborative context collection","publication_year":2015,"publication_date":"2015-12-05","ids":{"openalex":"https://openalex.org/W2236252626","doi":"https://doi.org/10.1145/2830772.2830796","mag":"2236252626"},"language":"en","primary_location":{"id":"doi:10.1145/2830772.2830796","is_oa":true,"landing_page_url":"https://doi.org/10.1145/2830772.2830796","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/2830772.2830796","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 48th International Symposium on Microarchitecture","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.1145/2830772.2830796","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5036836220","display_name":"Farzad Khorasani","orcid":null},"institutions":[{"id":"https://openalex.org/I103635307","display_name":"University of California, Riverside","ror":"https://ror.org/03nawhv43","country_code":"US","type":"education","lineage":["https://openalex.org/I103635307"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Farzad Khorasani","raw_affiliation_strings":["University of California, Riverside, CA"],"affiliations":[{"raw_affiliation_string":"University of California, Riverside, CA","institution_ids":["https://openalex.org/I103635307"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100699251","display_name":"Rajiv Gupta","orcid":"https://orcid.org/0000-0002-9348-3974"},"institutions":[{"id":"https://openalex.org/I103635307","display_name":"University of California, Riverside","ror":"https://ror.org/03nawhv43","country_code":"US","type":"education","lineage":["https://openalex.org/I103635307"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Rajiv Gupta","raw_affiliation_strings":["University of California, Riverside, CA"],"affiliations":[{"raw_affiliation_string":"University of California, Riverside, CA","institution_ids":["https://openalex.org/I103635307"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5048949780","display_name":"Laxmi N. Bhuyan","orcid":"https://orcid.org/0000-0002-8759-0458"},"institutions":[{"id":"https://openalex.org/I103635307","display_name":"University of California, Riverside","ror":"https://ror.org/03nawhv43","country_code":"US","type":"education","lineage":["https://openalex.org/I103635307"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Laxmi N. Bhuyan","raw_affiliation_strings":["University of California, Riverside, CA"],"affiliations":[{"raw_affiliation_string":"University of California, Riverside, CA","institution_ids":["https://openalex.org/I103635307"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5036836220"],"corresponding_institution_ids":["https://openalex.org/I103635307"],"apc_list":null,"apc_paid":null,"fwci":6.5971,"has_fulltext":true,"cited_by_count":34,"citation_normalized_percentile":{"value":0.97106973,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":94,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"204","last_page":"215"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.9980999827384949,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8674681186676025},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.7893720865249634},{"id":"https://openalex.org/keywords/thread","display_name":"Thread (computing)","score":0.6750968098640442},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.5920443534851074},{"id":"https://openalex.org/keywords/compiler","display_name":"Compiler","score":0.5575736165046692},{"id":"https://openalex.org/keywords/control-flow","display_name":"Control flow","score":0.5255511999130249},{"id":"https://openalex.org/keywords/simd","display_name":"SIMD","score":0.4732019603252411},{"id":"https://openalex.org/keywords/context-switch","display_name":"Context switch","score":0.44418448209762573},{"id":"https://openalex.org/keywords/instruction-set","display_name":"Instruction set","score":0.4283791184425354},{"id":"https://openalex.org/keywords/multithreading","display_name":"Multithreading","score":0.4264149069786072},{"id":"https://openalex.org/keywords/execution-model","display_name":"Execution model","score":0.4212344288825989},{"id":"https://openalex.org/keywords/cuda","display_name":"CUDA","score":0.4197118282318115},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.323000431060791},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.3218782842159271}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8674681186676025},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.7893720865249634},{"id":"https://openalex.org/C138101251","wikidata":"https://www.wikidata.org/wiki/Q213092","display_name":"Thread (computing)","level":2,"score":0.6750968098640442},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.5920443534851074},{"id":"https://openalex.org/C169590947","wikidata":"https://www.wikidata.org/wiki/Q47506","display_name":"Compiler","level":2,"score":0.5575736165046692},{"id":"https://openalex.org/C160191386","wikidata":"https://www.wikidata.org/wiki/Q868299","display_name":"Control flow","level":2,"score":0.5255511999130249},{"id":"https://openalex.org/C150552126","wikidata":"https://www.wikidata.org/wiki/Q339387","display_name":"SIMD","level":2,"score":0.4732019603252411},{"id":"https://openalex.org/C53833338","wikidata":"https://www.wikidata.org/wiki/Q1061424","display_name":"Context switch","level":2,"score":0.44418448209762573},{"id":"https://openalex.org/C202491316","wikidata":"https://www.wikidata.org/wiki/Q272683","display_name":"Instruction set","level":2,"score":0.4283791184425354},{"id":"https://openalex.org/C201410400","wikidata":"https://www.wikidata.org/wiki/Q1064412","display_name":"Multithreading","level":3,"score":0.4264149069786072},{"id":"https://openalex.org/C2776834041","wikidata":"https://www.wikidata.org/wiki/Q25346349","display_name":"Execution model","level":2,"score":0.4212344288825989},{"id":"https://openalex.org/C2778119891","wikidata":"https://www.wikidata.org/wiki/Q477690","display_name":"CUDA","level":2,"score":0.4197118282318115},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.323000431060791},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.3218782842159271}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1145/2830772.2830796","is_oa":true,"landing_page_url":"https://doi.org/10.1145/2830772.2830796","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/2830772.2830796","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 48th International Symposium on Microarchitecture","raw_type":"proceedings-article"},{"id":"pmh:oai:CiteSeerX.psu:10.1.1.709.9926","is_oa":false,"landing_page_url":"http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.709.9926","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"http://www.cs.ucr.edu/%7Egupta/research/Publications/Comp/micro-48.pdf","raw_type":"text"}],"best_oa_location":{"id":"doi:10.1145/2830772.2830796","is_oa":true,"landing_page_url":"https://doi.org/10.1145/2830772.2830796","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/2830772.2830796","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 48th International Symposium on Microarchitecture","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1561979438","display_name":"SHF: Small: Transformations for Synergistic Analysis of Large Evolving Graphs","funder_award_id":"1524852","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G18250200","display_name":"EAGER: Developing a Programming Environment for Heterogenous Multiprocessors","funder_award_id":"1157377","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G2646760259","display_name":null,"funder_award_id":"CCF-1318103, CCF-1524852","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G2852361309","display_name":"SHF: Small: Efficient CPU-GPU Communication for Heterogeneous Architectures","funder_award_id":"1423108","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G2940312049","display_name":null,"funder_award_id":"CCF-1318103","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G31502264","display_name":"SHF: Small: Memory Consistency -- Hardware, Compiler, and Programming Support","funder_award_id":"1318103","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G4684923161","display_name":"SHF: Medium: Hardware/Software Partitioning for Hybrid Shared Memory Multiprocessors","funder_award_id":"0905509","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"},{"id":"https://openalex.org/G8002895364","display_name":null,"funder_award_id":"CCF-0905509,CNS-1157377,CCF-1318103,CCF-1524852,CCF-1423108","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"},{"id":"https://openalex.org/F4320332550","display_name":"University of California, Riverside","ror":"https://ror.org/03nawhv43"}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W2236252626.pdf","grobid_xml":"https://content.openalex.org/works/W2236252626.grobid-xml"},"referenced_works_count":51,"referenced_works":["https://openalex.org/W27773700","https://openalex.org/W142653777","https://openalex.org/W164384110","https://openalex.org/W579519726","https://openalex.org/W981516807","https://openalex.org/W1504291959","https://openalex.org/W1919570435","https://openalex.org/W1965061255","https://openalex.org/W1970815868","https://openalex.org/W1971997351","https://openalex.org/W1973538724","https://openalex.org/W1978155891","https://openalex.org/W1985291160","https://openalex.org/W1994316441","https://openalex.org/W1994688997","https://openalex.org/W2012630996","https://openalex.org/W2013247896","https://openalex.org/W2016706026","https://openalex.org/W2043420024","https://openalex.org/W2061313045","https://openalex.org/W2067313328","https://openalex.org/W2075739954","https://openalex.org/W2090495704","https://openalex.org/W2090584832","https://openalex.org/W2094945791","https://openalex.org/W2098505406","https://openalex.org/W2123440268","https://openalex.org/W2125979435","https://openalex.org/W2132598270","https://openalex.org/W2135947393","https://openalex.org/W2144061463","https://openalex.org/W2146591355","https://openalex.org/W2148443481","https://openalex.org/W2151686327","https://openalex.org/W2155568054","https://openalex.org/W2156180003","https://openalex.org/W2156831150","https://openalex.org/W2167675119","https://openalex.org/W2168921806","https://openalex.org/W2169880332","https://openalex.org/W2171399035","https://openalex.org/W2295329047","https://openalex.org/W2432978112","https://openalex.org/W2492496105","https://openalex.org/W2748306984","https://openalex.org/W3006582303","https://openalex.org/W3013490664","https://openalex.org/W3146509083","https://openalex.org/W4236883517","https://openalex.org/W4300125772","https://openalex.org/W6640247899"],"related_works":["https://openalex.org/W1995705225","https://openalex.org/W4248655967","https://openalex.org/W2138520521","https://openalex.org/W2184902834","https://openalex.org/W2110105483","https://openalex.org/W2107831078","https://openalex.org/W4248145683","https://openalex.org/W1672168401","https://openalex.org/W2100579514","https://openalex.org/W2156983793"],"abstract_inverted_index":{"GPU's":[0],"SIMD":[1],"architecture":[2],"is":[3],"a":[4,20,56,97,158],"double-edged":[5],"sword":[6],"confronting":[7],"parallel":[8],"tasks":[9],"with":[10,73,133],"control":[11],"flow":[12],"divergence.":[13,135,154],"On":[14],"the":[15,34,42,67,90,102,111,142,185],"one":[16],"hand,":[17,36],"it":[18],"provides":[19],"high":[21],"performance":[22],"yet":[23],"power-efficient":[24],"platform":[25],"to":[26,41,123,128,140,147,165,194],"accelerate":[27],"applications":[28,176],"via":[29],"massive":[30],"parallelism;":[31],"however,":[32],"on":[33,174],"other":[35],"irregularities":[37],"induce":[38],"inefficiencies":[39],"due":[40],"warp's":[43],"lockstep":[44],"traversal":[45],"of":[46,93,114,126,130,144,163,189,201],"all":[47],"diverging":[48],"execution":[49,69,187],"paths.":[50],"In":[51],"this":[52],"work,":[53],"we":[54],"present":[55],"software":[57],"(compiler)":[58],"technique":[59],"named":[60],"Collaborative":[61],"Context":[62],"Collection":[63],"(CCC)":[64],"that":[65,160],"increases":[66],"warp":[68,115,186],"efficiency":[70,188],"when":[71,110],"faced":[72],"thread":[74,134],"divergence":[75],"incurred":[76],"either":[77],"by":[78,84,192],"different":[79],"intra-warp":[80,85],"task":[81],"assignment":[82],"or":[83,152],"load":[86],"imbalance.":[87],"CCC":[88,127,145,164,173,183],"collects":[89],"relevant":[91],"registers":[92],"divergent":[94],"threads":[95],"in":[96,101],"warp-specific":[98],"stack":[99],"allocated":[100],"fast":[103],"shared":[104],"memory,":[105],"and":[106,146,177,196],"restores":[107],"them":[108],"only":[109],"perfect":[112],"utilization":[113],"lanes":[116],"becomes":[117],"feasible.":[118],"We":[119,136,155,171],"propose":[120],"code":[121],"transformations":[122],"enable":[124],"applicability":[125],"variety":[129],"program":[131],"segments":[132],"also":[137],"introduce":[138],"optimizations":[139],"reduce":[141],"cost":[143],"avoid":[148],"device":[149],"occupancy":[150],"limitation":[151],"memory":[153],"have":[156],"developed":[157],"framework":[159],"automates":[161],"application":[162],"CUDA":[166],"generated":[167],"intermediate":[168],"PTX":[169],"code.":[170],"evaluated":[172],"real-world":[175,190],"multiple":[178],"scenarios":[179],"using":[180],"synthetic":[181],"programs.":[182],"improves":[184],"benchmarks":[191],"up":[193],"56%":[195],"achieves":[197],"an":[198],"average":[199],"speedup":[200],"1.69x":[202],"(maximum":[203],"3.08x).":[204]},"counts_by_year":[{"year":2024,"cited_by_count":2},{"year":2022,"cited_by_count":3},{"year":2021,"cited_by_count":2},{"year":2020,"cited_by_count":3},{"year":2019,"cited_by_count":4},{"year":2018,"cited_by_count":8},{"year":2017,"cited_by_count":9},{"year":2016,"cited_by_count":3}],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2025-10-10T00:00:00"}
