{"id":"https://openalex.org/W2049875313","doi":"https://doi.org/10.1145/2063384.2063400","title":"CudaDMA","display_name":"CudaDMA","publication_year":2011,"publication_date":"2011-11-08","ids":{"openalex":"https://openalex.org/W2049875313","doi":"https://doi.org/10.1145/2063384.2063400","mag":"2049875313"},"language":"en","primary_location":{"id":"doi:10.1145/2063384.2063400","is_oa":false,"landing_page_url":"https://doi.org/10.1145/2063384.2063400","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of 2011 International Conference for High Performance Computing, Networking, Storage and Analysis","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101680564","display_name":"Michael Bauer","orcid":"https://orcid.org/0009-0000-5475-7293"},"institutions":[{"id":"https://openalex.org/I97018004","display_name":"Stanford University","ror":"https://ror.org/00f54p054","country_code":"US","type":"education","lineage":["https://openalex.org/I97018004"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Michael Bauer","raw_affiliation_strings":["Stanford University"],"affiliations":[{"raw_affiliation_string":"Stanford University","institution_ids":["https://openalex.org/I97018004"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5048743616","display_name":"Henry Cook","orcid":null},"institutions":[{"id":"https://openalex.org/I95457486","display_name":"University of California, Berkeley","ror":"https://ror.org/01an7q238","country_code":"US","type":"education","lineage":["https://openalex.org/I95457486"]},{"id":"https://openalex.org/I134446601","display_name":"Berkeley College","ror":"https://ror.org/02xewxa75","country_code":"US","type":"education","lineage":["https://openalex.org/I134446601"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Henry Cook","raw_affiliation_strings":["UC Berkeley"],"affiliations":[{"raw_affiliation_string":"UC Berkeley","institution_ids":["https://openalex.org/I134446601","https://openalex.org/I95457486"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5010156116","display_name":"Brucek Khailany","orcid":"https://orcid.org/0000-0002-7584-3489"},"institutions":[{"id":"https://openalex.org/I1304085615","display_name":"Nvidia (United Kingdom)","ror":"https://ror.org/02kr42612","country_code":"GB","type":"company","lineage":["https://openalex.org/I1304085615","https://openalex.org/I4210127875"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Brucek Khailany","raw_affiliation_strings":["NVIDIA Research"],"affiliations":[{"raw_affiliation_string":"NVIDIA Research","institution_ids":["https://openalex.org/I1304085615"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5101680564"],"corresponding_institution_ids":["https://openalex.org/I97018004"],"apc_list":null,"apc_paid":null,"fwci":13.7176,"has_fulltext":false,"cited_by_count":153,"citation_normalized_percentile":{"value":0.99061861,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":96,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"11"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.9991999864578247,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8778007626533508},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.7171877026557922},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.7020808458328247},{"id":"https://openalex.org/keywords/memory-bandwidth","display_name":"Memory bandwidth","score":0.502328634262085},{"id":"https://openalex.org/keywords/cuda","display_name":"CUDA","score":0.5016036033630371},{"id":"https://openalex.org/keywords/synchronization","display_name":"Synchronization (alternating current)","score":0.4974196255207062},{"id":"https://openalex.org/keywords/yarn","display_name":"Yarn","score":0.42949575185775757},{"id":"https://openalex.org/keywords/computer-architecture","display_name":"Computer architecture","score":0.3811039626598358},{"id":"https://openalex.org/keywords/channel","display_name":"Channel (broadcasting)","score":0.15382346510887146}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8778007626533508},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.7171877026557922},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.7020808458328247},{"id":"https://openalex.org/C188045654","wikidata":"https://www.wikidata.org/wiki/Q17148339","display_name":"Memory bandwidth","level":2,"score":0.502328634262085},{"id":"https://openalex.org/C2778119891","wikidata":"https://www.wikidata.org/wiki/Q477690","display_name":"CUDA","level":2,"score":0.5016036033630371},{"id":"https://openalex.org/C2778562939","wikidata":"https://www.wikidata.org/wiki/Q1298791","display_name":"Synchronization (alternating current)","level":3,"score":0.4974196255207062},{"id":"https://openalex.org/C2778787235","wikidata":"https://www.wikidata.org/wiki/Q49007","display_name":"Yarn","level":2,"score":0.42949575185775757},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.3811039626598358},{"id":"https://openalex.org/C127162648","wikidata":"https://www.wikidata.org/wiki/Q16858953","display_name":"Channel (broadcasting)","level":2,"score":0.15382346510887146},{"id":"https://openalex.org/C159985019","wikidata":"https://www.wikidata.org/wiki/Q181790","display_name":"Composite material","level":1,"score":0.0},{"id":"https://openalex.org/C192562407","wikidata":"https://www.wikidata.org/wiki/Q228736","display_name":"Materials science","level":0,"score":0.0},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/2063384.2063400","is_oa":false,"landing_page_url":"https://doi.org/10.1145/2063384.2063400","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of 2011 International Conference for High Performance Computing, Networking, Storage and Analysis","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Decent work and economic growth","score":0.4399999976158142,"id":"https://metadata.un.org/sdg/8"}],"awards":[{"id":"https://openalex.org/G3686759057","display_name":null,"funder_award_id":"24263","funder_id":"https://openalex.org/F4320307764","funder_display_name":"Microsoft"},{"id":"https://openalex.org/G4384726126","display_name":null,"funder_award_id":"24894","funder_id":"https://openalex.org/F4320307102","funder_display_name":"Intel Corporation"},{"id":"https://openalex.org/G6835067985","display_name":null,"funder_award_id":"DIG07-10227","funder_id":"https://openalex.org/F4320308590","funder_display_name":"University of California"},{"id":"https://openalex.org/G734899754","display_name":null,"funder_award_id":"HR0011-10-9-0008","funder_id":"https://openalex.org/F4320332180","funder_display_name":"Defense Advanced Research Projects Agency"}],"funders":[{"id":"https://openalex.org/F4320307102","display_name":"Intel Corporation","ror":"https://ror.org/01ek73717"},{"id":"https://openalex.org/F4320307764","display_name":"Microsoft","ror":"https://ror.org/00d0nc645"},{"id":"https://openalex.org/F4320308590","display_name":"University of California","ror":"https://ror.org/00pjdza24"},{"id":"https://openalex.org/F4320309480","display_name":"Nvidia","ror":"https://ror.org/03jdj4y14"},{"id":"https://openalex.org/F4320332180","display_name":"Defense Advanced Research Projects Agency","ror":"https://ror.org/02caytj08"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":15,"referenced_works":["https://openalex.org/W1568192366","https://openalex.org/W2002555321","https://openalex.org/W2017086619","https://openalex.org/W2088943630","https://openalex.org/W2109473404","https://openalex.org/W2109882966","https://openalex.org/W2113190809","https://openalex.org/W2118324422","https://openalex.org/W2138163628","https://openalex.org/W2146959317","https://openalex.org/W2160454978","https://openalex.org/W2164106630","https://openalex.org/W2170879098","https://openalex.org/W2561675875","https://openalex.org/W6633991270"],"related_works":["https://openalex.org/W2058965144","https://openalex.org/W2164382479","https://openalex.org/W98480971","https://openalex.org/W2150291671","https://openalex.org/W2027972911","https://openalex.org/W2146343568","https://openalex.org/W2013643406","https://openalex.org/W2983282793","https://openalex.org/W4376647684","https://openalex.org/W4221139464"],"abstract_inverted_index":{"As":[0],"the":[1,73,85],"computational":[2],"power":[3],"of":[4,15,87,112],"GPUs":[5,29],"continues":[6],"to":[7,79,114],"scale":[8],"with":[9,30],"Moore's":[10],"Law,":[11],"an":[12,25,92],"increasing":[13],"number":[14],"applications":[16,127],"are":[17],"becoming":[18],"limited":[19],"by":[20,51,58,71],"memory":[21,37,48],"bandwidth.":[22],"We":[23],"propose":[24],"approach":[26],"for":[27,35,75],"programming":[28],"tightly-coupled":[31],"specialized":[32],"DMA":[33,45,65],"warps":[34,46,66],"performing":[36],"transfers":[38],"between":[39],"on-chip":[40],"and":[41,57,99,102,120],"off-chip":[42],"memories.":[43],"Separate":[44],"improve":[47,68],"bandwidth":[49],"utilization":[50],"better":[52],"exploiting":[53],"available":[54],"memory-level":[55],"parallelism":[56],"leveraging":[59],"efficient":[60],"inter-warp":[61],"producer-consumer":[62],"synchronization":[63,98],"mechanisms.":[64],"also":[67],"programmer":[69],"productivity":[70],"decoupling":[72],"need":[74],"thread":[76],"array":[77],"shapes":[78],"match":[80],"data":[81,104],"layout.":[82],"To":[83],"illustrate":[84],"benefits":[86],"this":[88],"approach,":[89],"we":[90,109],"present":[91],"extensible":[93],"API,":[94],"CudaDMA,":[95,108],"that":[96],"encapsulates":[97],"common":[100],"sequential":[101],"strided":[103],"transfer":[105],"patterns.":[106],"Using":[107],"demonstrate":[110],"speedup":[111],"up":[113],"1.37x":[115],"on":[116,122,132],"representative":[117],"synthetic":[118],"microbenchmarks,":[119],"1.15x-3.2x":[121],"several":[123],"kernels":[124],"from":[125],"scientific":[126],"written":[128],"in":[129],"CUDA":[130],"running":[131],"NVIDIA":[133],"Fermi":[134],"GPUs.":[135]},"counts_by_year":[{"year":2026,"cited_by_count":5},{"year":2025,"cited_by_count":4},{"year":2024,"cited_by_count":4},{"year":2023,"cited_by_count":3},{"year":2022,"cited_by_count":6},{"year":2021,"cited_by_count":7},{"year":2020,"cited_by_count":8},{"year":2019,"cited_by_count":12},{"year":2018,"cited_by_count":10},{"year":2017,"cited_by_count":12},{"year":2016,"cited_by_count":12},{"year":2015,"cited_by_count":17},{"year":2014,"cited_by_count":20},{"year":2013,"cited_by_count":21},{"year":2012,"cited_by_count":12}],"updated_date":"2026-03-12T08:34:05.389933","created_date":"2016-06-24T00:00:00"}
