{"id":"https://openalex.org/W2766362889","doi":"https://doi.org/10.1145/3146347.3146356","title":"An In-depth Performance Characterization of CPU- and GPU-based DNN Training on Modern Architectures","display_name":"An In-depth Performance Characterization of CPU- and GPU-based DNN Training on Modern Architectures","publication_year":2017,"publication_date":"2017-10-31","ids":{"openalex":"https://openalex.org/W2766362889","doi":"https://doi.org/10.1145/3146347.3146356","mag":"2766362889"},"language":"en","primary_location":{"id":"doi:10.1145/3146347.3146356","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3146347.3146356","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Machine Learning on HPC Environments","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5004330728","display_name":"Ammar Ahmad Awan","orcid":"https://orcid.org/0000-0002-6272-3760"},"institutions":[{"id":"https://openalex.org/I52357470","display_name":"The Ohio State University","ror":"https://ror.org/00rs6vg23","country_code":"US","type":"education","lineage":["https://openalex.org/I52357470"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Ammar Ahmad Awan","raw_affiliation_strings":["Dept. of Computer Science and Engg., The Ohio State University"],"affiliations":[{"raw_affiliation_string":"Dept. of Computer Science and Engg., The Ohio State University","institution_ids":["https://openalex.org/I52357470"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5034293705","display_name":"Hari Subramoni","orcid":"https://orcid.org/0000-0002-1200-2754"},"institutions":[{"id":"https://openalex.org/I52357470","display_name":"The Ohio State University","ror":"https://ror.org/00rs6vg23","country_code":"US","type":"education","lineage":["https://openalex.org/I52357470"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Hari Subramoni","raw_affiliation_strings":["Dept. of Computer Science and Engg., The Ohio State University"],"affiliations":[{"raw_affiliation_string":"Dept. of Computer Science and Engg., The Ohio State University","institution_ids":["https://openalex.org/I52357470"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5024879682","display_name":"Dhabaleswar K. Panda","orcid":"https://orcid.org/0000-0002-0356-1781"},"institutions":[{"id":"https://openalex.org/I52357470","display_name":"The Ohio State University","ror":"https://ror.org/00rs6vg23","country_code":"US","type":"education","lineage":["https://openalex.org/I52357470"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Dhabaleswar K. Panda","raw_affiliation_strings":["Dept. of Computer Science and Engg., The Ohio State University"],"affiliations":[{"raw_affiliation_string":"Dept. of Computer Science and Engg., The Ohio State University","institution_ids":["https://openalex.org/I52357470"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5004330728"],"corresponding_institution_ids":["https://openalex.org/I52357470"],"apc_list":null,"apc_paid":null,"fwci":2.3666,"has_fulltext":false,"cited_by_count":75,"citation_normalized_percentile":{"value":0.93656753,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":97,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"8"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12808","display_name":"Ferroelectric and Negative Capacitance Devices","score":0.9977999925613403,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11612","display_name":"Stochastic Gradient Optimization Techniques","score":0.994700014591217,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.853983998298645},{"id":"https://openalex.org/keywords/xeon","display_name":"Xeon","score":0.6497142314910889},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.5816293954849243},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.5470472574234009},{"id":"https://openalex.org/keywords/central-processing-unit","display_name":"Central processing unit","score":0.5451804995536804},{"id":"https://openalex.org/keywords/software","display_name":"Software","score":0.5290464758872986},{"id":"https://openalex.org/keywords/cuda","display_name":"CUDA","score":0.4872986078262329},{"id":"https://openalex.org/keywords/xeon-phi","display_name":"Xeon Phi","score":0.4574843943119049},{"id":"https://openalex.org/keywords/pascal","display_name":"Pascal (unit)","score":0.45663005113601685},{"id":"https://openalex.org/keywords/thread","display_name":"Thread (computing)","score":0.446972519159317},{"id":"https://openalex.org/keywords/computer-architecture","display_name":"Computer architecture","score":0.4171142578125},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3966621160507202},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.22571349143981934}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.853983998298645},{"id":"https://openalex.org/C145108525","wikidata":"https://www.wikidata.org/wiki/Q656154","display_name":"Xeon","level":2,"score":0.6497142314910889},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.5816293954849243},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.5470472574234009},{"id":"https://openalex.org/C49154492","wikidata":"https://www.wikidata.org/wiki/Q5300","display_name":"Central processing unit","level":2,"score":0.5451804995536804},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.5290464758872986},{"id":"https://openalex.org/C2778119891","wikidata":"https://www.wikidata.org/wiki/Q477690","display_name":"CUDA","level":2,"score":0.4872986078262329},{"id":"https://openalex.org/C96972482","wikidata":"https://www.wikidata.org/wiki/Q1049168","display_name":"Xeon Phi","level":2,"score":0.4574843943119049},{"id":"https://openalex.org/C75608658","wikidata":"https://www.wikidata.org/wiki/Q44395","display_name":"Pascal (unit)","level":2,"score":0.45663005113601685},{"id":"https://openalex.org/C138101251","wikidata":"https://www.wikidata.org/wiki/Q213092","display_name":"Thread (computing)","level":2,"score":0.446972519159317},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.4171142578125},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3966621160507202},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.22571349143981934},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3146347.3146356","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3146347.3146356","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Machine Learning on HPC Environments","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"},{"id":"https://openalex.org/F4320309480","display_name":"Nvidia","ror":"https://ror.org/03jdj4y14"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":12,"referenced_works":["https://openalex.org/W1598866093","https://openalex.org/W2097117768","https://openalex.org/W2108598243","https://openalex.org/W2155893237","https://openalex.org/W2163605009","https://openalex.org/W2194775991","https://openalex.org/W2271840356","https://openalex.org/W2475636809","https://openalex.org/W2514858228","https://openalex.org/W2580688187","https://openalex.org/W3118608800","https://openalex.org/W6819060087"],"related_works":["https://openalex.org/W2475524688","https://openalex.org/W2739740241","https://openalex.org/W2085105049","https://openalex.org/W2592417500","https://openalex.org/W1974923383","https://openalex.org/W2526069705","https://openalex.org/W2024016913","https://openalex.org/W2019153376","https://openalex.org/W2796552083","https://openalex.org/W2145394070"],"abstract_inverted_index":{"Traditionally,":[0],"Deep":[1],"Learning":[2,121],"(DL)":[3],"frameworks":[4,35],"like":[5,36,225],"Caffe,":[6],"TensorFlow,":[7],"and":[8,38,46,71,83,90,102,116,125,141,218,220,227],"Cognitive":[9],"Toolkit":[10],"exploited":[11],"GPUs":[12],"to":[13,43,51,192,203,208,232,239],"accelerate":[14],"the":[15,49,54,94,144,151,158,187],"training":[16,80,111,138,162,201],"process.":[17],"This":[18],"has":[19,48],"been":[20],"primarily":[21],"achieved":[22],"by":[23],"aggressive":[24],"improvements":[25],"in":[26,163,196],"parallel":[27],"hardware":[28,45,217],"as":[29,31],"well":[30],"through":[32],"sophisticated":[33],"software":[34,47],"cuDNN":[37],"cuBLAS.":[39],"However,":[40],"recent":[41],"enhancements":[42],"CPU-based":[44,57,223],"potential":[50],"significantly":[52],"enhance":[53],"performance":[55,67,77,112,159,174,206],"of":[56,69,78,88,146,160,189,215],"DL":[58],"training.":[59,74],"In":[60,127],"this":[61,149],"paper,":[62],"we":[63,129],"provide":[64,130,179],"a":[65,86,131,164],"complete":[66],"landscape":[68],"CPU-":[70],"GPU-based":[72,200],"DNN":[73,79,110,161,197],"We":[75,106,178],"characterize":[76],"for":[81,85,114,136,175,186,242],"AlexNet":[82,115,243],"ResNet-50":[84,117],"wide-range":[87],"CPU":[89,132],"GPU":[91,134,216],"architectures":[92],"including":[93],"latest":[95],"Intel":[96,119],"Xeon":[97],"Phi":[98],"(Knights":[99],"Landing)":[100],"processors":[101],"NVIDIA":[103],"Pascal":[104],"GPUs.":[105],"also":[107],"present":[108],"multi-node":[109,137],"results":[113],"using":[118,139],"Machine":[120],"Scaling":[122],"(MLSL)":[123],"Library":[124],"Intel-Caffe.":[126,142],"addition,":[128],"vs.":[133],"comparison":[135],"OSU-Caffe":[140],"To":[143],"best":[145],"our":[147],"knowledge,":[148],"is":[150],"first":[152],"study":[153],"that":[154],"dives":[155],"deeper":[156],"into":[157],"holistic":[165],"manner":[166],"yet":[167],"provides":[168],"an":[169],"in-depth":[170],"look":[171],"at":[172],"layer-wise":[173],"different":[176],"DNNs.":[177],"multiple":[180],"key":[181],"insights:":[182],"1)":[183],"Convolutions":[184],"account":[185],"majority":[188],"time":[190],"(up":[191,207,238],"83%":[193],"time)":[194],"consumed":[195],"training,":[198],"2)":[199],"continues":[202],"deliver":[204],"excellent":[205,233],"18%":[209],"better":[210],"than":[211],"KNL)":[212],"across":[213],"generations":[214],"software,":[219],"3)":[221],"Recent":[222],"optimizations":[224],"MKL-DNN":[226],"OpenMP-based":[228],"thread":[229],"parallelism":[230],"leads":[231],"speed-ups":[234],"over":[235],"under-optimized":[236],"designs":[237],"3.2X":[240],"improvement":[241],"training).":[244]},"counts_by_year":[{"year":2025,"cited_by_count":10},{"year":2024,"cited_by_count":9},{"year":2023,"cited_by_count":5},{"year":2022,"cited_by_count":6},{"year":2021,"cited_by_count":19},{"year":2020,"cited_by_count":7},{"year":2019,"cited_by_count":15},{"year":2018,"cited_by_count":4}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
