{"id":"https://openalex.org/W7101450441","doi":"https://doi.org/10.1145/3773039","title":"PPD: A Portable and Highly Parallel Dispatching System for Deep Learning","display_name":"PPD: A Portable and Highly Parallel Dispatching System for Deep Learning","publication_year":2025,"publication_date":"2025-10-27","ids":{"openalex":"https://openalex.org/W7101450441","doi":"https://doi.org/10.1145/3773039"},"language":"en","primary_location":{"id":"doi:10.1145/3773039","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3773039","pdf_url":null,"source":{"id":"https://openalex.org/S105046310","display_name":"ACM Transactions on Design Automation of Electronic Systems","issn_l":"1084-4309","issn":["1084-4309","1557-7309"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Design Automation of Electronic Systems","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Wendong Xu","orcid":"https://orcid.org/0000-0002-5857-7393"},"institutions":[{"id":"https://openalex.org/I889458895","display_name":"University of Hong Kong","ror":"https://ror.org/02zhqgq86","country_code":"HK","type":"education","lineage":["https://openalex.org/I889458895"]}],"countries":["HK"],"is_corresponding":true,"raw_author_name":"Wendong Xu","raw_affiliation_strings":["Electrical and Electronic Engineering, The University of Hong Kong"],"affiliations":[{"raw_affiliation_string":"Electrical and Electronic Engineering, The University of Hong Kong","institution_ids":["https://openalex.org/I889458895"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Yuhao Ji","orcid":"https://orcid.org/0009-0008-9376-753X"},"institutions":[{"id":"https://openalex.org/I177725633","display_name":"Chinese University of Hong Kong","ror":"https://ror.org/00t33hh48","country_code":"HK","type":"education","lineage":["https://openalex.org/I177725633"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Yuhao Ji","raw_affiliation_strings":["Department of Computer Science and Engineering, The Chinese University of Hong Kong"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Engineering, The Chinese University of Hong Kong","institution_ids":["https://openalex.org/I177725633"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Yang Bai","orcid":"https://orcid.org/0000-0002-5337-1783"},"institutions":[{"id":"https://openalex.org/I177725633","display_name":"Chinese University of Hong Kong","ror":"https://ror.org/00t33hh48","country_code":"HK","type":"education","lineage":["https://openalex.org/I177725633"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Yang Bai","raw_affiliation_strings":["Department of Computer Science and Engineering, The Chinese University of Hong Kong"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Engineering, The Chinese University of Hong Kong","institution_ids":["https://openalex.org/I177725633"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Yueting Li","orcid":"https://orcid.org/0000-0001-8874-6269"},"institutions":[{"id":"https://openalex.org/I82880672","display_name":"Beihang University","ror":"https://ror.org/00wk2mp56","country_code":"CN","type":"education","lineage":["https://openalex.org/I82880672"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yueting Li","raw_affiliation_strings":["Beihang University"],"affiliations":[{"raw_affiliation_string":"Beihang University","institution_ids":["https://openalex.org/I82880672"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Yuxuan Zhao","orcid":"https://orcid.org/0000-0001-5995-4763"},"institutions":[{"id":"https://openalex.org/I177725633","display_name":"Chinese University of Hong Kong","ror":"https://ror.org/00t33hh48","country_code":"HK","type":"education","lineage":["https://openalex.org/I177725633"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Yuxuan Zhao","raw_affiliation_strings":["Department of Computer Science and Engineering, The Chinese University of Hong Kong"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Engineering, The Chinese University of Hong Kong","institution_ids":["https://openalex.org/I177725633"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Zhengwu Liu","orcid":"https://orcid.org/0000-0001-7968-9469"},"institutions":[{"id":"https://openalex.org/I889458895","display_name":"University of Hong Kong","ror":"https://ror.org/02zhqgq86","country_code":"HK","type":"education","lineage":["https://openalex.org/I889458895"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Zhengwu Liu","raw_affiliation_strings":["Electrical and Electronic Engineering, The University of Hong Kong"],"affiliations":[{"raw_affiliation_string":"Electrical and Electronic Engineering, The University of Hong Kong","institution_ids":["https://openalex.org/I889458895"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Bei Yu","orcid":"https://orcid.org/0000-0001-6406-4810"},"institutions":[{"id":"https://openalex.org/I177725633","display_name":"Chinese University of Hong Kong","ror":"https://ror.org/00t33hh48","country_code":"HK","type":"education","lineage":["https://openalex.org/I177725633"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Bei Yu","raw_affiliation_strings":["Department of Computer Science and Engineering, The Chinese University of Hong Kong"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science and Engineering, The Chinese University of Hong Kong","institution_ids":["https://openalex.org/I177725633"]}]},{"author_position":"last","author":{"id":null,"display_name":"Ngai Wong","orcid":"https://orcid.org/0000-0002-3026-0108"},"institutions":[{"id":"https://openalex.org/I889458895","display_name":"University of Hong Kong","ror":"https://ror.org/02zhqgq86","country_code":"HK","type":"education","lineage":["https://openalex.org/I889458895"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Ngai Wong","raw_affiliation_strings":["Electrical and Electronic Engineering, The University of Hong Kong"],"affiliations":[{"raw_affiliation_string":"Electrical and Electronic Engineering, The University of Hong Kong","institution_ids":["https://openalex.org/I889458895"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":8,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I889458895"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.5697112,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"31","issue":"2","first_page":"1","last_page":"24"},"is_retracted":false,"is_paratext":false,"is_xpac":true,"primary_topic":{"id":"https://openalex.org/T12292","display_name":"Graph Theory and Algorithms","score":0.28540000319480896,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12292","display_name":"Graph Theory and Algorithms","score":0.28540000319480896,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.23999999463558197,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11273","display_name":"Advanced Graph Neural Networks","score":0.09440000355243683,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/cuda","display_name":"CUDA","score":0.8371000289916992},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.7272999882698059},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.6873999834060669},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.524399995803833},{"id":"https://openalex.org/keywords/compiler","display_name":"Compiler","score":0.501800000667572},{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.49939998984336853},{"id":"https://openalex.org/keywords/data-parallelism","display_name":"Data parallelism","score":0.4666999876499176},{"id":"https://openalex.org/keywords/graphics","display_name":"Graphics","score":0.45190000534057617},{"id":"https://openalex.org/keywords/general-purpose-computing-on-graphics-processing-units","display_name":"General-purpose computing on graphics processing units","score":0.4207000136375427}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.9143000245094299},{"id":"https://openalex.org/C2778119891","wikidata":"https://www.wikidata.org/wiki/Q477690","display_name":"CUDA","level":2,"score":0.8371000289916992},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.7272999882698059},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.6873999834060669},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.524399995803833},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.5133000016212463},{"id":"https://openalex.org/C169590947","wikidata":"https://www.wikidata.org/wiki/Q47506","display_name":"Compiler","level":2,"score":0.501800000667572},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.49939998984336853},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.47029998898506165},{"id":"https://openalex.org/C61483411","wikidata":"https://www.wikidata.org/wiki/Q3124522","display_name":"Data parallelism","level":3,"score":0.4666999876499176},{"id":"https://openalex.org/C21442007","wikidata":"https://www.wikidata.org/wiki/Q1027879","display_name":"Graphics","level":2,"score":0.45190000534057617},{"id":"https://openalex.org/C50630238","wikidata":"https://www.wikidata.org/wiki/Q971505","display_name":"General-purpose computing on graphics processing units","level":3,"score":0.4207000136375427},{"id":"https://openalex.org/C132525143","wikidata":"https://www.wikidata.org/wiki/Q141488","display_name":"Graph","level":2,"score":0.4124000072479248},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.3955000042915344},{"id":"https://openalex.org/C68387754","wikidata":"https://www.wikidata.org/wiki/Q7271585","display_name":"Schedule","level":2,"score":0.3831999897956848},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.3544999957084656},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.3443000018596649},{"id":"https://openalex.org/C190475519","wikidata":"https://www.wikidata.org/wiki/Q544384","display_name":"Massively parallel","level":2,"score":0.34369999170303345},{"id":"https://openalex.org/C2779851693","wikidata":"https://www.wikidata.org/wiki/Q183484","display_name":"Graphics processing unit","level":2,"score":0.33480000495910645},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3151000142097473},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.30250000953674316},{"id":"https://openalex.org/C2781172179","wikidata":"https://www.wikidata.org/wiki/Q853109","display_name":"Parallelism (grammar)","level":2,"score":0.2897000014781952},{"id":"https://openalex.org/C1306188","wikidata":"https://www.wikidata.org/wiki/Q4060687","display_name":"Nested loop join","level":2,"score":0.2883000075817108},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.288100004196167},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.28679999709129333},{"id":"https://openalex.org/C2984842247","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep neural networks","level":3,"score":0.2759999930858612},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.27390000224113464},{"id":"https://openalex.org/C79974875","wikidata":"https://www.wikidata.org/wiki/Q483639","display_name":"Cloud computing","level":2,"score":0.2669000029563904}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3773039","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3773039","pdf_url":null,"source":{"id":"https://openalex.org/S105046310","display_name":"ACM Transactions on Design Automation of Electronic Systems","issn_l":"1084-4309","issn":["1084-4309","1557-7309"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Design Automation of Electronic Systems","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":13,"referenced_works":["https://openalex.org/W1922926746","https://openalex.org/W2194775991","https://openalex.org/W2895663752","https://openalex.org/W3014641072","https://openalex.org/W3040646053","https://openalex.org/W3127384284","https://openalex.org/W3186745756","https://openalex.org/W3188917597","https://openalex.org/W3214035171","https://openalex.org/W4293214219","https://openalex.org/W4312933868","https://openalex.org/W4367043644","https://openalex.org/W4386607611"],"related_works":[],"abstract_inverted_index":{"The":[0],"acceleration":[1],"of":[2,16,88,138,165,186],"inference":[3,42,107,150,193],"process":[4],"for":[5,154],"deep":[6,27],"learning":[7,28],"models":[8,182],"is":[9],"closely":[10],"tied":[11],"with":[12,128,183,191],"the":[13,21,85,106,111,134,163],"parallelization":[14],"capability":[15],"computational":[17,112],"graph":[18,113],"operators":[19],"and":[20,47,56,63,69,136,148,169,188],"parallel":[22,101],"scheduling":[23],"strategies.":[24],"Most":[25],"existing":[26],"compilers":[29],"focus":[30],"on":[31,124,167,177],"optimizing":[32],"intra-operator":[33],"parallelism,":[34,187],"while":[35],"neglecting":[36],"inter-operator":[37],"parallelism.":[38],"Furthermore,":[39],"most":[40],"industrial":[41,192],"engines,":[43],"such":[44],"as":[45],"PyTorch":[46],"TensorFlow,":[48],"utilize":[49],"a":[50,66,75,98,121,125,144],"dataflow-based":[51],"model":[52,139,146],"to":[53,72,82,132,204],"describe":[54],"tasks":[55],"schedule":[57],"operators.":[58],"They":[59],"are":[60,70],"computationally":[61],"expensive":[62],"operate":[64],"in":[65],"topological":[67],"order":[68],"parallelized":[71],"run":[73],"within":[74],"single":[76,126],"CUDA":[77,90,130],"stream.":[78],"However,":[79],"they":[80],"fail":[81],"fully":[83],"exploit":[84],"parallelism":[86,135],"capabilities":[87],"multiple":[89,115,129],"streams.":[91],"In":[92],"this":[93],"article,":[94],"we":[95],"propose":[96],"PPD,":[97],"portable,":[99],"highly":[100],"dispatching":[102,122],"system.":[103],"It":[104],"boosts":[105],"performance":[108,137],"by":[109,202],"dividing":[110],"into":[114,157],"taskflow-based":[116],"subgraphs.":[117],"Additionally,":[118],"PPD":[119,141,166,198],"entails":[120],"algorithm":[123],"GPU":[127],"streams":[131],"enhance":[133],"inference.":[140],"offers":[142],"users":[143],"lightweight":[145],"definition":[147],"an":[149],"C++":[151],"interface,":[152],"allowing":[153],"seamless":[155],"integration":[156],"any":[158],"context.":[159],"We":[160,173],"also":[161],"verify":[162],"feasibility":[164],"AMD":[168],"other":[170],"graphics":[171],"cards.":[172],"validate":[174],"our":[175],"approach":[176],"widely":[178],"adopted":[179],"neural":[180],"network":[181],"varying":[184],"degrees":[185],"compare":[189],"it":[190],"engines.":[194],"Experiments":[195],"demonstrate":[196],"that":[197],"outperforms":[199],"SOTA":[200],"methods":[201],"up":[203],"2.28\u00d7":[205],".":[206]},"counts_by_year":[],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-28T00:00:00"}
