{"id":"https://openalex.org/W7116436991","doi":"https://doi.org/10.1145/3754598.3754638","title":"Accelerating Multi-Output GBDTs with GPUs","display_name":"Accelerating Multi-Output GBDTs with GPUs","publication_year":2025,"publication_date":"2025-09-08","ids":{"openalex":"https://openalex.org/W7116436991","doi":"https://doi.org/10.1145/3754598.3754638"},"language":"en","primary_location":{"id":"doi:10.1145/3754598.3754638","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3754598.3754638","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 54th International Conference on Parallel Processing","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3754598.3754638","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5120897781","display_name":"Hanfeng Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Hanfeng Liu","raw_affiliation_strings":["The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China"],"raw_orcid":"https://orcid.org/0000-0003-4238-6629","affiliations":[{"raw_affiliation_string":"The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5004379782","display_name":"Xuemei Peng","orcid":"https://orcid.org/0000-0002-2705-2877"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xuemei Peng","raw_affiliation_strings":["The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China"],"raw_orcid":"https://orcid.org/0000-0002-2705-2877","affiliations":[{"raw_affiliation_string":"The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5112606091","display_name":"Zeyi Wen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zeyi Wen","raw_affiliation_strings":["The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China"],"raw_orcid":"https://orcid.org/0000-0003-3370-6053","affiliations":[{"raw_affiliation_string":"The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5120897781"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.79494027,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"720","last_page":"729"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.6543999910354614,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.6543999910354614,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12535","display_name":"Machine Learning and Data Classification","score":0.06849999725818634,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.04039999842643738,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.875},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.7835999727249146},{"id":"https://openalex.org/keywords/ranging","display_name":"Ranging","score":0.5738000273704529},{"id":"https://openalex.org/keywords/construct","display_name":"Construct (python library)","score":0.5598000288009644},{"id":"https://openalex.org/keywords/synchronization","display_name":"Synchronization (alternating current)","score":0.4724999964237213},{"id":"https://openalex.org/keywords/histogram","display_name":"Histogram","score":0.4171999990940094},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.35920000076293945},{"id":"https://openalex.org/keywords/performance-improvement","display_name":"Performance improvement","score":0.3555999994277954},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.33809998631477356}],"concepts":[{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.875},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8690999746322632},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.7835999727249146},{"id":"https://openalex.org/C115051666","wikidata":"https://www.wikidata.org/wiki/Q6522493","display_name":"Ranging","level":2,"score":0.5738000273704529},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.5598000288009644},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.5249000191688538},{"id":"https://openalex.org/C2778562939","wikidata":"https://www.wikidata.org/wiki/Q1298791","display_name":"Synchronization (alternating current)","level":3,"score":0.4724999964237213},{"id":"https://openalex.org/C53533937","wikidata":"https://www.wikidata.org/wiki/Q185020","display_name":"Histogram","level":3,"score":0.4171999990940094},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.362199991941452},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.35920000076293945},{"id":"https://openalex.org/C2778915421","wikidata":"https://www.wikidata.org/wiki/Q3643177","display_name":"Performance improvement","level":2,"score":0.3555999994277954},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.33809998631477356},{"id":"https://openalex.org/C84525736","wikidata":"https://www.wikidata.org/wiki/Q831366","display_name":"Decision tree","level":2,"score":0.32269999384880066},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.30000001192092896},{"id":"https://openalex.org/C83283714","wikidata":"https://www.wikidata.org/wiki/Q121117","display_name":"Supercomputer","level":2,"score":0.2978000044822693},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.2912999987602234},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.28790000081062317},{"id":"https://openalex.org/C162319229","wikidata":"https://www.wikidata.org/wiki/Q175263","display_name":"Data structure","level":2,"score":0.2863999903202057},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.27489998936653137},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.2711000144481659},{"id":"https://openalex.org/C2781039887","wikidata":"https://www.wikidata.org/wiki/Q1391724","display_name":"Factor (programming language)","level":2,"score":0.26969999074935913},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.2653999924659729},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.2651999890804291},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.2644999921321869},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.26030001044273376},{"id":"https://openalex.org/C2777115002","wikidata":"https://www.wikidata.org/wiki/Q7168246","display_name":"Performance prediction","level":2,"score":0.26030001044273376},{"id":"https://openalex.org/C19012869","wikidata":"https://www.wikidata.org/wiki/Q578372","display_name":"Response time","level":2,"score":0.2565000057220459},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.25529998540878296},{"id":"https://openalex.org/C2777138346","wikidata":"https://www.wikidata.org/wiki/Q1714153","display_name":"Performance tuning","level":2,"score":0.2522999942302704}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1145/3754598.3754638","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3754598.3754638","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 54th International Conference on Parallel Processing","raw_type":"proceedings-article"},{"id":"pmh:oai:repository.hkust.edu.hk:1783.1-169092","is_oa":false,"landing_page_url":"http://repository.hkust.edu.hk/ir/Record/1783.1-169092","pdf_url":null,"source":{"id":"https://openalex.org/S4306401796","display_name":"Rare & Special e-Zone (The Hong Kong University of Science and Technology)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I200769079","host_organization_name":"Hong Kong University of Science and Technology","host_organization_lineage":["https://openalex.org/I200769079"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Conference paper"}],"best_oa_location":{"id":"doi:10.1145/3754598.3754638","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3754598.3754638","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 54th International Conference on Parallel Processing","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":22,"referenced_works":["https://openalex.org/W2295598076","https://openalex.org/W2799138232","https://openalex.org/W2886958107","https://openalex.org/W2920868908","https://openalex.org/W3000038587","https://openalex.org/W3047647423","https://openalex.org/W4290944299","https://openalex.org/W4327951982","https://openalex.org/W4360832001","https://openalex.org/W4372266779","https://openalex.org/W4385800960","https://openalex.org/W4388816926","https://openalex.org/W4391967369","https://openalex.org/W4393153051","https://openalex.org/W4399585293","https://openalex.org/W4401856676","https://openalex.org/W4401863505","https://openalex.org/W4408124761","https://openalex.org/W4408328303","https://openalex.org/W4409051644","https://openalex.org/W4409367393","https://openalex.org/W4415797747"],"related_works":[],"abstract_inverted_index":{"Gradient":[0],"Boosted":[1],"Decision":[2],"Trees":[3],"(GBDTs)":[4],"have":[5],"demonstrated":[6],"good":[7],"performance":[8,147,194],"in":[9],"many":[10],"data":[11],"science":[12],"competitions.":[13],"This":[14],"paper":[15],"studies":[16],"multidimensional":[17,31],"output":[18,37,55],"GBDT":[19,56],"(GBDT-MO)":[20],"training":[21,40,60,76,82,123],"which":[22,58],"can":[23],"handle":[24],"complex":[25],"dependencies":[26],"between":[27],"input":[28],"features":[29,130],"and":[30,42,114,133],"outputs.":[32],"Due":[33],"to":[34,78,93,169,187],"the":[35,39,53,81,91,110,146,176,197],"multiple":[36],"dimensionality,":[38],"time":[41],"memory":[43],"consumption":[44],"of":[45,52,109,148],"GBDT-MO":[46,61,75,116,151],"are":[47],"significantly":[48],"higher":[49],"than":[50],"those":[51],"single-dimensional":[54],"training,":[57,111],"makes":[59],"challenging,":[62],"especially":[63,98],"for":[64,141],"large-scale":[65],"datasets.":[66,143,155],"In":[67],"this":[68],"paper,":[69],"we":[70],"propose":[71],"a":[72,125,163,181],"novel":[73],"GPU-accelerated":[74],"system":[77,89,120,152,161,174],"speed":[79],"up":[80],"while":[83,190],"maintaining":[84,191],"competitive":[85],"model":[86],"quality.":[87],"Our":[88],"leverages":[90],"GPU":[92],"efficiently":[94],"construct":[95],"decision":[96],"trees,":[97],"by":[99,128,180],"dynamically":[100],"choosing":[101],"efficient":[102,115],"histogram":[103],"building":[104],"methods":[105],"at":[106],"different":[107],"stages":[108],"enabling":[112],"scalable":[113],"training.":[117],"Furthermore,":[118],"our":[119,149,160,173],"supports":[121],"multi-GPU":[122],"on":[124,153],"single":[126],"machine":[127],"partitioning":[129],"across":[131],"GPUs":[132],"using":[134],"communication-efficient":[135],"synchronization":[136],"strategies,":[137],"allowing":[138],"further":[139],"scalability":[140],"high-dimensional":[142],"We":[144],"evaluate":[145],"proposed":[150],"several":[154],"Compared":[156],"with":[157,196],"CPU-based":[158],"implementations,":[159],"achieves":[162],"speedup":[164,182],"ranging":[165,183],"from":[166,184],"30":[167],"\u00d7":[168,186],"190":[170],"\u00d7.":[171],"Moreover,":[172],"outperforms":[175],"state-of-the-art":[177,198],"GPU-based":[178],"baselines":[179],"1.7":[185],"170":[188],"\u00d7,":[189],"strong":[192],"predictive":[193],"compared":[195],"methods.":[199]},"counts_by_year":[],"updated_date":"2025-12-21T02:06:08.432651","created_date":"2025-12-21T00:00:00"}
