{"id":"https://openalex.org/W4412875493","doi":"https://doi.org/10.1145/3711896.3736949","title":"Exploiting Student Parallelism for Low-latency GPU Inference of BERT-like Models in Online Services","display_name":"Exploiting Student Parallelism for Low-latency GPU Inference of BERT-like Models in Online Services","publication_year":2025,"publication_date":"2025-08-03","ids":{"openalex":"https://openalex.org/W4412875493","doi":"https://doi.org/10.1145/3711896.3736949"},"language":"en","primary_location":{"id":"doi:10.1145/3711896.3736949","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3711896.3736949","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining V.2","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5013332473","display_name":"Weiyan Wang","orcid":"https://orcid.org/0000-0002-4105-0691"},"institutions":[{"id":"https://openalex.org/I2250653659","display_name":"Tencent (China)","ror":"https://ror.org/00hhjss72","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250653659"]},{"id":"https://openalex.org/I889458895","display_name":"University of Hong Kong","ror":"https://ror.org/02zhqgq86","country_code":"HK","type":"education","lineage":["https://openalex.org/I889458895"]}],"countries":["CN","HK"],"is_corresponding":true,"raw_author_name":"Weiyan Wang","raw_affiliation_strings":["Tencent, Beijing, China and Hong Kong University of Science and Technology, Hong Kong SAR, China"],"affiliations":[{"raw_affiliation_string":"Tencent, Beijing, China and Hong Kong University of Science and Technology, Hong Kong SAR, China","institution_ids":["https://openalex.org/I2250653659","https://openalex.org/I889458895"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5087553273","display_name":"Yilun Jin","orcid":"https://orcid.org/0000-0002-9502-7622"},"institutions":[{"id":"https://openalex.org/I200769079","display_name":"Hong Kong University of Science and Technology","ror":"https://ror.org/00q4vv597","country_code":"HK","type":"education","lineage":["https://openalex.org/I200769079"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Yilun Jin","raw_affiliation_strings":["Hong Kong University of Science and Technology, Hong Kong SAR, China"],"affiliations":[{"raw_affiliation_string":"Hong Kong University of Science and Technology, Hong Kong SAR, China","institution_ids":["https://openalex.org/I200769079"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100395351","display_name":"Yiming Zhang","orcid":"https://orcid.org/0000-0001-6450-8485"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yiming Zhang","raw_affiliation_strings":["Shanghai Jiao Tong University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5041900541","display_name":"Victor Junqiu Wei","orcid":"https://orcid.org/0000-0001-5548-7301"},"institutions":[{"id":"https://openalex.org/I111950717","display_name":"Macau University of Science and Technology","ror":"https://ror.org/03jqs2n27","country_code":"MO","type":"education","lineage":["https://openalex.org/I111950717","https://openalex.org/I4391767947"]}],"countries":["MO"],"is_corresponding":false,"raw_author_name":"Victor Junqiu Wei","raw_affiliation_strings":["Macau University of Science and Technology, Macau SAR, China"],"affiliations":[{"raw_affiliation_string":"Macau University of Science and Technology, Macau SAR, China","institution_ids":["https://openalex.org/I111950717"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5057313730","display_name":"Han Tian","orcid":"https://orcid.org/0000-0002-3238-8500"},"institutions":[{"id":"https://openalex.org/I200769079","display_name":"Hong Kong University of Science and Technology","ror":"https://ror.org/00q4vv597","country_code":"HK","type":"education","lineage":["https://openalex.org/I200769079"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Han Tian","raw_affiliation_strings":["Hong Kong University of Science and Technology, Hong Kong SAR, China"],"affiliations":[{"raw_affiliation_string":"Hong Kong University of Science and Technology, Hong Kong SAR, China","institution_ids":["https://openalex.org/I200769079"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100379245","display_name":"Li Chen","orcid":"https://orcid.org/0000-0002-4228-7885"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li Chen","raw_affiliation_strings":["Zhongguancun Laboratory, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Zhongguancun Laboratory, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5025328651","display_name":"Jinbao Xue","orcid":"https://orcid.org/0009-0003-4087-9873"},"institutions":[{"id":"https://openalex.org/I2250653659","display_name":"Tencent (China)","ror":"https://ror.org/00hhjss72","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250653659"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jinbao Xue","raw_affiliation_strings":["Tencent, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Tencent, Beijing, China","institution_ids":["https://openalex.org/I2250653659"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5077922409","display_name":"Yangyu Tao","orcid":"https://orcid.org/0009-0003-0536-4321"},"institutions":[{"id":"https://openalex.org/I2250653659","display_name":"Tencent (China)","ror":"https://ror.org/00hhjss72","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250653659"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yangyu Tao","raw_affiliation_strings":["Tencent, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Tencent, Beijing, China","institution_ids":["https://openalex.org/I2250653659"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5106699401","display_name":"Di Wang","orcid":"https://orcid.org/0000-0002-6047-342X"},"institutions":[{"id":"https://openalex.org/I2250653659","display_name":"Tencent (China)","ror":"https://ror.org/00hhjss72","country_code":"CN","type":"company","lineage":["https://openalex.org/I2250653659"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Di Wang","raw_affiliation_strings":["Tencent, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Tencent, Beijing, China","institution_ids":["https://openalex.org/I2250653659"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100438001","display_name":"Kai Chen","orcid":"https://orcid.org/0000-0003-2587-6028"},"institutions":[{"id":"https://openalex.org/I200769079","display_name":"Hong Kong University of Science and Technology","ror":"https://ror.org/00q4vv597","country_code":"HK","type":"education","lineage":["https://openalex.org/I200769079"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Kai Chen","raw_affiliation_strings":["Hong Kong University of Science and Technology, Hong Kong SAR, China"],"affiliations":[{"raw_affiliation_string":"Hong Kong University of Science and Technology, Hong Kong SAR, China","institution_ids":["https://openalex.org/I200769079"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":10,"corresponding_author_ids":["https://openalex.org/A5013332473"],"corresponding_institution_ids":["https://openalex.org/I2250653659","https://openalex.org/I889458895"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.21882187,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"3055","last_page":"3066"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9966999888420105,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9966999888420105,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11986","display_name":"Scientific Computing and Data Management","score":0.9934999942779541,"subfield":{"id":"https://openalex.org/subfields/1802","display_name":"Information Systems and Management"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.9916999936103821,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8156755566596985},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.6642726063728333},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.6543142795562744},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.6461201310157776},{"id":"https://openalex.org/keywords/data-parallelism","display_name":"Data parallelism","score":0.5778750777244568},{"id":"https://openalex.org/keywords/parallelism","display_name":"Parallelism (grammar)","score":0.4702141582965851},{"id":"https://openalex.org/keywords/general-purpose-computing-on-graphics-processing-units","display_name":"General-purpose computing on graphics processing units","score":0.42645013332366943},{"id":"https://openalex.org/keywords/low-latency","display_name":"Low latency (capital markets)","score":0.4160221219062805},{"id":"https://openalex.org/keywords/computer-architecture","display_name":"Computer architecture","score":0.3285086154937744},{"id":"https://openalex.org/keywords/computer-graphics","display_name":"Computer graphics (images)","score":0.24392253160476685},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.2174016237258911},{"id":"https://openalex.org/keywords/computer-network","display_name":"Computer network","score":0.1212669312953949},{"id":"https://openalex.org/keywords/graphics","display_name":"Graphics","score":0.09008744359016418},{"id":"https://openalex.org/keywords/telecommunications","display_name":"Telecommunications","score":0.07070064544677734}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8156755566596985},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.6642726063728333},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.6543142795562744},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.6461201310157776},{"id":"https://openalex.org/C61483411","wikidata":"https://www.wikidata.org/wiki/Q3124522","display_name":"Data parallelism","level":3,"score":0.5778750777244568},{"id":"https://openalex.org/C2781172179","wikidata":"https://www.wikidata.org/wiki/Q853109","display_name":"Parallelism (grammar)","level":2,"score":0.4702141582965851},{"id":"https://openalex.org/C50630238","wikidata":"https://www.wikidata.org/wiki/Q971505","display_name":"General-purpose computing on graphics processing units","level":3,"score":0.42645013332366943},{"id":"https://openalex.org/C46637626","wikidata":"https://www.wikidata.org/wiki/Q6693015","display_name":"Low latency (capital markets)","level":2,"score":0.4160221219062805},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.3285086154937744},{"id":"https://openalex.org/C121684516","wikidata":"https://www.wikidata.org/wiki/Q7600677","display_name":"Computer graphics (images)","level":1,"score":0.24392253160476685},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.2174016237258911},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.1212669312953949},{"id":"https://openalex.org/C21442007","wikidata":"https://www.wikidata.org/wiki/Q1027879","display_name":"Graphics","level":2,"score":0.09008744359016418},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.07070064544677734}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1145/3711896.3736949","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3711896.3736949","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining V.2","raw_type":"proceedings-article"},{"id":"pmh:oai:repository.hkust.edu.hk:1783.1-166350","is_oa":false,"landing_page_url":"http://repository.hkust.edu.hk/ir/Record/1783.1-166350","pdf_url":null,"source":{"id":"https://openalex.org/S4306401796","display_name":"Rare & Special e-Zone (The Hong Kong University of Science and Technology)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I200769079","host_organization_name":"Hong Kong University of Science and Technology","host_organization_lineage":["https://openalex.org/I200769079"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Conference paper"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":42,"referenced_works":["https://openalex.org/W95608104","https://openalex.org/W1570060426","https://openalex.org/W2492794003","https://openalex.org/W2517617279","https://openalex.org/W2541674938","https://openalex.org/W2565600385","https://openalex.org/W2581624817","https://openalex.org/W2626373159","https://openalex.org/W2734941459","https://openalex.org/W2747329762","https://openalex.org/W2807912816","https://openalex.org/W2899771611","https://openalex.org/W2911181069","https://openalex.org/W2956461999","https://openalex.org/W2990138404","https://openalex.org/W2996834012","https://openalex.org/W2997710335","https://openalex.org/W2998183051","https://openalex.org/W3004334800","https://openalex.org/W3027879771","https://openalex.org/W3033187248","https://openalex.org/W3033737024","https://openalex.org/W3035010485","https://openalex.org/W3035030897","https://openalex.org/W3035038672","https://openalex.org/W3043571714","https://openalex.org/W3091441391","https://openalex.org/W3101163004","https://openalex.org/W3130689885","https://openalex.org/W3167266074","https://openalex.org/W3177265267","https://openalex.org/W3180037928","https://openalex.org/W3206343543","https://openalex.org/W4212946524","https://openalex.org/W4224296706","https://openalex.org/W4294831549","https://openalex.org/W4321471854","https://openalex.org/W4387321091","https://openalex.org/W6676249281","https://openalex.org/W6778883912","https://openalex.org/W6788001715","https://openalex.org/W6798686915"],"related_works":["https://openalex.org/W2950520577","https://openalex.org/W1501159154","https://openalex.org/W1554644772","https://openalex.org/W2003935582","https://openalex.org/W2494130044","https://openalex.org/W3170887803","https://openalex.org/W74409296","https://openalex.org/W3209384898","https://openalex.org/W4400951174","https://openalex.org/W1595834484"],"abstract_inverted_index":{"BERT-like":[0,19,76],"models":[1,20,77,107],"have":[2],"been":[3],"widely":[4],"adopted":[5],"in":[6,109],"text":[7],"mining":[8],"and":[9,50,61,90,124,177,192],"web":[10],"search":[11],"due":[12,58],"to":[13,55,59,115,139,146,168,185,194],"their":[14,33],"high":[15,34],"accuracy.":[16,131],"However,":[17],"large":[18,38],"suffer":[21],"from":[22],"inefficient":[23],"online":[24,52,80,149],"inference":[25,74,127],"on":[26,37,46,174],"GPUs":[27],"for":[28,71],"two":[29,122],"main":[30],"reasons.":[31],"First,":[32],"accuracy":[35,166,191],"relies":[36],"model":[39,97,119],"depth,":[40],"which":[41],"linearly":[42],"increases":[43],"sequential":[44],"computation":[45],"GPUs.":[47],"Second,":[48],"stochastic":[49],"dynamic":[51,148],"workloads":[53,178],"lead":[54],"extra":[56],"costs":[57],"batching":[60],"padding.":[62],"To":[63],"address":[64],"the":[65,94,141,147,160],"problem,":[66],"we":[67,134],"present":[68],"Student":[69,85,113,181],"Parallelism":[70,86,114,182],"efficient":[72],"GPU":[73],"of":[75,101,143,162],"under":[78],"real-world":[79,175],"workloads.":[81,150],"At":[82],"its":[83],"core,":[84],"adopts":[87],"stacking":[88],"distillation":[89],"boosting":[91],"ensemble,":[92],"distilling":[93],"original":[95],"deep":[96],"into":[98],"a":[99,117],"group":[100],"shallow":[102],"but":[103],"virtually":[104],"stacked":[105],"student":[106,137],"running":[108],"parallel.":[110],"This":[111],"enables":[112],"achieve":[116],"low":[118,126],"depth":[120],"(e.g.,":[121],"layers),":[123],"thus":[125],"latency":[128,188],"while":[129,189],"maintaining":[130,190],"In":[132],"addition,":[133],"design":[135],"adaptive":[136],"pruning":[138],"adjust":[140],"number":[142,161],"students":[144,163],"according":[145],"For":[151],"example,":[152],"during":[153,198],"workload":[154,199],"bursts,":[155],"it":[156],"can":[157],"temporarily":[158],"decrease":[159],"with":[164],"minimal":[165],"loss":[167],"improve":[169],"system":[170],"throughput.":[171],"Extensive":[172],"experiments":[173],"datasets":[176],"show":[179],"that":[180],"achieves":[183],"up":[184,193],"4.1\u00d7":[186],"lower":[187],"22.27\u00d7":[195],"higher":[196],"throughput":[197],"bursts.":[200]},"counts_by_year":[],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
