{"id":"https://openalex.org/W7134889149","doi":"https://doi.org/10.1145/3779212.3790163","title":"Fine-grained and Non-intrusive LLM Training Monitoring via Microsecond-level Traffic Measurement","display_name":"Fine-grained and Non-intrusive LLM Training Monitoring via Microsecond-level Traffic Measurement","publication_year":2026,"publication_date":"2026-03-10","ids":{"openalex":"https://openalex.org/W7134889149","doi":"https://doi.org/10.1145/3779212.3790163"},"language":null,"primary_location":{"id":"doi:10.1145/3779212.3790163","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3779212.3790163","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 31st ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3779212.3790163","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5102594988","display_name":"Yibo Xiao","orcid":null},"institutions":[{"id":"https://openalex.org/I881766915","display_name":"Nanjing University","ror":"https://ror.org/01rxvg760","country_code":"CN","type":"education","lineage":["https://openalex.org/I881766915"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Yibo Xiao","raw_affiliation_strings":["State Key Laboratory of Novel Software Technology, Nanjing University, Nanjing, China"],"affiliations":[{"raw_affiliation_string":"State Key Laboratory of Novel Software Technology, Nanjing University, Nanjing, China","institution_ids":["https://openalex.org/I881766915"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128714682","display_name":"Hao Zheng","orcid":null},"institutions":[{"id":"https://openalex.org/I881766915","display_name":"Nanjing University","ror":"https://ror.org/01rxvg760","country_code":"CN","type":"education","lineage":["https://openalex.org/I881766915"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hao Zheng","raw_affiliation_strings":["State Key Laboratory of Novel Software Technology, Nanjing University, Nanjing, China"],"affiliations":[{"raw_affiliation_string":"State Key Laboratory of Novel Software Technology, Nanjing University, Nanjing, China","institution_ids":["https://openalex.org/I881766915"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120986827","display_name":"Haifeng Sun","orcid":null},"institutions":[{"id":"https://openalex.org/I165932596","display_name":"National University of Singapore","ror":"https://ror.org/01tgyzw49","country_code":"SG","type":"education","lineage":["https://openalex.org/I165932596"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Haifeng Sun","raw_affiliation_strings":["National University of Singapore, Singapore, Singapore"],"affiliations":[{"raw_affiliation_string":"National University of Singapore, Singapore, Singapore","institution_ids":["https://openalex.org/I165932596"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5068082565","display_name":"Qingkai Meng","orcid":"https://orcid.org/0000-0001-5394-8450"},"institutions":[{"id":"https://openalex.org/I881766915","display_name":"Nanjing University","ror":"https://ror.org/01rxvg760","country_code":"CN","type":"education","lineage":["https://openalex.org/I881766915"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Qingkai Meng","raw_affiliation_strings":["State Key Laboratory of Novel Software Technology, Nanjing University, Nanjing, China"],"affiliations":[{"raw_affiliation_string":"State Key Laboratory of Novel Software Technology, Nanjing University, Nanjing, China","institution_ids":["https://openalex.org/I881766915"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128705077","display_name":"Jiong Duan","orcid":null},"institutions":[{"id":"https://openalex.org/I881766915","display_name":"Nanjing University","ror":"https://ror.org/01rxvg760","country_code":"CN","type":"education","lineage":["https://openalex.org/I881766915"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jiong Duan","raw_affiliation_strings":["State Key Laboratory of Novel Software Technology, Nanjing University, Nanjing, China"],"affiliations":[{"raw_affiliation_string":"State Key Laboratory of Novel Software Technology, Nanjing University, Nanjing, China","institution_ids":["https://openalex.org/I881766915"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101039643","display_name":"Xiaohe Hu","orcid":"https://orcid.org/0000-0003-1487-2419"},"institutions":[{"id":"https://openalex.org/I1324360087","display_name":"Infinera (United States)","ror":"https://ror.org/02ty5cy83","country_code":"US","type":"company","lineage":["https://openalex.org/I1324360087"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Xiaohe Hu","raw_affiliation_strings":["Infrawaves, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Infrawaves, Beijing, China","institution_ids":["https://openalex.org/I1324360087"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128708308","display_name":"Rong Gu","orcid":null},"institutions":[{"id":"https://openalex.org/I881766915","display_name":"Nanjing University","ror":"https://ror.org/01rxvg760","country_code":"CN","type":"education","lineage":["https://openalex.org/I881766915"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Rong Gu","raw_affiliation_strings":["State Key Laboratory of Novel Software Technology, Nanjing University, Nanjing, China"],"affiliations":[{"raw_affiliation_string":"State Key Laboratory of Novel Software Technology, Nanjing University, Nanjing, China","institution_ids":["https://openalex.org/I881766915"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123241526","display_name":"Guihai Chen","orcid":null},"institutions":[{"id":"https://openalex.org/I881766915","display_name":"Nanjing University","ror":"https://ror.org/01rxvg760","country_code":"CN","type":"education","lineage":["https://openalex.org/I881766915"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Guihai Chen","raw_affiliation_strings":["State Key Laboratory of Novel Software Technology, Nanjing University, Nanjing, China"],"affiliations":[{"raw_affiliation_string":"State Key Laboratory of Novel Software Technology, Nanjing University, Nanjing, China","institution_ids":["https://openalex.org/I881766915"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5049696912","display_name":"Chen Tian","orcid":"https://orcid.org/0000-0002-3388-9024"},"institutions":[{"id":"https://openalex.org/I881766915","display_name":"Nanjing University","ror":"https://ror.org/01rxvg760","country_code":"CN","type":"education","lineage":["https://openalex.org/I881766915"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chen Tian","raw_affiliation_strings":["State Key Laboratory of Novel Software Technology, Nanjing University, Nanjing, China"],"affiliations":[{"raw_affiliation_string":"State Key Laboratory of Novel Software Technology, Nanjing University, Nanjing, China","institution_ids":["https://openalex.org/I881766915"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5102594988"],"corresponding_institution_ids":["https://openalex.org/I881766915"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.85622703,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"764","last_page":"782"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12127","display_name":"Software System Performance and Reliability","score":0.1120000034570694,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12127","display_name":"Software System Performance and Reliability","score":0.1120000034570694,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11598","display_name":"Internet Traffic Analysis and Secure E-voting","score":0.10859999805688858,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.05990000069141388,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/testbed","display_name":"Testbed","score":0.7067999839782715},{"id":"https://openalex.org/keywords/remote-direct-memory-access","display_name":"Remote direct memory access","score":0.6714000105857849},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.5694000124931335},{"id":"https://openalex.org/keywords/synchronization","display_name":"Synchronization (alternating current)","score":0.4959999918937683},{"id":"https://openalex.org/keywords/overhead","display_name":"Overhead (engineering)","score":0.4458000063896179},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.4251999855041504},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.4113999903202057},{"id":"https://openalex.org/keywords/operator","display_name":"Operator (biology)","score":0.36079999804496765}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7257000207901001},{"id":"https://openalex.org/C31395832","wikidata":"https://www.wikidata.org/wiki/Q1318674","display_name":"Testbed","level":2,"score":0.7067999839782715},{"id":"https://openalex.org/C130795937","wikidata":"https://www.wikidata.org/wiki/Q2561570","display_name":"Remote direct memory access","level":2,"score":0.6714000105857849},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.5776000022888184},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.5694000124931335},{"id":"https://openalex.org/C2778562939","wikidata":"https://www.wikidata.org/wiki/Q1298791","display_name":"Synchronization (alternating current)","level":3,"score":0.4959999918937683},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.4458000063896179},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.4251999855041504},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.4113999903202057},{"id":"https://openalex.org/C17020691","wikidata":"https://www.wikidata.org/wiki/Q139677","display_name":"Operator (biology)","level":5,"score":0.36079999804496765},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.35179999470710754},{"id":"https://openalex.org/C12269588","wikidata":"https://www.wikidata.org/wiki/Q132364","display_name":"Communications protocol","level":2,"score":0.305400013923645},{"id":"https://openalex.org/C2779019669","wikidata":"https://www.wikidata.org/wiki/Q25203946","display_name":"Asynchrony (computer programming)","level":3,"score":0.2978000044822693},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.2955999970436096},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.28790000081062317},{"id":"https://openalex.org/C63540848","wikidata":"https://www.wikidata.org/wiki/Q3140932","display_name":"Fault tolerance","level":2,"score":0.2797999978065491},{"id":"https://openalex.org/C43214815","wikidata":"https://www.wikidata.org/wiki/Q7310987","display_name":"Reliability (semiconductor)","level":3,"score":0.2718000113964081},{"id":"https://openalex.org/C22927095","wikidata":"https://www.wikidata.org/wiki/Q1784206","display_name":"Stateful firewall","level":3,"score":0.2676999866962433},{"id":"https://openalex.org/C93996380","wikidata":"https://www.wikidata.org/wiki/Q44127","display_name":"Server","level":2,"score":0.2628999948501587},{"id":"https://openalex.org/C67388219","wikidata":"https://www.wikidata.org/wiki/Q207440","display_name":"Hash table","level":3,"score":0.26190000772476196},{"id":"https://openalex.org/C101765175","wikidata":"https://www.wikidata.org/wiki/Q577764","display_name":"Communications system","level":2,"score":0.25780001282691956},{"id":"https://openalex.org/C124184767","wikidata":"https://www.wikidata.org/wiki/Q1591311","display_name":"Scram","level":2,"score":0.2565000057220459},{"id":"https://openalex.org/C739882","wikidata":"https://www.wikidata.org/wiki/Q3560506","display_name":"Anomaly detection","level":2,"score":0.2549000084400177}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3779212.3790163","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3779212.3790163","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 31st ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3779212.3790163","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3779212.3790163","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 31st ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":18,"referenced_works":["https://openalex.org/W1990249073","https://openalex.org/W2170715240","https://openalex.org/W2501089562","https://openalex.org/W2746568623","https://openalex.org/W2834288129","https://openalex.org/W2969388332","https://openalex.org/W3046644368","https://openalex.org/W3081168214","https://openalex.org/W3129831491","https://openalex.org/W3204998121","https://openalex.org/W4307020348","https://openalex.org/W4386348926","https://openalex.org/W4386396783","https://openalex.org/W4401175863","https://openalex.org/W4401176590","https://openalex.org/W4401176799","https://openalex.org/W4401176811","https://openalex.org/W4413756477"],"related_works":[],"abstract_inverted_index":{"Large":[0],"language":[1],"model":[2],"(LLM)":[3],"training":[4,26,66,183,193],"is":[5,84],"prone":[6],"to":[7,10,20,29,65,85,170],"anomalies":[8,36],"due":[9],"its":[11,126],"long":[12],"duration":[13],"and":[14,62,77,100,111,124,151,165,178],"large":[15],"scale,":[16],"which":[17,56],"can":[18],"lead":[19],"significant":[21],"performance":[22],"degradation":[23],"or":[24,68],"even":[25,152],"crashes.":[27],"Due":[28],"the":[30,38,156],"synchronization":[31],"nature":[32],"of":[33,141,155],"LLM":[34,192],"training,":[35],"exhibit":[37],"cascading":[39],"effect,":[40],"making":[41,185],"their":[42],"diagnosis":[43],"challenging.":[44],"Existing":[45],"approaches":[46],"rely":[47],"on":[48,98,117,182],"collecting":[49],"communication":[50,69,105],"operator":[51,106],"information":[52],"via":[53,89],"code":[54,67],"instrumentation,":[55],"yields":[57],"only":[58,149],"coarse-grained":[59],"monitoring":[60,79,88],"data":[61],"requires":[63],"modifications":[64],"libraries.":[70],"We":[71,114],"propose":[72],"Pulse,":[73],"a":[74,118,187],"fine-grained,":[75],"non-intrusive,":[76],"easy-to-deploy":[78],"system.":[80],"Our":[81],"key":[82],"idea":[83],"enable":[86],"fine-grained":[87,110],"traffic":[90,96],"measurement.":[91],"Pulse":[92,116,134,160],"conducts":[93],"microsecond-level":[94],"RDMA":[95,173],"measurement":[97],"NICs,":[99],"transforms":[101],"flow-level":[102],"measurements":[103,175],"into":[104],"measurements,":[107],"thereby":[108],"enabling":[109],"non-intrusive":[112],"monitoring.":[113],"deploy":[115],"testbed":[119],"with":[120],"64":[121],"H200":[122],"GPUs":[123],"evaluate":[125],"anomaly":[127],"localization":[128,137],"capability":[129],"under":[130],"common":[131],"failure":[132],"scenarios.":[133,158],"achieves":[135,161],"machine-level":[136],"in":[138,148],"10":[139],"out":[140],"12":[142],"scenarios,":[143],"while":[144],"existing":[145],"methods":[146],"succeed":[147],"4":[150],"misdiagnose":[153],"2":[154],"remaining":[157],"Additionally,":[159],"over":[162],"90%":[163],"precision":[164],"100%":[166],"recall,":[167],"supports":[168],"up":[169],"2000":[171],"concurrent":[172],"flow":[174],"per":[176],"NIC,":[177],"imposes":[179],"negligible":[180],"overhead":[181],"performance,":[184],"it":[186],"practical":[188],"solution":[189],"for":[190],"real-world":[191],"environments.":[194]},"counts_by_year":[],"updated_date":"2026-03-12T06:18:43.230356","created_date":"2026-03-12T00:00:00"}
