{"id":"https://openalex.org/W7084027034","doi":"https://doi.org/10.1109/infocomwkshps65812.2025.11152775","title":"Demo: An End-to-End Assurance Framework for AI/ML Workloads in Datacenters","display_name":"Demo: An End-to-End Assurance Framework for AI/ML Workloads in Datacenters","publication_year":2025,"publication_date":"2025-05-19","ids":{"openalex":"https://openalex.org/W7084027034","doi":"https://doi.org/10.1109/infocomwkshps65812.2025.11152775"},"language":"en","primary_location":{"id":"doi:10.1109/infocomwkshps65812.2025.11152775","is_oa":false,"landing_page_url":"https://doi.org/10.1109/infocomwkshps65812.2025.11152775","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE INFOCOM 2025 - IEEE Conference on Computer Communications Workshops (INFOCOM WKSHPS)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Jit Gupta","orcid":null},"institutions":[{"id":"https://openalex.org/I1339145263","display_name":"Juniper Networks (United States)","ror":"https://ror.org/02pwct569","country_code":"US","type":"company","lineage":["https://openalex.org/I1339145263"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Jit Gupta","raw_affiliation_strings":["Juniper Networks,Sunnyvale,CA,USA,94089"],"affiliations":[{"raw_affiliation_string":"Juniper Networks,Sunnyvale,CA,USA,94089","institution_ids":["https://openalex.org/I1339145263"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Tarun Banka","orcid":null},"institutions":[{"id":"https://openalex.org/I1339145263","display_name":"Juniper Networks (United States)","ror":"https://ror.org/02pwct569","country_code":"US","type":"company","lineage":["https://openalex.org/I1339145263"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Tarun Banka","raw_affiliation_strings":["Juniper Networks,Sunnyvale,CA,USA,94089"],"affiliations":[{"raw_affiliation_string":"Juniper Networks,Sunnyvale,CA,USA,94089","institution_ids":["https://openalex.org/I1339145263"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Rahul Gupta","orcid":null},"institutions":[{"id":"https://openalex.org/I1339145263","display_name":"Juniper Networks (United States)","ror":"https://ror.org/02pwct569","country_code":"US","type":"company","lineage":["https://openalex.org/I1339145263"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Rahul Gupta","raw_affiliation_strings":["Juniper Networks,Sunnyvale,CA,USA,94089"],"affiliations":[{"raw_affiliation_string":"Juniper Networks,Sunnyvale,CA,USA,94089","institution_ids":["https://openalex.org/I1339145263"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Mithun Dharmaraj","orcid":null},"institutions":[{"id":"https://openalex.org/I1339145263","display_name":"Juniper Networks (United States)","ror":"https://ror.org/02pwct569","country_code":"US","type":"company","lineage":["https://openalex.org/I1339145263"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Mithun Dharmaraj","raw_affiliation_strings":["Juniper Networks,Sunnyvale,CA,USA,94089"],"affiliations":[{"raw_affiliation_string":"Juniper Networks,Sunnyvale,CA,USA,94089","institution_ids":["https://openalex.org/I1339145263"]}]},{"author_position":"last","author":{"id":null,"display_name":"Jasleen Kaur","orcid":null},"institutions":[{"id":"https://openalex.org/I1339145263","display_name":"Juniper Networks (United States)","ror":"https://ror.org/02pwct569","country_code":"US","type":"company","lineage":["https://openalex.org/I1339145263"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jasleen Kaur","raw_affiliation_strings":["Juniper Networks,Sunnyvale,CA,USA,94089"],"affiliations":[{"raw_affiliation_string":"Juniper Networks,Sunnyvale,CA,USA,94089","institution_ids":["https://openalex.org/I1339145263"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I1339145263"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.58109732,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"2"},"is_retracted":false,"is_paratext":false,"is_xpac":true,"primary_topic":{"id":"https://openalex.org/T12364","display_name":"Archaeological Research and Protection","score":0.13570000231266022,"subfield":{"id":"https://openalex.org/subfields/1912","display_name":"Space and Planetary Science"},"field":{"id":"https://openalex.org/fields/19","display_name":"Earth and Planetary Sciences"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12364","display_name":"Archaeological Research and Protection","score":0.13570000231266022,"subfield":{"id":"https://openalex.org/subfields/1912","display_name":"Space and Planetary Science"},"field":{"id":"https://openalex.org/fields/19","display_name":"Earth and Planetary Sciences"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11164","display_name":"Remote Sensing and LiDAR Applications","score":0.08129999786615372,"subfield":{"id":"https://openalex.org/subfields/2305","display_name":"Environmental Engineering"},"field":{"id":"https://openalex.org/fields/23","display_name":"Environmental Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12157","display_name":"Geochemistry and Geologic Mapping","score":0.07429999858140945,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/troubleshooting","display_name":"Troubleshooting","score":0.8253999948501587},{"id":"https://openalex.org/keywords/workload","display_name":"Workload","score":0.6122000217437744},{"id":"https://openalex.org/keywords/artifact","display_name":"Artifact (error)","score":0.5224999785423279},{"id":"https://openalex.org/keywords/dependency","display_name":"Dependency (UML)","score":0.4339999854564667},{"id":"https://openalex.org/keywords/root-cause","display_name":"Root cause","score":0.4327999949455261},{"id":"https://openalex.org/keywords/interrupt","display_name":"Interrupt","score":0.4081999957561493},{"id":"https://openalex.org/keywords/server","display_name":"Server","score":0.38269999623298645},{"id":"https://openalex.org/keywords/abstraction","display_name":"Abstraction","score":0.3799000084400177},{"id":"https://openalex.org/keywords/root-cause-analysis","display_name":"Root cause analysis","score":0.37619999051094055},{"id":"https://openalex.org/keywords/matching","display_name":"Matching (statistics)","score":0.3736000061035156}],"concepts":[{"id":"https://openalex.org/C147494362","wikidata":"https://www.wikidata.org/wiki/Q2078905","display_name":"Troubleshooting","level":2,"score":0.8253999948501587},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8014000058174133},{"id":"https://openalex.org/C2778476105","wikidata":"https://www.wikidata.org/wiki/Q628539","display_name":"Workload","level":2,"score":0.6122000217437744},{"id":"https://openalex.org/C2779010991","wikidata":"https://www.wikidata.org/wiki/Q2720909","display_name":"Artifact (error)","level":2,"score":0.5224999785423279},{"id":"https://openalex.org/C19768560","wikidata":"https://www.wikidata.org/wiki/Q320727","display_name":"Dependency (UML)","level":2,"score":0.4339999854564667},{"id":"https://openalex.org/C84945661","wikidata":"https://www.wikidata.org/wiki/Q7366567","display_name":"Root cause","level":2,"score":0.4327999949455261},{"id":"https://openalex.org/C41661131","wikidata":"https://www.wikidata.org/wiki/Q220764","display_name":"Interrupt","level":3,"score":0.4081999957561493},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.40700000524520874},{"id":"https://openalex.org/C93996380","wikidata":"https://www.wikidata.org/wiki/Q44127","display_name":"Server","level":2,"score":0.38269999623298645},{"id":"https://openalex.org/C124304363","wikidata":"https://www.wikidata.org/wiki/Q673661","display_name":"Abstraction","level":2,"score":0.3799000084400177},{"id":"https://openalex.org/C130963320","wikidata":"https://www.wikidata.org/wiki/Q1401207","display_name":"Root cause analysis","level":2,"score":0.37619999051094055},{"id":"https://openalex.org/C165064840","wikidata":"https://www.wikidata.org/wiki/Q1321061","display_name":"Matching (statistics)","level":2,"score":0.3736000061035156},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.3684999942779541},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.36340001225471497},{"id":"https://openalex.org/C36299963","wikidata":"https://www.wikidata.org/wiki/Q1369844","display_name":"Observability","level":2,"score":0.3571000099182129},{"id":"https://openalex.org/C2780378061","wikidata":"https://www.wikidata.org/wiki/Q25351891","display_name":"Service (business)","level":2,"score":0.3546999990940094},{"id":"https://openalex.org/C2776505523","wikidata":"https://www.wikidata.org/wiki/Q4785468","display_name":"Plan (archaeology)","level":2,"score":0.3522000014781952},{"id":"https://openalex.org/C67069471","wikidata":"https://www.wikidata.org/wiki/Q1140205","display_name":"IT service continuity","level":2,"score":0.34529998898506165},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.3416999876499176},{"id":"https://openalex.org/C2777735758","wikidata":"https://www.wikidata.org/wiki/Q817765","display_name":"Path (computing)","level":2,"score":0.33820000290870667},{"id":"https://openalex.org/C193519340","wikidata":"https://www.wikidata.org/wiki/Q891179","display_name":"Data loss","level":2,"score":0.32170000672340393},{"id":"https://openalex.org/C2777615720","wikidata":"https://www.wikidata.org/wiki/Q11888847","display_name":"Prioritization","level":2,"score":0.31690001487731934},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.30309998989105225},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.29319998621940613},{"id":"https://openalex.org/C116834253","wikidata":"https://www.wikidata.org/wiki/Q2039217","display_name":"Identification (biology)","level":2,"score":0.289900004863739},{"id":"https://openalex.org/C177212765","wikidata":"https://www.wikidata.org/wiki/Q627335","display_name":"Workflow","level":2,"score":0.2831000089645386},{"id":"https://openalex.org/C111852164","wikidata":"https://www.wikidata.org/wiki/Q1414679","display_name":"Handover","level":2,"score":0.2703999876976013},{"id":"https://openalex.org/C2776459999","wikidata":"https://www.wikidata.org/wiki/Q2119376","display_name":"Fidelity","level":2,"score":0.2685000002384186},{"id":"https://openalex.org/C26713055","wikidata":"https://www.wikidata.org/wiki/Q245962","display_name":"Implementation","level":2,"score":0.2660999894142151},{"id":"https://openalex.org/C49937458","wikidata":"https://www.wikidata.org/wiki/Q2599292","display_name":"Probabilistic logic","level":2,"score":0.26589998602867126},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.25850000977516174},{"id":"https://openalex.org/C174839445","wikidata":"https://www.wikidata.org/wiki/Q1134386","display_name":"Lock (firearm)","level":2,"score":0.2556000053882599},{"id":"https://openalex.org/C141571065","wikidata":"https://www.wikidata.org/wiki/Q1771949","display_name":"Performance measurement","level":2,"score":0.2549999952316284}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/infocomwkshps65812.2025.11152775","is_oa":false,"landing_page_url":"https://doi.org/10.1109/infocomwkshps65812.2025.11152775","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE INFOCOM 2025 - IEEE Conference on Computer Communications Workshops (INFOCOM WKSHPS)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":3,"referenced_works":["https://openalex.org/W2498764059","https://openalex.org/W3012125688","https://openalex.org/W4362566357"],"related_works":[],"abstract_inverted_index":{"Modern":[0],"machine":[1],"learning":[2],"workloads":[3,29],"such":[4,106],"as":[5,107],"large":[6],"language":[7],"model":[8],"training,":[9],"fine-tuning":[10],"jobs":[11],"are":[12,101],"highly":[13],"distributed":[14],"and":[15,38,56,70,81],"span":[16],"across":[17],"hundreds":[18],"of":[19,33,43],"systems":[20],"with":[21],"multiple":[22],"GPUs.":[23],"Job":[24],"completion":[25],"time":[26],"for":[27,59,62,73,103],"these":[28],"is":[30,49],"the":[31,34,53,60],"artifact":[32],"application,":[35],"compute,":[36],"network":[37],"storage":[39],"performance.":[40],"In":[41],"case":[42],"failure":[44],"or":[45],"degraded":[46],"performance":[47,76],"it":[48],"imperative":[50],"to":[51],"understand":[52],"root":[54],"cause":[55],"possible":[57],"remediation":[58],"problem":[61],"end-to-end":[63,104],"assurance.":[64],"This":[65],"demo":[66],"showcases":[67],"SaaS-based":[68],"observability":[69],"automated":[71],"troubleshooting":[72],"AI/ML":[74],"workload":[75],"issues":[77],"using":[78],"cross-layer":[79],"telemetry":[80],"logs":[82],"(e.g.,":[83],"Application":[84],"telemetry,":[85],"Collective":[86],"communication":[87],"logs,":[88],"GPU":[89],"Health":[90],"metrics,":[91],"Network":[92],"Flow":[93],"Data,":[94],"NIC":[95],"ROCEv2":[96],"telemetry).":[97],"Different":[98],"use":[99],"cases":[100],"demonstrated":[102],"assurance":[105],"Cross-layer":[108,111],"Dependency":[109],"Graph,":[110],"Service":[112],"Level":[113],"Expectations,":[114],"Automated":[115],"Root":[116],"Cause":[117],"Analysis,":[118],"GPU-to-GPU":[119],"application":[120],"path":[121],"tracing.":[122]},"counts_by_year":[],"updated_date":"2025-12-22T23:10:17.713674","created_date":"2025-10-10T00:00:00"}
