{"id":"https://openalex.org/W7140302920","doi":"https://doi.org/10.48550/arxiv.2603.22535","title":"SCALE-Sim TPU: Validating and Extending SCALE-Sim for TPUs","display_name":"SCALE-Sim TPU: Validating and Extending SCALE-Sim for TPUs","publication_year":2026,"publication_date":"2026-03-23","ids":{"openalex":"https://openalex.org/W7140302920","doi":"https://doi.org/10.48550/arxiv.2603.22535"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.22535","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.22535","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.22535","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5111163378","display_name":"Jingtian Dang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Dang, Jingtian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130546524","display_name":"Ritik Raj","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Raj, Ritik","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5104121412","display_name":"Changhai Man","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Man, Changhai","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5012752485","display_name":"Jianming Tong","orcid":"https://orcid.org/0000-0001-8436-2946"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tong, Jianming","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5124285599","display_name":"Tushar Krishna","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Krishna, Tushar","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5111163378"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.788100004196167,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.788100004196167,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10904","display_name":"Embedded Systems Design Techniques","score":0.05350000038743019,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11195","display_name":"Simulation Techniques and Applications","score":0.024399999529123306,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/compiler","display_name":"Compiler","score":0.8230999708175659},{"id":"https://openalex.org/keywords/usability","display_name":"Usability","score":0.7257999777793884},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.6248999834060669},{"id":"https://openalex.org/keywords/simple","display_name":"Simple (philosophy)","score":0.42010000348091125},{"id":"https://openalex.org/keywords/tensor","display_name":"Tensor (intrinsic definition)","score":0.31299999356269836}],"concepts":[{"id":"https://openalex.org/C169590947","wikidata":"https://www.wikidata.org/wiki/Q47506","display_name":"Compiler","level":2,"score":0.8230999708175659},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8083999752998352},{"id":"https://openalex.org/C170130773","wikidata":"https://www.wikidata.org/wiki/Q216378","display_name":"Usability","level":2,"score":0.7257999777793884},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.6248999834060669},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.42010000348091125},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.4049000144004822},{"id":"https://openalex.org/C459310","wikidata":"https://www.wikidata.org/wiki/Q117801","display_name":"Computational science","level":1,"score":0.3833000063896179},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.3644999861717224},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.3433000147342682},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.3276999890804291},{"id":"https://openalex.org/C155281189","wikidata":"https://www.wikidata.org/wiki/Q3518150","display_name":"Tensor (intrinsic definition)","level":2,"score":0.31299999356269836},{"id":"https://openalex.org/C117220453","wikidata":"https://www.wikidata.org/wiki/Q5172842","display_name":"Correlation","level":2,"score":0.3010999858379364},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.28369998931884766},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.28139999508857727},{"id":"https://openalex.org/C2989134064","wikidata":"https://www.wikidata.org/wiki/Q288510","display_name":"Execution time","level":2,"score":0.2793000042438507}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.22535","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.22535","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.22535","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.22535","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/1","score":0.6267076730728149,"display_name":"No poverty"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Cycle-accurate":[0],"simulators":[1],"are":[2,14],"widely":[3],"used":[4],"to":[5,131],"study":[6],"systolic":[7,56],"accelerators,":[8],"yet":[9],"their":[10],"accuracy":[11],"and":[12,23,38,65,106,129,147],"usability":[13],"often":[15],"limited":[16],"by":[17],"weak":[18],"validation":[19],"against":[20,59],"real":[21],"hardware":[22,77],"poor":[24],"integration":[25],"with":[26,76],"modern":[27,123],"ML":[28,124],"compiler":[29,138],"stacks.":[30],"This":[31],"paper":[32],"presents":[33],"SCALE-Sim":[34,42],"TPU,":[35],"a":[36,72,80,116,136],"validated":[37],"extended":[39],"version":[40],"of":[41,149],"v3":[43],"for":[44,91,152],"TPU-style":[45],"accelerators.":[46],"Specifically,":[47],"we":[48],"make":[49],"three":[50],"contributions:":[51],"(1)":[52],"We":[53,85,114],"validate":[54],"SCALE-Sim's":[55],"GEMM":[57],"model":[58],"measurements":[60],"on":[61,156],"Google":[62],"TPU":[63],"v4":[64],"show":[66],"that":[67,119],"simulated":[68,133],"cycle":[69],"counts":[70],"exhibit":[71],"strong":[73],"linear":[74],"correlation":[75],"latency,":[78],"enabling":[79],"simple":[81],"cycle-to-latency":[82],"mapping.":[83],"(2)":[84],"introduce":[86],"lightweight":[87],"learned":[88],"latency":[89,111],"models":[90],"non-systolic":[92],"elementwise":[93],"operations,":[94],"achieving":[95],"median":[96],"relative":[97],"errors":[98],"below":[99],"3":[100],"percent":[101],"using":[102],"only":[103],"tensor":[104],"size":[105],"shape,":[107],"substantially":[108],"improving":[109],"end-to-end":[110],"estimation.":[112],"(3)":[113],"integrate":[115],"StableHLO-based":[117],"frontend":[118],"allows":[120],"workloads":[121],"from":[122],"frameworks":[125],"such":[126],"as":[127],"JAX":[128],"PyTorch":[130],"be":[132],"directly":[134],"via":[135],"unified":[137],"IR.":[139],"Together,":[140],"these":[141],"contributions":[142],"improve":[143],"the":[144],"fidelity,":[145],"coverage,":[146],"practicality":[148],"cycle-accurate":[150],"simulation":[151],"whole-model":[153],"performance":[154],"analysis":[155],"TPUs.":[157]},"counts_by_year":[],"updated_date":"2026-03-26T06:10:45.909354","created_date":"2026-03-26T00:00:00"}
