{"id":"https://openalex.org/W7133350142","doi":"https://doi.org/10.48550/arxiv.2603.00549","title":"PM2Lat: Highly Accurate and Generalized Prediction of DNN Execution Latency on GPUs","display_name":"PM2Lat: Highly Accurate and Generalized Prediction of DNN Execution Latency on GPUs","publication_year":2026,"publication_date":"2026-02-28","ids":{"openalex":"https://openalex.org/W7133350142","doi":"https://doi.org/10.48550/arxiv.2603.00549"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.00549","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.00549","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.00549","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5128023380","display_name":"Truong-Thanh Le","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Le, Truong-Thanh","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5082673407","display_name":"Hoang-Loc La","orcid":"https://orcid.org/0009-0005-5453-7836"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"La, Hoang-Loc","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127966735","display_name":"Amir Taherkordi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Taherkordi, Amir","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100357767","display_name":"\u0422\u0430\u043e \u0427\u0435\u043d","orcid":"https://orcid.org/0000-0002-8031-7117"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Eliassen, Frank","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127904021","display_name":"Phuong Hoai Ha and","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"and, Phuong Hoai Ha","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5128013927","display_name":"Peiyuan Guan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guan, Peiyuan","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5128023380"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.689300000667572,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.689300000667572,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.10580000281333923,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T14347","display_name":"Big Data and Digital Economy","score":0.039799999445676804,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.5550000071525574},{"id":"https://openalex.org/keywords/artificial-neural-network","display_name":"Artificial neural network","score":0.5378000140190125},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.48660001158714294},{"id":"https://openalex.org/keywords/word-error-rate","display_name":"Word error rate","score":0.4343999922275543},{"id":"https://openalex.org/keywords/matrix-multiplication","display_name":"Matrix multiplication","score":0.42579999566078186},{"id":"https://openalex.org/keywords/deep-neural-networks","display_name":"Deep neural networks","score":0.4203000068664551},{"id":"https://openalex.org/keywords/focus","display_name":"Focus (optics)","score":0.3995000123977661},{"id":"https://openalex.org/keywords/mean-squared-prediction-error","display_name":"Mean squared prediction error","score":0.3659999966621399}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8493000268936157},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.5550000071525574},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.5378000140190125},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.5181999802589417},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.48660001158714294},{"id":"https://openalex.org/C40969351","wikidata":"https://www.wikidata.org/wiki/Q3516228","display_name":"Word error rate","level":2,"score":0.4343999922275543},{"id":"https://openalex.org/C17349429","wikidata":"https://www.wikidata.org/wiki/Q1049914","display_name":"Matrix multiplication","level":3,"score":0.42579999566078186},{"id":"https://openalex.org/C2984842247","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep neural networks","level":3,"score":0.4203000068664551},{"id":"https://openalex.org/C192209626","wikidata":"https://www.wikidata.org/wiki/Q190909","display_name":"Focus (optics)","level":2,"score":0.3995000123977661},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3935000002384186},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.3896999955177307},{"id":"https://openalex.org/C167085575","wikidata":"https://www.wikidata.org/wiki/Q6803654","display_name":"Mean squared prediction error","level":2,"score":0.3659999966621399},{"id":"https://openalex.org/C2989134064","wikidata":"https://www.wikidata.org/wiki/Q288510","display_name":"Execution time","level":2,"score":0.3398999869823456},{"id":"https://openalex.org/C42935608","wikidata":"https://www.wikidata.org/wiki/Q190411","display_name":"Field-programmable gate array","level":2,"score":0.335099995136261},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.3296999931335449},{"id":"https://openalex.org/C78766204","wikidata":"https://www.wikidata.org/wiki/Q555032","display_name":"Multi-core processor","level":2,"score":0.32019999623298645},{"id":"https://openalex.org/C103088060","wikidata":"https://www.wikidata.org/wiki/Q1062839","display_name":"Error detection and correction","level":2,"score":0.3156999945640564},{"id":"https://openalex.org/C122383733","wikidata":"https://www.wikidata.org/wiki/Q865920","display_name":"Approximation error","level":2,"score":0.3073999881744385},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.2944999933242798},{"id":"https://openalex.org/C123657996","wikidata":"https://www.wikidata.org/wiki/Q12271","display_name":"Architecture","level":2,"score":0.2944999933242798},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2890999913215637},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.27559998631477356},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.2621000111103058},{"id":"https://openalex.org/C2778119891","wikidata":"https://www.wikidata.org/wiki/Q477690","display_name":"CUDA","level":2,"score":0.25130000710487366},{"id":"https://openalex.org/C2780595030","wikidata":"https://www.wikidata.org/wiki/Q3860309","display_name":"Multiplication (music)","level":2,"score":0.25029999017715454}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.00549","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.00549","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.00549","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.00549","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"We":[0],"present":[1],"PM2Lat,":[2],"a":[3],"fast":[4],"and":[5,63,101,120,141,159,173],"generalized":[6],"framework":[7],"for":[8,171,178],"accurately":[9],"predicting":[10],"the":[11,39,84,88,166,185],"latency":[12],"of":[13,42,48,91],"deep":[14,31],"neural":[15],"network":[16],"models":[17,33],"on":[18,23,30,98,162],"GPUs,":[19],"with":[20],"special":[21],"focus":[22],"NVIDIA.":[24],"Unlike":[25],"prior":[26],"methods":[27],"that":[28,73,147],"rely":[29],"learning":[32],"or":[34],"handcrafted":[35],"heuristics,":[36],"PM2Lat":[37,92,109,125,148],"leverages":[38],"Single-Instruction-Multiple-Thread":[40],"architecture":[41],"GPUs":[43],"to":[44,94,110,131,182],"model":[45],"execution":[46],"time":[47],"DNN":[49],"models.":[50],"First,":[51],"we":[52,71],"dive":[53],"into":[54],"fine-grained":[55],"GPU":[56,75,134],"operation":[57],"modeling":[58,107],"by":[59,169,174],"studying":[60],"computational":[61],"behavior":[62],"memory":[64],"access":[65],"patterns.":[66],"After":[67],"identifying":[68],"these":[69],"characteristics,":[70],"found":[72],"different":[74,156],"kernels":[76,96,135],"exhibit":[77],"significant":[78],"performance":[79],"disparities,":[80],"even":[81],"when":[82],"serving":[83],"same":[85],"purpose.":[86],"Hence,":[87],"core":[89],"idea":[90],"is":[93,188],"differentiate":[95],"based":[97],"their":[99],"configurations":[100],"analyze":[102],"them":[103],"accordingly.":[104],"This":[105],"kernel-aware":[106],"enables":[108],"achieve":[111],"consistently":[112,149],"low":[113],"prediction":[114],"error":[115,151,186],"across":[116,155],"diverse":[117,183],"data":[118,157],"types":[119,158],"hardware":[121,160],"platforms.":[122],"In":[123],"addition,":[124],"generalizes":[126],"beyond":[127],"standard":[128],"matrix":[129],"multiplication":[130],"support":[132],"complex":[133],"such":[136],"as":[137],"Triton,":[138],"Flash":[139],"Attention,":[140],"Cutlass":[142],"Attention.":[143],"Experimental":[144],"results":[145],"show":[146],"achieves":[150],"rates":[152],"below":[153],"10%":[154],"platforms":[161],"Transformer":[163],"models,":[164],"outperforming":[165],"state-of-the-art":[167],"NeuSight":[168],"10-20%":[170],"FP32":[172],"at":[175,190],"least":[176],"50%":[177],"BF16.":[179],"When":[180],"applying":[181],"kernels,":[184],"rate":[187],"maintained":[189],"3-8%.":[191]},"counts_by_year":[],"updated_date":"2026-03-04T07:09:34.246503","created_date":"2026-03-04T00:00:00"}
