{"id":"https://openalex.org/W7131405113","doi":"https://doi.org/10.48550/arxiv.2602.20204","title":"Analyzing Latency Hiding and Parallelism in an MLIR-based AI Kernel Compiler","display_name":"Analyzing Latency Hiding and Parallelism in an MLIR-based AI Kernel Compiler","publication_year":2026,"publication_date":"2026-02-22","ids":{"openalex":"https://openalex.org/W7131405113","doi":"https://doi.org/10.48550/arxiv.2602.20204"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.20204","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5126813590","display_name":"Javed Absar","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Absar, Javed","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5116182780","display_name":"Samarth Narang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Narang, Samarth","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5104040289","display_name":"Muthu Manikandan Baskaran","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Baskaran, Muthu","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5126813590"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.8226000070571899,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.8226000070571899,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.03660000115633011,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10904","display_name":"Embedded Systems Design Techniques","score":0.021900000050663948,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/compiler","display_name":"Compiler","score":0.6183000206947327},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.5878999829292297},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.5566999912261963},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.5055000185966492},{"id":"https://openalex.org/keywords/kernel","display_name":"Kernel (algebra)","score":0.4884999990463257},{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.4406000077724457},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.4036000072956085},{"id":"https://openalex.org/keywords/dataflow","display_name":"Dataflow","score":0.3668999969959259},{"id":"https://openalex.org/keywords/compile-time","display_name":"Compile time","score":0.35749998688697815}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8792999982833862},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.7580999732017517},{"id":"https://openalex.org/C169590947","wikidata":"https://www.wikidata.org/wiki/Q47506","display_name":"Compiler","level":2,"score":0.6183000206947327},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.5878999829292297},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.5566999912261963},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.5055000185966492},{"id":"https://openalex.org/C74193536","wikidata":"https://www.wikidata.org/wiki/Q574844","display_name":"Kernel (algebra)","level":2,"score":0.4884999990463257},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.4406000077724457},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.4036000072956085},{"id":"https://openalex.org/C96324660","wikidata":"https://www.wikidata.org/wiki/Q205446","display_name":"Dataflow","level":2,"score":0.3668999969959259},{"id":"https://openalex.org/C200833197","wikidata":"https://www.wikidata.org/wiki/Q333707","display_name":"Compile time","level":3,"score":0.35749998688697815},{"id":"https://openalex.org/C41681595","wikidata":"https://www.wikidata.org/wiki/Q7917855","display_name":"Vectorization (mathematics)","level":2,"score":0.350600004196167},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.3499000072479248},{"id":"https://openalex.org/C2781172179","wikidata":"https://www.wikidata.org/wiki/Q853109","display_name":"Parallelism (grammar)","level":2,"score":0.3443000018596649},{"id":"https://openalex.org/C55526617","wikidata":"https://www.wikidata.org/wiki/Q719375","display_name":"Operand","level":2,"score":0.328000009059906},{"id":"https://openalex.org/C189930140","wikidata":"https://www.wikidata.org/wiki/Q1112878","display_name":"CAS latency","level":4,"score":0.3278000056743622},{"id":"https://openalex.org/C150552126","wikidata":"https://www.wikidata.org/wiki/Q339387","display_name":"SIMD","level":2,"score":0.32089999318122864},{"id":"https://openalex.org/C61483411","wikidata":"https://www.wikidata.org/wiki/Q3124522","display_name":"Data parallelism","level":3,"score":0.31299999356269836},{"id":"https://openalex.org/C7366592","wikidata":"https://www.wikidata.org/wiki/Q1255620","display_name":"Dram","level":2,"score":0.28790000081062317},{"id":"https://openalex.org/C168522837","wikidata":"https://www.wikidata.org/wiki/Q679552","display_name":"Branch predictor","level":2,"score":0.2827000021934509},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.2773999869823456},{"id":"https://openalex.org/C46637626","wikidata":"https://www.wikidata.org/wiki/Q6693015","display_name":"Low latency (capital markets)","level":2,"score":0.27300000190734863},{"id":"https://openalex.org/C76752949","wikidata":"https://www.wikidata.org/wiki/Q7607499","display_name":"Stencil","level":2,"score":0.2703999876976013},{"id":"https://openalex.org/C190902152","wikidata":"https://www.wikidata.org/wiki/Q1325106","display_name":"Optimizing compiler","level":3,"score":0.26100000739097595},{"id":"https://openalex.org/C162319229","wikidata":"https://www.wikidata.org/wiki/Q175263","display_name":"Data structure","level":2,"score":0.25270000100135803}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.20204","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.20204","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.20204","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.20204","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"AI":[0],"kernel":[1],"compilation":[2,44],"for":[3,37,110],"edge":[4],"devices":[5],"depends":[6],"on":[7],"the":[8,19,77,107,136],"compiler's":[9],"ability":[10],"to":[11,61],"exploit":[12],"parallelism":[13],"and":[14,24,34,53,82,84,122,129],"hide":[15],"memory":[16,23],"latency":[17],"in":[18,41],"presence":[20],"of":[21,79,138],"hierarchical":[22],"explicit":[25],"data":[26],"movement.":[27],"This":[28],"paper":[29],"reports":[30],"a":[31,97],"benchmark":[32],"methodology":[33],"corresponding":[35],"results":[36,102],"three":[38],"compiler-controlled":[39],"mechanisms":[40],"an":[42,72],"MLIR-based":[43],"pipeline:":[45],"vectorization":[46,105],"(Vec),":[47],"multi-threading":[48],"(MT)":[49],"across":[50],"hardware":[51],"contexts,":[52],"double":[54],"buffering":[55],"(DB)":[56],"using":[57,94],"ping--pong":[58],"scratchpad":[59],"buffers":[60],"overlap":[62],"DMA":[63],"transfers":[64,128],"with":[65,91],"compute.":[66],"Using":[67],"Triton/Inductor-generated":[68],"kernels,":[69,112],"we":[70,85],"present":[71],"ablation":[73],"ladder":[74],"that":[75,104],"separates":[76],"contribution":[78],"Vec,":[80],"MT,":[81],"DB,":[83],"quantify":[86],"how":[87],"MT":[88,113],"speedup":[89],"scales":[90],"problem":[92],"size":[93],"GELU":[95],"as":[96],"representative":[98],"activation":[99],"kernel.":[100],"The":[101],"show":[103],"provides":[106,124],"primary":[108],"gain":[109],"bandwidth-sensitive":[111],"delivers":[114],"substantial":[115],"improvements":[116],"once":[117],"scheduling":[118],"overhead":[119],"is":[120],"amortized,":[121],"DB":[123],"additional":[125],"benefit":[126],"when":[127],"compute":[130],"can":[131],"be":[132],"overlapped":[133],"(i.e.,":[134],"outside":[135],"extremes":[137],"purely":[139,142],"memory-bound":[140],"or":[141],"compute-bound":[143],"behavior).":[144]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-02-26T00:00:00"}
