{"id":"https://openalex.org/W7156024806","doi":"https://doi.org/10.48550/arxiv.2604.22242","title":"Fast GPU Linear Algebra via Compile Time Expression Fusion","display_name":"Fast GPU Linear Algebra via Compile Time Expression Fusion","publication_year":2026,"publication_date":"2026-04-24","ids":{"openalex":"https://openalex.org/W7156024806","doi":"https://doi.org/10.48550/arxiv.2604.22242"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.22242","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.22242","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.22242","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5026833192","display_name":"Ryan R. Curtin","orcid":"https://orcid.org/0000-0002-9903-8214"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Curtin, Ryan R.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5050963242","display_name":"Marcus Edel","orcid":"https://orcid.org/0000-0001-5445-7303"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Edel, Marcus","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5054815937","display_name":"Conrad Sanderson","orcid":"https://orcid.org/0000-0002-0049-4501"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sanderson, Conrad","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9223999977111816,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9223999977111816,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11697","display_name":"Numerical Methods and Algorithms","score":0.02419999986886978,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10904","display_name":"Embedded Systems Design Techniques","score":0.010999999940395355,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/linear-algebra","display_name":"Linear algebra","score":0.698199987411499},{"id":"https://openalex.org/keywords/compiler","display_name":"Compiler","score":0.6026999950408936},{"id":"https://openalex.org/keywords/overhead","display_name":"Overhead (engineering)","score":0.5652999877929688},{"id":"https://openalex.org/keywords/bandicoot","display_name":"Bandicoot","score":0.5138999819755554},{"id":"https://openalex.org/keywords/cuda","display_name":"CUDA","score":0.4674000144004822},{"id":"https://openalex.org/keywords/metaprogramming","display_name":"Metaprogramming","score":0.35190001130104065}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7128999829292297},{"id":"https://openalex.org/C139352143","wikidata":"https://www.wikidata.org/wiki/Q82571","display_name":"Linear algebra","level":2,"score":0.698199987411499},{"id":"https://openalex.org/C169590947","wikidata":"https://www.wikidata.org/wiki/Q47506","display_name":"Compiler","level":2,"score":0.6026999950408936},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.5652999877929688},{"id":"https://openalex.org/C113706061","wikidata":"https://www.wikidata.org/wiki/Q15098051","display_name":"Bandicoot","level":3,"score":0.5138999819755554},{"id":"https://openalex.org/C2778119891","wikidata":"https://www.wikidata.org/wiki/Q477690","display_name":"CUDA","level":2,"score":0.4674000144004822},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.4422000050544739},{"id":"https://openalex.org/C35390924","wikidata":"https://www.wikidata.org/wiki/Q661075","display_name":"Metaprogramming","level":2,"score":0.35190001130104065},{"id":"https://openalex.org/C18364862","wikidata":"https://www.wikidata.org/wiki/Q579978","display_name":"Homological algebra","level":3,"score":0.3334999978542328},{"id":"https://openalex.org/C200833197","wikidata":"https://www.wikidata.org/wiki/Q333707","display_name":"Compile time","level":3,"score":0.31040000915527344},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.30730000138282776},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.3061000108718872},{"id":"https://openalex.org/C49154492","wikidata":"https://www.wikidata.org/wiki/Q5300","display_name":"Central processing unit","level":2,"score":0.25949999690055847}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.22242","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.22242","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.22242","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.22242","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Industry, innovation and infrastructure","id":"https://metadata.un.org/sdg/9","score":0.6375892758369446}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"We":[0],"describe":[1],"the":[2,25,69],"Bandicoot":[3,43,81],"GPU":[4,50],"linear":[5,29,88],"algebra":[6,30,89],"toolkit,":[7],"a":[8],"C++":[9],"based":[10],"library":[11],"that":[12,59,80],"prioritises":[13],"ease":[14],"of":[15],"use":[16],"without":[17],"compromising":[18],"efficiency.":[19],"Bandicoot's":[20],"API":[21],"is":[22],"compatible":[23],"with":[24],"popular":[26],"Armadillo":[27],"CPU":[28],"library,":[31],"enabling":[32],"easy":[33],"transition":[34],"for":[35,71],"existing":[36],"CPU-based":[37],"codebases.":[38],"Unlike":[39],"other":[40],"GPU-focused":[41],"toolkits,":[42],"uses":[44],"template":[45],"metaprogramming":[46],"to":[47,63],"generate":[48],"fused":[49],"kernels":[51,58],"directly":[52],"at":[53],"compile":[54],"time,":[55],"yielding":[56],"efficient":[57],"are":[60],"often":[61],"able":[62],"saturate":[64],"memory":[65],"bandwidth.":[66],"This":[67],"removes":[68],"need":[70],"runtime":[72],"overhead":[73],"or":[74],"JIT":[75],"infrastructure.":[76],"Empirical":[77],"results":[78],"show":[79],"outperforms":[82],"(sometimes":[83],"by":[84],"considerable":[85],"margins)":[86],"commonly-used":[87],"toolkits":[90],"including":[91],"PyTorch,":[92],"TensorFlow,":[93],"and":[94],"JAX.":[95]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-28T00:00:00"}
