{"id":"https://openalex.org/W7134808207","doi":"https://doi.org/10.1109/access.2026.3671798","title":"CAGR: A Cross-Accelerator Graph Optimization Framework for Efficient Recommender System Inference","display_name":"CAGR: A Cross-Accelerator Graph Optimization Framework for Efficient Recommender System Inference","publication_year":2026,"publication_date":"2026-01-01","ids":{"openalex":"https://openalex.org/W7134808207","doi":"https://doi.org/10.1109/access.2026.3671798"},"language":"en","primary_location":{"id":"doi:10.1109/access.2026.3671798","is_oa":true,"landing_page_url":"https://doi.org/10.1109/access.2026.3671798","pdf_url":null,"source":{"id":"https://openalex.org/S2485537415","display_name":"IEEE Access","issn_l":"2169-3536","issn":["2169-3536"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Access","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","doaj"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1109/access.2026.3671798","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5121513870","display_name":"Zijian Shen","orcid":null},"institutions":[{"id":"https://openalex.org/I74973139","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33","country_code":"US","type":"education","lineage":["https://openalex.org/I74973139"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Zijian Shen","raw_affiliation_strings":["Carnegie Mellon University, Pittsburgh, PA, USA"],"raw_orcid":"https://orcid.org/0009-0001-1157-6510","affiliations":[{"raw_affiliation_string":"Carnegie Mellon University, Pittsburgh, PA, USA","institution_ids":["https://openalex.org/I74973139"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128645800","display_name":"Wenyu Zhao","orcid":null},"institutions":[{"id":"https://openalex.org/I4210113369","display_name":"Microsoft Research Asia (China)","ror":"https://ror.org/0300m5276","country_code":"CN","type":"company","lineage":["https://openalex.org/I1290206253","https://openalex.org/I4210113369"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wenyu Zhao","raw_affiliation_strings":["Microsoft, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0000-1510-8477","affiliations":[{"raw_affiliation_string":"Microsoft, Beijing, China","institution_ids":["https://openalex.org/I4210113369"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5009846438","display_name":"B. Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I1174212","display_name":"University of Southern California","ror":"https://ror.org/03taz7m60","country_code":"US","type":"education","lineage":["https://openalex.org/I1174212"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Boyuan Wang","raw_affiliation_strings":["University of Southern California, Los Angeles, CA, USA"],"raw_orcid":"https://orcid.org/0009-0004-6474-1315","affiliations":[{"raw_affiliation_string":"University of Southern California, Los Angeles, CA, USA","institution_ids":["https://openalex.org/I1174212"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128690082","display_name":"Zimeng Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I6902469","display_name":"Brandeis University","ror":"https://ror.org/05abbep66","country_code":"US","type":"education","lineage":["https://openalex.org/I6902469"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Zimeng Wang","raw_affiliation_strings":["Brandeis University, Waltham, MA, USA"],"raw_orcid":"https://orcid.org/0009-0005-8389-0831","affiliations":[{"raw_affiliation_string":"Brandeis University, Waltham, MA, USA","institution_ids":["https://openalex.org/I6902469"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5121476834","display_name":"Wenbin Shang","orcid":null},"institutions":[{"id":"https://openalex.org/I7882870","display_name":"University of Glasgow","ror":"https://ror.org/00vtgdb53","country_code":"GB","type":"education","lineage":["https://openalex.org/I7882870"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Wenbin Shang","raw_affiliation_strings":["University of Glasgow, Glasgow, U.K"],"raw_orcid":"https://orcid.org/0009-0000-3808-3911","affiliations":[{"raw_affiliation_string":"University of Glasgow, Glasgow, U.K","institution_ids":["https://openalex.org/I7882870"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5121513870"],"corresponding_institution_ids":["https://openalex.org/I74973139"],"apc_list":{"value":1850,"currency":"USD","value_usd":1850},"apc_paid":{"value":1850,"currency":"USD","value_usd":1850},"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.34253359,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"14","issue":null,"first_page":"38544","last_page":"38562"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10203","display_name":"Recommender Systems and Techniques","score":0.6432999968528748,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10203","display_name":"Recommender Systems and Techniques","score":0.6432999968528748,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T14347","display_name":"Big Data and Digital Economy","score":0.05990000069141388,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11273","display_name":"Advanced Graph Neural Networks","score":0.04910000041127205,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/implementation","display_name":"Implementation","score":0.7215999960899353},{"id":"https://openalex.org/keywords/recommender-system","display_name":"Recommender system","score":0.6086999773979187},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.5683000087738037},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.47029998898506165},{"id":"https://openalex.org/keywords/bayesian-optimization","display_name":"Bayesian optimization","score":0.46700000762939453},{"id":"https://openalex.org/keywords/graph","display_name":"Graph","score":0.46070000529289246},{"id":"https://openalex.org/keywords/software-deployment","display_name":"Software deployment","score":0.45010000467300415},{"id":"https://openalex.org/keywords/optimization-problem","display_name":"Optimization problem","score":0.42910000681877136},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.4253000020980835}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8561999797821045},{"id":"https://openalex.org/C26713055","wikidata":"https://www.wikidata.org/wiki/Q245962","display_name":"Implementation","level":2,"score":0.7215999960899353},{"id":"https://openalex.org/C557471498","wikidata":"https://www.wikidata.org/wiki/Q554950","display_name":"Recommender system","level":2,"score":0.6086999773979187},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.5683000087738037},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.47029998898506165},{"id":"https://openalex.org/C2778049539","wikidata":"https://www.wikidata.org/wiki/Q17002908","display_name":"Bayesian optimization","level":2,"score":0.46700000762939453},{"id":"https://openalex.org/C132525143","wikidata":"https://www.wikidata.org/wiki/Q141488","display_name":"Graph","level":2,"score":0.46070000529289246},{"id":"https://openalex.org/C105339364","wikidata":"https://www.wikidata.org/wiki/Q2297740","display_name":"Software deployment","level":2,"score":0.45010000467300415},{"id":"https://openalex.org/C137836250","wikidata":"https://www.wikidata.org/wiki/Q984063","display_name":"Optimization problem","level":2,"score":0.42910000681877136},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.4253000020980835},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.36169999837875366},{"id":"https://openalex.org/C2780598303","wikidata":"https://www.wikidata.org/wiki/Q65921492","display_name":"Flexibility (engineering)","level":2,"score":0.36079999804496765},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.35830000042915344},{"id":"https://openalex.org/C93518851","wikidata":"https://www.wikidata.org/wiki/Q180160","display_name":"Metadata","level":2,"score":0.33970001339912415},{"id":"https://openalex.org/C154690210","wikidata":"https://www.wikidata.org/wiki/Q1668499","display_name":"Rewriting","level":2,"score":0.33550000190734863},{"id":"https://openalex.org/C74193536","wikidata":"https://www.wikidata.org/wiki/Q574844","display_name":"Kernel (algebra)","level":2,"score":0.3156999945640564},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.30790001153945923},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.3075999915599823},{"id":"https://openalex.org/C147497476","wikidata":"https://www.wikidata.org/wiki/Q54872","display_name":"RDF","level":3,"score":0.2842000126838684},{"id":"https://openalex.org/C96333769","wikidata":"https://www.wikidata.org/wiki/Q907955","display_name":"Graph traversal","level":3,"score":0.2752000093460083},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.2703999876976013},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.265500009059906},{"id":"https://openalex.org/C558772884","wikidata":"https://www.wikidata.org/wiki/Q1508564","display_name":"Graph rewriting","level":3,"score":0.2597000002861023},{"id":"https://openalex.org/C74197172","wikidata":"https://www.wikidata.org/wiki/Q1195339","display_name":"Directed acyclic graph","level":2,"score":0.25850000977516174},{"id":"https://openalex.org/C139571649","wikidata":"https://www.wikidata.org/wiki/Q1156793","display_name":"Program optimization","level":3,"score":0.25690001249313354},{"id":"https://openalex.org/C172430144","wikidata":"https://www.wikidata.org/wiki/Q17111997","display_name":"Symmetric multiprocessor system","level":2,"score":0.25209999084472656}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/access.2026.3671798","is_oa":true,"landing_page_url":"https://doi.org/10.1109/access.2026.3671798","pdf_url":null,"source":{"id":"https://openalex.org/S2485537415","display_name":"IEEE Access","issn_l":"2169-3536","issn":["2169-3536"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Access","raw_type":"journal-article"},{"id":"pmh:oai:doaj.org/article:65561a9db0b0448ab43e64d54d815565","is_oa":true,"landing_page_url":"https://doaj.org/article/65561a9db0b0448ab43e64d54d815565","pdf_url":null,"source":{"id":"https://openalex.org/S4306401280","display_name":"DOAJ (DOAJ: Directory of Open Access Journals)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by-sa","license_id":"https://openalex.org/licenses/cc-by-sa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"IEEE Access, Vol 14, Pp 38544-38562 (2026)","raw_type":"article"}],"best_oa_location":{"id":"doi:10.1109/access.2026.3671798","is_oa":true,"landing_page_url":"https://doi.org/10.1109/access.2026.3671798","pdf_url":null,"source":{"id":"https://openalex.org/S2485537415","display_name":"IEEE Access","issn_l":"2169-3536","issn":["2169-3536"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Access","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Recommender":[0],"systems":[1,224],"have":[2],"become":[3],"ubiquitous":[4],"in":[5],"modern":[6],"online":[7],"services,":[8],"yet":[9],"their":[10],"deployment":[11],"across":[12,75,145,185],"diverse":[13],"hardware":[14,65],"accelerators":[15],"remains":[16],"challenging":[17],"due":[18,48],"to":[19,49,70,202],"significant":[20],"performance":[21,213],"variations.":[22],"Contemporary":[23],"deep":[24],"learning":[25],"recommendation":[26,94,223],"models":[27,95],"(DLRMs),":[28],"such":[29],"as":[30],"DeepFM":[31],"and":[32,44,52,117,135,162,164,190],"NGCF,":[33],"exhibit":[34],"substantial":[35],"inference":[36,91],"latency":[37],"differences":[38],"when":[39],"executed":[40],"on":[41,168],"NVIDIA":[42,186],"GPUs,AMDGPUs,":[43],"Google":[45,191],"TPUs,":[46],"primarily":[47],"architectural":[50],"disparities":[51],"vendor-specific":[53],"optimization":[54,58,92,130,197],"strategies.":[55],"Existing":[56],"graph":[57,103],"frameworks":[59],"are":[60],"typically":[61],"designed":[62],"for":[63,93,154,221],"specific":[64],"backends,":[66],"lacking":[67],"the":[68,169],"flexibility":[69],"generate":[71],"portable":[72],"high-performance":[73],"implementations":[74,111,184],"heterogeneous":[76,138],"accelerators.":[77],"This":[78],"paper":[79],"presents":[80],"CAGR":[81,151,178,207],"(Cross-Accelerator":[82],"Graph":[83],"Rewriting),":[84],"a":[85,101,122,137],"novel":[86],"framework":[87],"that":[88,106,127,177],"achieves":[89,179],"performance-portable":[90],"through":[96],"three":[97],"key":[98],"innovations:":[99],"(1)":[100],"hardware-aware":[102],"rewriting":[104],"engine":[105],"dynamically":[107],"selects":[108],"optimal":[109],"operator":[110,118],"by":[112,199],"analyzing":[113],"compute-to-memory":[114],"bandwidth":[115],"ratios":[116],"density":[119],"characteristics;":[120],"(2)":[121],"reinforcement":[123],"learning-based":[124],"transformation":[125],"policy":[126],"learns":[128],"cross-platform":[129],"strategies":[131],"without":[132],"exhaustive":[133],"search;":[134],"(3)":[136],"pipeline":[139],"architecture":[140],"enabling":[141],"\u201coptimize":[142],"once,":[143],"deploy":[144],"supported":[146],"backends\u201d":[147],"semantics.":[148],"We":[149],"implement":[150],"with":[152,214],"support":[153],"multiple":[155],"kernel":[156],"backends":[157],"including":[158],"Triton,":[159],"cuBLAS,":[160],"MIOpen,":[161],"oneDNN,":[163],"demonstrate":[165],"its":[166],"effectiveness":[167],"Avazu":[170],"CTR":[171],"prediction":[172],"dataset.":[173],"Experimental":[174],"results":[175],"show":[176],"1.8-3.2\u00d7":[180],"speedup":[181],"over":[182],"baseline":[183],"V100,":[187],"AMD":[188],"MI100,":[189],"TPU":[192],"v3":[193],"platforms,":[194],"while":[195],"reducing":[196],"time":[198],"67%":[200],"compared":[201],"platform-specific":[203],"auto-tuning":[204],"approaches.":[205],"Furthermore,":[206],"maintains":[208],"92-96%":[209],"of":[210],"reference":[211],"optimized":[212],"zero":[215],"manual":[216],"intervention,":[217],"demonstrating":[218],"practical":[219],"viability":[220],"production":[222],"requiring":[225],"multi-vendor":[226],"deployment.":[227]},"counts_by_year":[],"updated_date":"2026-05-06T08:25:59.206177","created_date":"2026-03-11T00:00:00"}
