{"id":"https://openalex.org/W4414692578","doi":"https://doi.org/10.1109/lca.2025.3616810","title":"Breaking the HBM Bit Cost Barrier: Domain-Specific ECC for AI Inference Infrastructure","display_name":"Breaking the HBM Bit Cost Barrier: Domain-Specific ECC for AI Inference Infrastructure","publication_year":2025,"publication_date":"2025-07-01","ids":{"openalex":"https://openalex.org/W4414692578","doi":"https://doi.org/10.1109/lca.2025.3616810"},"language":"en","primary_location":{"id":"doi:10.1109/lca.2025.3616810","is_oa":false,"landing_page_url":"https://doi.org/10.1109/lca.2025.3616810","pdf_url":null,"source":{"id":"https://openalex.org/S17643076","display_name":"IEEE Computer Architecture Letters","issn_l":"1556-6056","issn":["1556-6056","1556-6064","2473-2575"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Computer Architecture Letters","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5045139717","display_name":"Rui Xie","orcid":"https://orcid.org/0000-0003-3177-5071"},"institutions":[{"id":"https://openalex.org/I165799507","display_name":"Rensselaer Polytechnic Institute","ror":"https://ror.org/01rtyzb94","country_code":"US","type":"education","lineage":["https://openalex.org/I165799507"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Rui Xie","raw_affiliation_strings":["Rensselaer Polytechnic Institute, Troy, NY, USA"],"raw_orcid":"https://orcid.org/0000-0003-3177-5071","affiliations":[{"raw_affiliation_string":"Rensselaer Polytechnic Institute, Troy, NY, USA","institution_ids":["https://openalex.org/I165799507"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113362542","display_name":"Asad Ul Haq","orcid":null},"institutions":[{"id":"https://openalex.org/I165799507","display_name":"Rensselaer Polytechnic Institute","ror":"https://ror.org/01rtyzb94","country_code":"US","type":"education","lineage":["https://openalex.org/I165799507"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Asad Ul Haq","raw_affiliation_strings":["Rensselaer Polytechnic Institute, Troy, NY, USA"],"raw_orcid":"https://orcid.org/0009-0003-7975-0102","affiliations":[{"raw_affiliation_string":"Rensselaer Polytechnic Institute, Troy, NY, USA","institution_ids":["https://openalex.org/I165799507"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Yunhua Fang","orcid":"https://orcid.org/0009-0009-4718-8825"},"institutions":[{"id":"https://openalex.org/I165799507","display_name":"Rensselaer Polytechnic Institute","ror":"https://ror.org/01rtyzb94","country_code":"US","type":"education","lineage":["https://openalex.org/I165799507"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yunhua Fang","raw_affiliation_strings":["Rensselaer Polytechnic Institute, Troy, NY, USA"],"raw_orcid":"https://orcid.org/0009-0009-4718-8825","affiliations":[{"raw_affiliation_string":"Rensselaer Polytechnic Institute, Troy, NY, USA","institution_ids":["https://openalex.org/I165799507"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101209866","display_name":"Linsen Ma","orcid":"https://orcid.org/0009-0000-8535-7911"},"institutions":[{"id":"https://openalex.org/I165799507","display_name":"Rensselaer Polytechnic Institute","ror":"https://ror.org/01rtyzb94","country_code":"US","type":"education","lineage":["https://openalex.org/I165799507"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Linsen Ma","raw_affiliation_strings":["Rensselaer Polytechnic Institute, Troy, NY, USA"],"raw_orcid":"https://orcid.org/0009-0000-8535-7911","affiliations":[{"raw_affiliation_string":"Rensselaer Polytechnic Institute, Troy, NY, USA","institution_ids":["https://openalex.org/I165799507"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5021284824","display_name":"Sanchari Sen","orcid":"https://orcid.org/0000-0003-0080-2882"},"institutions":[{"id":"https://openalex.org/I4210114115","display_name":"IBM Research - Thomas J. Watson Research Center","ror":"https://ror.org/0265w5591","country_code":"US","type":"facility","lineage":["https://openalex.org/I1341412227","https://openalex.org/I4210114115"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Sanchari Sen","raw_affiliation_strings":["IBM T.J. Watson Research Center, Yorktown Heights, NY, USA"],"raw_orcid":"https://orcid.org/0000-0003-0080-2882","affiliations":[{"raw_affiliation_string":"IBM T.J. Watson Research Center, Yorktown Heights, NY, USA","institution_ids":["https://openalex.org/I4210114115"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5010094713","display_name":"Swagath Venkataramani","orcid":"https://orcid.org/0000-0002-0470-6364"},"institutions":[{"id":"https://openalex.org/I4210114115","display_name":"IBM Research - Thomas J. Watson Research Center","ror":"https://ror.org/0265w5591","country_code":"US","type":"facility","lineage":["https://openalex.org/I1341412227","https://openalex.org/I4210114115"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Swagath Venkataramani","raw_affiliation_strings":["IBM T.J. Watson Research Center, Yorktown Heights, NY, USA"],"raw_orcid":"https://orcid.org/0000-0002-0470-6364","affiliations":[{"raw_affiliation_string":"IBM T.J. Watson Research Center, Yorktown Heights, NY, USA","institution_ids":["https://openalex.org/I4210114115"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100324294","display_name":"Liu Liu","orcid":"https://orcid.org/0000-0003-0792-8146"},"institutions":[{"id":"https://openalex.org/I165799507","display_name":"Rensselaer Polytechnic Institute","ror":"https://ror.org/01rtyzb94","country_code":"US","type":"education","lineage":["https://openalex.org/I165799507"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Liu Liu","raw_affiliation_strings":["Rensselaer Polytechnic Institute, Troy, NY, USA"],"raw_orcid":"https://orcid.org/0000-0003-0792-8146","affiliations":[{"raw_affiliation_string":"Rensselaer Polytechnic Institute, Troy, NY, USA","institution_ids":["https://openalex.org/I165799507"]}]},{"author_position":"last","author":{"id":null,"display_name":"Tong Zhang","orcid":"https://orcid.org/0009-0009-8005-0043"},"institutions":[{"id":"https://openalex.org/I165799507","display_name":"Rensselaer Polytechnic Institute","ror":"https://ror.org/01rtyzb94","country_code":"US","type":"education","lineage":["https://openalex.org/I165799507"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Tong Zhang","raw_affiliation_strings":["Rensselaer Polytechnic Institute, Troy, NY, USA"],"raw_orcid":"https://orcid.org/0009-0009-8005-0043","affiliations":[{"raw_affiliation_string":"Rensselaer Polytechnic Institute, Troy, NY, USA","institution_ids":["https://openalex.org/I165799507"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5045139717"],"corresponding_institution_ids":["https://openalex.org/I165799507"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.30265826,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"24","issue":"2","first_page":"313","last_page":"316"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.9718999862670898,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.9718999862670898,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9652000069618225,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.963699996471405,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.6863999962806702},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.5583999752998352},{"id":"https://openalex.org/keywords/reliability","display_name":"Reliability (semiconductor)","score":0.5138999819755554},{"id":"https://openalex.org/keywords/throughput","display_name":"Throughput","score":0.48069998621940613},{"id":"https://openalex.org/keywords/error-detection-and-correction","display_name":"Error detection and correction","score":0.44690001010894775},{"id":"https://openalex.org/keywords/software-deployment","display_name":"Software deployment","score":0.4277999997138977},{"id":"https://openalex.org/keywords/efficient-energy-use","display_name":"Efficient energy use","score":0.38420000672340393},{"id":"https://openalex.org/keywords/bandwidth","display_name":"Bandwidth (computing)","score":0.36390000581741333}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8456000089645386},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.6863999962806702},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.5583999752998352},{"id":"https://openalex.org/C43214815","wikidata":"https://www.wikidata.org/wiki/Q7310987","display_name":"Reliability (semiconductor)","level":3,"score":0.5138999819755554},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.48069998621940613},{"id":"https://openalex.org/C103088060","wikidata":"https://www.wikidata.org/wiki/Q1062839","display_name":"Error detection and correction","level":2,"score":0.44690001010894775},{"id":"https://openalex.org/C200601418","wikidata":"https://www.wikidata.org/wiki/Q2193887","display_name":"Reliability engineering","level":1,"score":0.4449999928474426},{"id":"https://openalex.org/C105339364","wikidata":"https://www.wikidata.org/wiki/Q2297740","display_name":"Software deployment","level":2,"score":0.4277999997138977},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.4260999858379364},{"id":"https://openalex.org/C2742236","wikidata":"https://www.wikidata.org/wiki/Q924713","display_name":"Efficient energy use","level":2,"score":0.38420000672340393},{"id":"https://openalex.org/C2776257435","wikidata":"https://www.wikidata.org/wiki/Q1576430","display_name":"Bandwidth (computing)","level":2,"score":0.36390000581741333},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.3580999970436096},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.3517000079154968},{"id":"https://openalex.org/C63540848","wikidata":"https://www.wikidata.org/wiki/Q3140932","display_name":"Fault tolerance","level":2,"score":0.3366999924182892},{"id":"https://openalex.org/C2778820799","wikidata":"https://www.wikidata.org/wiki/Q3454688","display_name":"Cost reduction","level":2,"score":0.3321000039577484},{"id":"https://openalex.org/C9390403","wikidata":"https://www.wikidata.org/wiki/Q3966","display_name":"Computer hardware","level":1,"score":0.32109999656677246},{"id":"https://openalex.org/C111335779","wikidata":"https://www.wikidata.org/wiki/Q3454686","display_name":"Reduction (mathematics)","level":2,"score":0.3165999948978424},{"id":"https://openalex.org/C115874739","wikidata":"https://www.wikidata.org/wiki/Q825377","display_name":"Critical path method","level":2,"score":0.30720001459121704},{"id":"https://openalex.org/C2775928411","wikidata":"https://www.wikidata.org/wiki/Q2041312","display_name":"Fault injection","level":3,"score":0.28769999742507935},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.2770000100135803},{"id":"https://openalex.org/C2777735758","wikidata":"https://www.wikidata.org/wiki/Q817765","display_name":"Path (computing)","level":2,"score":0.27140000462532043},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.26919999718666077},{"id":"https://openalex.org/C56296756","wikidata":"https://www.wikidata.org/wiki/Q840922","display_name":"Bit error rate","level":3,"score":0.25870001316070557},{"id":"https://openalex.org/C93226319","wikidata":"https://www.wikidata.org/wiki/Q193137","display_name":"Differential (mechanical device)","level":2,"score":0.257999986410141},{"id":"https://openalex.org/C33762810","wikidata":"https://www.wikidata.org/wiki/Q461671","display_name":"Data integrity","level":2,"score":0.2558000087738037},{"id":"https://openalex.org/C46743427","wikidata":"https://www.wikidata.org/wiki/Q1341685","display_name":"Inference engine","level":3,"score":0.25380000472068787}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/lca.2025.3616810","is_oa":false,"landing_page_url":"https://doi.org/10.1109/lca.2025.3616810","pdf_url":null,"source":{"id":"https://openalex.org/S17643076","display_name":"IEEE Computer Architecture Letters","issn_l":"1556-6056","issn":["1556-6056","1556-6064","2473-2575"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Computer Architecture Letters","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":5,"referenced_works":["https://openalex.org/W2099839668","https://openalex.org/W2998617917","https://openalex.org/W3006586535","https://openalex.org/W3204625459","https://openalex.org/W4242862564"],"related_works":[],"abstract_inverted_index":{"High-Bandwidth":[0],"Memory":[1],"(HBM)":[2],"delivers":[3],"exceptional":[4],"bandwidth":[5],"and":[6,46,78,119],"energy":[7],"efficiency":[8],"for":[9],"AI":[10,153],"workloads,":[11],"but":[12],"its":[13],"high":[14],"cost":[15,40],"per":[16],"bit,":[17],"driven":[18],"in":[19,152],"part":[20],"by":[21,42],"stringent":[22],"on-die":[23,44],"reliability":[24,129],"requirements,":[25],"poses":[26],"a":[27,36,57,131,137,144],"growing":[28],"barrier":[29],"to":[30,39,51,74,101,124],"scalable":[31],"deployment.":[32],"This":[33],"work":[34],"explores":[35],"system-level":[37],"approach":[38],"reduction":[41],"eliminating":[43],"ECC":[45,59],"shifting":[47],"all":[48],"fault":[49],"management":[50],"the":[52,106],"memory":[53],"controller.":[54],"We":[55],"introduce":[56],"domain-specific":[58],"framework":[60],"combining":[61],"large-codeword":[62],"Reed\u2013Solomon":[63],"(RS)":[64],"correction":[65],"with":[66],"lightweight":[67],"fine-grained":[68],"CRC":[69],"detection,":[70],"differential":[71],"parity":[72],"updates":[73],"mitigate":[75],"write":[76],"amplification,":[77],"tunable":[79,132],"protection":[80],"based":[81],"on":[82],"data":[83],"importance.":[84],"Our":[85],"evaluation":[86],"using":[87],"LLM":[88],"inference":[89],"workloads":[90],"shows":[91],"that,":[92],"even":[93],"under":[94],"raw":[95],"HBM":[96,150],"bit":[97],"error":[98],"rates":[99],"up":[100],"<inline-formula":[102],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[103],"xmlns:xlink=\"http://www.w3.org/1999/xlink\"><tex-math":[104],"notation=\"LaTeX\">$10^{-3}$</tex-math></inline-formula>,":[105],"system":[107,133],"retains":[108],"78%":[109],"of":[110],"throughput":[111],"while":[112],"maintaining":[113],"at":[114],"least":[115],"97%":[116],"PIQA":[117],"accuracy":[118,122],"94%":[120],"MMLU":[121],"relative":[123],"error-free":[125],"HBM.":[126],"By":[127],"treating":[128],"as":[130],"parameter":[134],"rather":[135],"than":[136],"fixed":[138],"hardware":[139],"constraint,":[140],"our":[141],"design":[142],"opens":[143],"new":[145],"path":[146],"toward":[147],"low-cost,":[148],"high-performance":[149],"deployment":[151],"infrastructure.":[154]},"counts_by_year":[],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
