{"id":"https://openalex.org/W4416429522","doi":"https://doi.org/10.1109/iccad66269.2025.11240791","title":"Diff-DiT: Temporal Differential Accelerator for Low-bit Diffusion Transformers on FPGA","display_name":"Diff-DiT: Temporal Differential Accelerator for Low-bit Diffusion Transformers on FPGA","publication_year":2025,"publication_date":"2025-10-26","ids":{"openalex":"https://openalex.org/W4416429522","doi":"https://doi.org/10.1109/iccad66269.2025.11240791"},"language":null,"primary_location":{"id":"doi:10.1109/iccad66269.2025.11240791","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iccad66269.2025.11240791","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/ACM International Conference On Computer Aided Design (ICCAD)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5087976552","display_name":"Shidi Tang","orcid":"https://orcid.org/0000-0001-5493-7411"},"institutions":[{"id":"https://openalex.org/I4210090971","display_name":"Southeast University","ror":"https://ror.org/00cf0ab87","country_code":"BD","type":"education","lineage":["https://openalex.org/I4210090971"]}],"countries":["BD"],"is_corresponding":true,"raw_author_name":"Shidi Tang","raw_affiliation_strings":["School of Integrated Circuit, Southeast University"],"affiliations":[{"raw_affiliation_string":"School of Integrated Circuit, Southeast University","institution_ids":["https://openalex.org/I4210090971"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5117545672","display_name":"Pengwei Zheng","orcid":null},"institutions":[{"id":"https://openalex.org/I4210090971","display_name":"Southeast University","ror":"https://ror.org/00cf0ab87","country_code":"BD","type":"education","lineage":["https://openalex.org/I4210090971"]}],"countries":["BD"],"is_corresponding":false,"raw_author_name":"Pengwei Zheng","raw_affiliation_strings":["School of Integrated Circuit, Southeast University"],"affiliations":[{"raw_affiliation_string":"School of Integrated Circuit, Southeast University","institution_ids":["https://openalex.org/I4210090971"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100761386","display_name":"Ruiqi Chen","orcid":"https://orcid.org/0000-0001-6837-5675"},"institutions":[{"id":"https://openalex.org/I13469542","display_name":"Vrije Universiteit Brussel","ror":"https://ror.org/006e5kg04","country_code":"BE","type":"education","lineage":["https://openalex.org/I13469542"]}],"countries":["BE"],"is_corresponding":false,"raw_author_name":"Ruiqi Chen","raw_affiliation_strings":["ETRO, Vrije Universiteit Brussel"],"affiliations":[{"raw_affiliation_string":"ETRO, Vrije Universiteit Brussel","institution_ids":["https://openalex.org/I13469542"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5108874948","display_name":"Yuxuan Lv","orcid":"https://orcid.org/0009-0002-9128-1479"},"institutions":[{"id":"https://openalex.org/I4210090971","display_name":"Southeast University","ror":"https://ror.org/00cf0ab87","country_code":"BD","type":"education","lineage":["https://openalex.org/I4210090971"]}],"countries":["BD"],"is_corresponding":false,"raw_author_name":"Yuxuan Lv","raw_affiliation_strings":["School of Integrated Circuit, Southeast University"],"affiliations":[{"raw_affiliation_string":"School of Integrated Circuit, Southeast University","institution_ids":["https://openalex.org/I4210090971"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5078788396","display_name":"Bruno da Silva","orcid":"https://orcid.org/0000-0002-4877-9688"},"institutions":[{"id":"https://openalex.org/I13469542","display_name":"Vrije Universiteit Brussel","ror":"https://ror.org/006e5kg04","country_code":"BE","type":"education","lineage":["https://openalex.org/I13469542"]}],"countries":["BE"],"is_corresponding":false,"raw_author_name":"Bruno Da Silva","raw_affiliation_strings":["ETRO, Vrije Universiteit Brussel"],"affiliations":[{"raw_affiliation_string":"ETRO, Vrije Universiteit Brussel","institution_ids":["https://openalex.org/I13469542"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5078395317","display_name":"Ming Ling","orcid":"https://orcid.org/0000-0002-8866-7189"},"institutions":[{"id":"https://openalex.org/I4210090971","display_name":"Southeast University","ror":"https://ror.org/00cf0ab87","country_code":"BD","type":"education","lineage":["https://openalex.org/I4210090971"]}],"countries":["BD"],"is_corresponding":false,"raw_author_name":"Ming Ling","raw_affiliation_strings":["School of Integrated Circuit, Southeast University"],"affiliations":[{"raw_affiliation_string":"School of Integrated Circuit, Southeast University","institution_ids":["https://openalex.org/I4210090971"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5087976552"],"corresponding_institution_ids":["https://openalex.org/I4210090971"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.3561207,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"9"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.2232999950647354,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.2232999950647354,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10502","display_name":"Advanced Memory and Neural Computing","score":0.12240000069141388,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.10400000214576721,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.7479000091552734},{"id":"https://openalex.org/keywords/field-programmable-gate-array","display_name":"Field-programmable gate array","score":0.6180999875068665},{"id":"https://openalex.org/keywords/dataflow","display_name":"Dataflow","score":0.5314000248908997},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.5246000289916992},{"id":"https://openalex.org/keywords/granularity","display_name":"Granularity","score":0.5097000002861023},{"id":"https://openalex.org/keywords/quantization","display_name":"Quantization (signal processing)","score":0.47769999504089355},{"id":"https://openalex.org/keywords/differential","display_name":"Differential (mechanical device)","score":0.38429999351501465},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.37599998712539673}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7922999858856201},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.7479000091552734},{"id":"https://openalex.org/C42935608","wikidata":"https://www.wikidata.org/wiki/Q190411","display_name":"Field-programmable gate array","level":2,"score":0.6180999875068665},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.5539000034332275},{"id":"https://openalex.org/C96324660","wikidata":"https://www.wikidata.org/wiki/Q205446","display_name":"Dataflow","level":2,"score":0.5314000248908997},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.5246000289916992},{"id":"https://openalex.org/C177774035","wikidata":"https://www.wikidata.org/wiki/Q1246948","display_name":"Granularity","level":2,"score":0.5097000002861023},{"id":"https://openalex.org/C28855332","wikidata":"https://www.wikidata.org/wiki/Q198099","display_name":"Quantization (signal processing)","level":2,"score":0.47769999504089355},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.4422000050544739},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.390500009059906},{"id":"https://openalex.org/C93226319","wikidata":"https://www.wikidata.org/wiki/Q193137","display_name":"Differential (mechanical device)","level":2,"score":0.38429999351501465},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.37599998712539673},{"id":"https://openalex.org/C459310","wikidata":"https://www.wikidata.org/wiki/Q117801","display_name":"Computational science","level":1,"score":0.3659999966621399},{"id":"https://openalex.org/C134835016","wikidata":"https://www.wikidata.org/wiki/Q690265","display_name":"Lookup table","level":2,"score":0.36070001125335693},{"id":"https://openalex.org/C74750220","wikidata":"https://www.wikidata.org/wiki/Q2662197","display_name":"Differential evolution","level":2,"score":0.35339999198913574},{"id":"https://openalex.org/C206588197","wikidata":"https://www.wikidata.org/wiki/Q846574","display_name":"Reuse","level":2,"score":0.32030001282691956},{"id":"https://openalex.org/C124584101","wikidata":"https://www.wikidata.org/wiki/Q1053266","display_name":"Multiplier (economics)","level":2,"score":0.31790000200271606},{"id":"https://openalex.org/C9390403","wikidata":"https://www.wikidata.org/wiki/Q3966","display_name":"Computer hardware","level":1,"score":0.3075999915599823},{"id":"https://openalex.org/C2778100165","wikidata":"https://www.wikidata.org/wiki/Q1589327","display_name":"Memory hierarchy","level":3,"score":0.29899999499320984},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.29809999465942383},{"id":"https://openalex.org/C2742236","wikidata":"https://www.wikidata.org/wiki/Q924713","display_name":"Efficient energy use","level":2,"score":0.290800005197525},{"id":"https://openalex.org/C133162039","wikidata":"https://www.wikidata.org/wiki/Q1061077","display_name":"Code generation","level":3,"score":0.28540000319480896},{"id":"https://openalex.org/C13164978","wikidata":"https://www.wikidata.org/wiki/Q600158","display_name":"Hardware acceleration","level":3,"score":0.26969999074935913},{"id":"https://openalex.org/C37724790","wikidata":"https://www.wikidata.org/wiki/Q210813","display_name":"Direct memory access","level":3,"score":0.26429998874664307},{"id":"https://openalex.org/C61483411","wikidata":"https://www.wikidata.org/wiki/Q3124522","display_name":"Data parallelism","level":3,"score":0.2506999969482422}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/iccad66269.2025.11240791","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iccad66269.2025.11240791","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/ACM International Conference On Computer Aided Design (ICCAD)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":26,"referenced_works":["https://openalex.org/W2108598243","https://openalex.org/W3172512547","https://openalex.org/W4211085526","https://openalex.org/W4308479898","https://openalex.org/W4312497550","https://openalex.org/W4312933868","https://openalex.org/W4321637080","https://openalex.org/W4360831816","https://openalex.org/W4385245566","https://openalex.org/W4386065704","https://openalex.org/W4386072096","https://openalex.org/W4389162736","https://openalex.org/W4390872297","https://openalex.org/W4390873332","https://openalex.org/W4390874074","https://openalex.org/W4393578753","https://openalex.org/W4393949386","https://openalex.org/W4401211860","https://openalex.org/W4402778510","https://openalex.org/W4403278953","https://openalex.org/W4407953477","https://openalex.org/W4408151358","https://openalex.org/W4409248360","https://openalex.org/W4409248685","https://openalex.org/W4409282453","https://openalex.org/W4413147202"],"related_works":[],"abstract_inverted_index":{"Diffusion":[0],"Transformer":[1],"(DiT)":[2],"models":[3],"have":[4],"shown":[5],"superior":[6],"generative":[7],"capabilities":[8],"in":[9,40,160,165,182],"image":[10],"and":[11,53,81,131,139,146,163,179,184],"video":[12],"synthesis,":[13],"yet":[14],"their":[15],"high":[16],"computational":[17,126],"cost":[18],"during":[19,128],"inference":[20,69],"remains":[21],"a":[22,29,102,133],"critical":[23],"bottleneck.":[24],"Temporal":[25],"differential":[26,71,89],"computation":[27,54,145],"offers":[28],"promising":[30],"solution":[31],"to":[32,46,124,142],"low-bit":[33,67,106],"quantization":[34,80],"by":[35,158],"exploiting":[36],"the":[37,61,75,144],"temporal":[38],"similarity":[39],"activations.":[41],"However,":[42],"applying":[43],"this":[44,56],"technique":[45],"DiT\u2019s":[47],"Attention":[48],"layers":[49],"introduces":[50],"substantial":[51],"memory":[52,111,147],"overheads.In":[55],"paper,":[57],"we":[58,84],"present":[59],"Diff-DiT,":[60],"first":[62],"FPGA":[63],"accelerator":[64],"designed":[65],"for":[66],"DiT":[68,79],"with":[70,120,170],"computation.":[72],"To":[73],"overcome":[74],"unique":[76],"challenges":[77],"of":[78],"hardware":[82],"acceleration,":[83],"propose:":[85],"(1)":[86],"an":[87,114],"approximated":[88],"attention":[90,96],"(ADA)":[91],"method":[92],"that":[93,152],"selectively":[94],"approximates":[95],"computations":[97],"across":[98],"time":[99],"steps":[100],"using":[101],"significance":[103],"score,":[104],"enabling":[105],"on-chip":[107],"execution":[108],"while":[109],"minimizing":[110],"overhead;":[112],"(2)":[113],"optimal":[115],"cross-cast":[116],"data":[117,122],"accessing":[118],"pattern":[119],"flexible":[121],"reuse":[123],"maximize":[125],"intensity":[127],"matrix":[129],"multiplications;":[130],"(3)":[132],"half-condition":[134],"splitting":[135],"(HCS)":[136],"dataflow":[137],"optimization":[138],"fine-grained":[140],"pipelining":[141],"reduce":[143],"access":[148],"latency.Extensive":[149],"experiments":[150],"show":[151],"Diff-DiT":[153,175],"outperforms":[154],"NVIDIA":[155],"V100":[156],"GPU":[157],"1.39\u00d7":[159],"end-to-end":[161],"throughput":[162,183],"5.60\u00d7":[164],"energy":[166,185],"efficiency.":[167],"When":[168],"compared":[169],"state-of-the-art":[171],"diffusion":[172],"model":[173],"accelerators,":[174],"also":[176],"achieves":[177],"2.81\u00d7":[178],"2.77\u00d7":[180],"improvements":[181],"efficiency,":[186],"respectively.":[187],"Code":[188],"is":[189],"available":[190],"on":[191],"GitHub<sup":[192],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[193],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">1</sup>.":[194]},"counts_by_year":[],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-11-20T00:00:00"}
