{"id":"https://openalex.org/W7138123567","doi":"https://doi.org/10.48550/arxiv.2603.14785","title":"SkipOPU: An FPGA-based Overlay Processor for Large Language Models with Dynamically Allocated Computation","display_name":"SkipOPU: An FPGA-based Overlay Processor for Large Language Models with Dynamically Allocated Computation","publication_year":2026,"publication_date":"2026-03-16","ids":{"openalex":"https://openalex.org/W7138123567","doi":"https://doi.org/10.48550/arxiv.2603.14785"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.14785","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.14785","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.14785","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5129670053","display_name":"Zicheng He","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"He, Zicheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123911099","display_name":"Anhao Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Anhao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129646285","display_name":"Xiaoyu Shen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shen, Xiaoyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129672370","display_name":"Chen Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Chen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5129702649","display_name":"Lei He","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"He, Lei","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5129670053"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.18369999527931213,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.18369999527931213,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.1647000014781952,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.09290000051259995,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/bottleneck","display_name":"Bottleneck","score":0.7533000111579895},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.6362000107765198},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.4941999912261963},{"id":"https://openalex.org/keywords/overhead","display_name":"Overhead (engineering)","score":0.4693000018596649},{"id":"https://openalex.org/keywords/memory-bandwidth","display_name":"Memory bandwidth","score":0.45590001344680786},{"id":"https://openalex.org/keywords/exploit","display_name":"Exploit","score":0.44850000739097595},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.4438000023365021},{"id":"https://openalex.org/keywords/bandwidth","display_name":"Bandwidth (computing)","score":0.44350001215934753},{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.4020000100135803},{"id":"https://openalex.org/keywords/matrix-multiplication","display_name":"Matrix multiplication","score":0.3917999863624573}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.843500018119812},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.7533000111579895},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.6362000107765198},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.4941999912261963},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.48980000615119934},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.4745999872684479},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.4693000018596649},{"id":"https://openalex.org/C188045654","wikidata":"https://www.wikidata.org/wiki/Q17148339","display_name":"Memory bandwidth","level":2,"score":0.45590001344680786},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.44850000739097595},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.4438000023365021},{"id":"https://openalex.org/C2776257435","wikidata":"https://www.wikidata.org/wiki/Q1576430","display_name":"Bandwidth (computing)","level":2,"score":0.44350001215934753},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.4020000100135803},{"id":"https://openalex.org/C17349429","wikidata":"https://www.wikidata.org/wiki/Q1049914","display_name":"Matrix multiplication","level":3,"score":0.3917999863624573},{"id":"https://openalex.org/C2780598303","wikidata":"https://www.wikidata.org/wiki/Q65921492","display_name":"Flexibility (engineering)","level":2,"score":0.3856000006198883},{"id":"https://openalex.org/C42935608","wikidata":"https://www.wikidata.org/wiki/Q190411","display_name":"Field-programmable gate array","level":2,"score":0.382099986076355},{"id":"https://openalex.org/C136085584","wikidata":"https://www.wikidata.org/wiki/Q910289","display_name":"Overlay","level":2,"score":0.36309999227523804},{"id":"https://openalex.org/C2781041963","wikidata":"https://www.wikidata.org/wiki/Q18348618","display_name":"Computation offloading","level":4,"score":0.362199991941452},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.3538999855518341},{"id":"https://openalex.org/C27602214","wikidata":"https://www.wikidata.org/wiki/Q1868547","display_name":"Locality of reference","level":3,"score":0.34610000252723694},{"id":"https://openalex.org/C111335779","wikidata":"https://www.wikidata.org/wiki/Q3454686","display_name":"Reduction (mathematics)","level":2,"score":0.32519999146461487},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.3246000111103058},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.32010000944137573},{"id":"https://openalex.org/C152124472","wikidata":"https://www.wikidata.org/wiki/Q1204361","display_name":"Redundancy (engineering)","level":2,"score":0.3158000111579895},{"id":"https://openalex.org/C115874739","wikidata":"https://www.wikidata.org/wiki/Q825377","display_name":"Critical path method","level":2,"score":0.3138999938964844},{"id":"https://openalex.org/C78766204","wikidata":"https://www.wikidata.org/wiki/Q555032","display_name":"Multi-core processor","level":2,"score":0.3091999888420105},{"id":"https://openalex.org/C9390403","wikidata":"https://www.wikidata.org/wiki/Q3966","display_name":"Computer hardware","level":1,"score":0.30329999327659607},{"id":"https://openalex.org/C84462506","wikidata":"https://www.wikidata.org/wiki/Q173142","display_name":"Digital signal processing","level":2,"score":0.29420000314712524},{"id":"https://openalex.org/C188198153","wikidata":"https://www.wikidata.org/wiki/Q1613840","display_name":"Limiting","level":2,"score":0.28760001063346863},{"id":"https://openalex.org/C2780945871","wikidata":"https://www.wikidata.org/wiki/Q194274","display_name":"Backup","level":2,"score":0.2874999940395355},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.2777999937534332},{"id":"https://openalex.org/C74172769","wikidata":"https://www.wikidata.org/wiki/Q1446839","display_name":"Routing (electronic design automation)","level":2,"score":0.2563999891281128},{"id":"https://openalex.org/C184596265","wikidata":"https://www.wikidata.org/wiki/Q2651576","display_name":"Model of computation","level":3,"score":0.2549999952316284},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.2531000077724457},{"id":"https://openalex.org/C93996380","wikidata":"https://www.wikidata.org/wiki/Q44127","display_name":"Server","level":2,"score":0.25290000438690186},{"id":"https://openalex.org/C526435321","wikidata":"https://www.wikidata.org/wiki/Q1303814","display_name":"Processor design","level":2,"score":0.25279998779296875},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.2500999867916107}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.14785","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.14785","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.14785","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.14785","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Large":[0],"language":[1],"models":[2],"(LLMs)":[3],"have":[4],"achieved":[5],"remarkable":[6],"performance":[7],"across":[8,96,248],"a":[9,19,104,156,183],"wide":[10],"range":[11],"of":[12,42,194],"tasks,":[13],"but":[14],"their":[15,73],"inference":[16,80,234],"efficiency":[17,231],"remains":[18],"critical":[20],"bottleneck":[21],"due":[22],"to":[23,75,127,147,172,243],"rapidly":[24],"growing":[25],"parameters.":[26],"Recent":[27],"advances":[28],"in":[29,116,229],"dynamic":[30,79,236],"computation":[31,54,95,115,237],"allocation":[32,238],"address":[33],"this":[34,83],"challenge":[35],"by":[36,144,227],"exploiting":[37],"the":[38],"highly":[39],"uneven":[40],"contributions":[41],"different":[43],"tokens":[44,97],"and":[45,63,98,119,152,203,223,239],"layers,":[46],"enabling":[47],"selective":[48],"execution":[49],"that":[50,92,159,189,214],"significantly":[51],"reduces":[52],"redundant":[53],"while":[55,176],"preserving":[56],"model":[57],"accuracy.":[58],"However,":[59],"existing":[60],"hardware":[61,174],"platforms":[62],"accelerators":[64,226],"are":[65],"primarily":[66],"optimized":[67],"for":[68,138,232],"uniform,":[69],"static":[70],"execution,":[71],"limiting":[72],"ability":[74],"efficiently":[76,160],"support":[77],"such":[78],"patterns.":[81],"In":[82],"work,":[84],"we":[85,109,154,181],"propose":[86],"SkipOPU,":[87],"an":[88,217],"FPGA-based":[89,225],"overlay":[90],"processor":[91],"dynamically":[93],"allocates":[94],"layers":[99],"with":[100,130,235],"high":[101],"flexibility":[102],"through":[103,207],"lightweight":[105],"routing":[106],"mechanism.":[107],"First,":[108],"decouple":[110],"reduction":[111],"operations":[112,133],"from":[113],"element-wise":[114],"nonlinear":[117],"modules":[118],"perform":[120],"reductions":[121],"incrementally,":[122],"which":[123],"enables":[124],"both":[125],"stages":[126],"be":[128],"fused":[129],"adjacent":[131],"linear":[132],"(router":[134],"or":[135],"matrix":[136],"multiplication)":[137],"effective":[139],"latency":[140],"hiding.":[141],"Second,":[142],"motivated":[143],"asymmetric":[145],"sensitivity":[146],"numerical":[148],"precision":[149],"between":[150],"activation":[151],"weight,":[153],"design":[155],"PE":[157],"array":[158],"supports":[161],"float-fixed":[162],"hybrid":[163],"execution.":[164],"A":[165],"novel":[166],"DSP":[167],"overpacking":[168],"technique":[169],"is":[170],"introduced":[171],"maximize":[173],"utilization":[175],"minimizing":[177],"resource":[178],"overhead.":[179],"Finally,":[180],"develop":[182],"proactive":[184],"on-chip":[185,209],"KV":[186,192,245],"history":[187],"buffer":[188],"exploits":[190],"cross-layer":[191],"invariance":[193],"pruned":[195],"tokens,":[196],"eliminating":[197],"irregular":[198],"HBM":[199],"accesses":[200],"during":[201],"decoding":[202],"supplementing":[204],"off-chip":[205],"bandwidth":[206,230],"high-locality":[208],"reuse.":[210],"Experimental":[211],"results":[212],"demonstrate":[213],"SkipOPU":[215],"on":[216],"AMD":[218],"U280":[219],"FPGA":[220],"outperforms":[221],"GPU":[222],"other":[224],"1.23x-3.83x":[228],"LLMs":[233],"can":[240],"reduce":[241],"up":[242],"25.4%":[244],"storage":[246],"overhead":[247],"varying":[249],"sequence":[250],"lengths.":[251]},"counts_by_year":[],"updated_date":"2026-03-18T06:31:55.123368","created_date":"2026-03-18T00:00:00"}
