{"id":"https://openalex.org/W7161687244","doi":"https://doi.org/10.48550/arxiv.2605.18753","title":"DashAttention: Differentiable and Adaptive Sparse Hierarchical Attention","display_name":"DashAttention: Differentiable and Adaptive Sparse Hierarchical Attention","publication_year":2026,"publication_date":"2026-05-18","ids":{"openalex":"https://openalex.org/W7161687244","doi":"https://doi.org/10.48550/arxiv.2605.18753"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.18753","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.18753","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.18753","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5136483432","display_name":"Yuxiang Huang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Yuxiang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136475217","display_name":"Nuno M. T. Gon\u00e7alves","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gon\u00e7alves, Nuno M. T.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5107179044","display_name":"Federico Alvetreti","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Alvetreti, Federico","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136476247","display_name":"Lei Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Lei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136473240","display_name":"Xu Han","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Han, Xu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124593149","display_name":"Edoardo M. Ponti","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ponti, Edoardo M.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5136495692","display_name":"Andr\u00e9 F. T. Martins","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Martins, Andr\u00e9 F. T.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5136475772","display_name":"Marcos V. Treviso","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Treviso, Marcos V.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.14390000700950623,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.14390000700950623,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13702","display_name":"Machine Learning in Healthcare","score":0.13210000097751617,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.11410000175237656,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/softmax-function","display_name":"Softmax function","score":0.6704999804496765},{"id":"https://openalex.org/keywords/hierarchy","display_name":"Hierarchy","score":0.6100000143051147},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.546999990940094},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.49140000343322754},{"id":"https://openalex.org/keywords/transformation","display_name":"Transformation (genetics)","score":0.4133000075817108},{"id":"https://openalex.org/keywords/pareto-principle","display_name":"Pareto principle","score":0.40299999713897705},{"id":"https://openalex.org/keywords/variable","display_name":"Variable (mathematics)","score":0.3781999945640564},{"id":"https://openalex.org/keywords/current","display_name":"Current (fluid)","score":0.35260000824928284}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7670000195503235},{"id":"https://openalex.org/C188441871","wikidata":"https://www.wikidata.org/wiki/Q7554146","display_name":"Softmax function","level":3,"score":0.6704999804496765},{"id":"https://openalex.org/C31170391","wikidata":"https://www.wikidata.org/wiki/Q188619","display_name":"Hierarchy","level":2,"score":0.6100000143051147},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.546999990940094},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5091999769210815},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.49140000343322754},{"id":"https://openalex.org/C204241405","wikidata":"https://www.wikidata.org/wiki/Q461499","display_name":"Transformation (genetics)","level":3,"score":0.4133000075817108},{"id":"https://openalex.org/C137635306","wikidata":"https://www.wikidata.org/wiki/Q182667","display_name":"Pareto principle","level":2,"score":0.40299999713897705},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.39989998936653137},{"id":"https://openalex.org/C182365436","wikidata":"https://www.wikidata.org/wiki/Q50701","display_name":"Variable (mathematics)","level":2,"score":0.3781999945640564},{"id":"https://openalex.org/C148043351","wikidata":"https://www.wikidata.org/wiki/Q4456944","display_name":"Current (fluid)","level":2,"score":0.35260000824928284},{"id":"https://openalex.org/C144986985","wikidata":"https://www.wikidata.org/wiki/Q871236","display_name":"Hierarchical database model","level":2,"score":0.33219999074935913},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.31369999051094055},{"id":"https://openalex.org/C153258448","wikidata":"https://www.wikidata.org/wiki/Q1199743","display_name":"Gradient descent","level":3,"score":0.3124000132083893},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.30720001459121704},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.29809999465942383},{"id":"https://openalex.org/C56372850","wikidata":"https://www.wikidata.org/wiki/Q1050404","display_name":"Sparse matrix","level":3,"score":0.28780001401901245},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.27799999713897705},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.27410000562667847},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.26759999990463257},{"id":"https://openalex.org/C202615002","wikidata":"https://www.wikidata.org/wiki/Q783507","display_name":"Differentiable function","level":2,"score":0.2565999925136566},{"id":"https://openalex.org/C171686336","wikidata":"https://www.wikidata.org/wiki/Q3532085","display_name":"Topic model","level":2,"score":0.250900000333786}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.18753","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.18753","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.18753","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.18753","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Current":[0],"hierarchical":[1,113],"attention":[2,19,26,114,142],"methods,":[3,115],"such":[4],"as":[5,140],"NSA":[6,152],"and":[7,21,46,55,65,146,153],"InfLLMv2,":[8,154],"select":[9,78],"the":[10,28,32,36,49,53,72,86,90,100,105],"top-k":[11,33],"relevant":[12,39],"key-value":[13],"(KV)":[14],"blocks":[15,83],"based":[16],"on":[17,27],"coarse":[18],"scores":[20],"subsequently":[22],"apply":[23],"fine-grained":[24],"softmax":[25,102],"selected":[29],"tokens.":[30],"However,":[31],"operation":[34],"assumes":[35],"number":[37,81],"of":[38,82,166,174],"tokens":[40],"for":[41,99],"any":[42],"query":[43,88],"is":[44,120],"fixed":[45],"it":[47],"precludes":[48],"gradient":[50],"flow":[51],"between":[52],"sparse":[54,74],"dense":[56],"stages.":[57],"In":[58],"this":[59],"work,":[60],"we":[61,116],"propose":[62],"DashAttention":[63,119,136,167,183],"(Differentiable":[64],"Adaptive":[66],"Sparse":[67],"Hierarchical":[68],"Attention),":[69],"which":[70,170],"leverages":[71],"adaptively":[73],"$\u03b1$-entmax":[75],"transformation":[76],"to":[77,85,111,123,176,188],"a":[79,97,147,172,185],"variable":[80],"according":[84],"current":[87],"in":[89,94,156,168],"first":[91],"stage.":[92],"This":[93],"turn":[95],"provides":[96],"prior":[98],"second-stage":[101],"attention,":[103],"keeping":[104],"entire":[106],"hierarchy":[107],"fully":[108],"differentiable.":[109],"Contrary":[110],"other":[112],"show":[117,134],"that":[118,135],"non-dispersive,":[121],"translating":[122],"better":[124,148],"long-context":[125],"modeling":[126],"ability.":[127],"Experiments":[128],"with":[129,143],"large":[130],"language":[131],"models":[132],"(LLMs)":[133],"achieves":[137,171],"comparable":[138],"accuracy":[139],"full":[141],"75%":[144],"sparsity":[145],"Pareto":[149],"frontier":[150],"than":[151],"especially":[155],"high-sparsity":[157],"regimes.":[158],"We":[159],"also":[160],"provide":[161],"an":[162],"efficient,":[163],"GPU-aware":[164],"implementation":[165],"Triton,":[169],"speedup":[173],"up":[175],"over":[177],"FlashAttention-3":[178],"at":[179],"inference":[180],"time.":[181],"Overall,":[182],"offers":[184],"cost-effective":[186],"strategy":[187],"model":[189],"long":[190],"contexts.":[191]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-20T00:00:00"}
