{"id":"https://openalex.org/W7131668068","doi":"https://doi.org/10.48550/arxiv.2602.21897","title":"A task-based data-flow methodology for programming heterogeneous systems with multiple accelerator APIs","display_name":"A task-based data-flow methodology for programming heterogeneous systems with multiple accelerator APIs","publication_year":2026,"publication_date":"2026-02-25","ids":{"openalex":"https://openalex.org/W7131668068","doi":"https://doi.org/10.48550/arxiv.2602.21897"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.21897","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5123779112","display_name":"Aleix Bon\u00e9","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Bon\u00e9, Aleix","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5073092626","display_name":"Alejandro Aguirre","orcid":"https://orcid.org/0000-0001-6746-2734"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Aguirre, Alejandro","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126880210","display_name":"David \u00c1lvarez","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"\u00c1lvarez, David","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123768656","display_name":"Pedro J. Martinez-Ferrer","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Martinez-Ferrer, Pedro J.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5123748199","display_name":"Vicen\u00e7 Beltran","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Beltran, Vicen\u00e7","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5123779112"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9039999842643738,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9039999842643738,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10904","display_name":"Embedded Systems Design Techniques","score":0.01940000057220459,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10126","display_name":"Logic, programming, and type systems","score":0.015699999406933784,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/symmetric-multiprocessor-system","display_name":"Symmetric multiprocessor system","score":0.47429999709129333},{"id":"https://openalex.org/keywords/programming-paradigm","display_name":"Programming paradigm","score":0.47290000319480896},{"id":"https://openalex.org/keywords/cuda","display_name":"CUDA","score":0.4505999982357025},{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.44029998779296875},{"id":"https://openalex.org/keywords/constraint-programming","display_name":"Constraint programming","score":0.4043999910354614},{"id":"https://openalex.org/keywords/thread","display_name":"Thread (computing)","score":0.40149998664855957},{"id":"https://openalex.org/keywords/reuse","display_name":"Reuse","score":0.3797999918460846},{"id":"https://openalex.org/keywords/synchronization","display_name":"Synchronization (alternating current)","score":0.37720000743865967},{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.3544999957084656},{"id":"https://openalex.org/keywords/extensibility","display_name":"Extensibility","score":0.3474000096321106}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8019000291824341},{"id":"https://openalex.org/C172430144","wikidata":"https://www.wikidata.org/wiki/Q17111997","display_name":"Symmetric multiprocessor system","level":2,"score":0.47429999709129333},{"id":"https://openalex.org/C34165917","wikidata":"https://www.wikidata.org/wiki/Q188267","display_name":"Programming paradigm","level":2,"score":0.47290000319480896},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.47130000591278076},{"id":"https://openalex.org/C2778119891","wikidata":"https://www.wikidata.org/wiki/Q477690","display_name":"CUDA","level":2,"score":0.4505999982357025},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.44029998779296875},{"id":"https://openalex.org/C173404611","wikidata":"https://www.wikidata.org/wiki/Q528588","display_name":"Constraint programming","level":3,"score":0.4043999910354614},{"id":"https://openalex.org/C138101251","wikidata":"https://www.wikidata.org/wiki/Q213092","display_name":"Thread (computing)","level":2,"score":0.40149998664855957},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.3898000121116638},{"id":"https://openalex.org/C206588197","wikidata":"https://www.wikidata.org/wiki/Q846574","display_name":"Reuse","level":2,"score":0.3797999918460846},{"id":"https://openalex.org/C2778562939","wikidata":"https://www.wikidata.org/wiki/Q1298791","display_name":"Synchronization (alternating current)","level":3,"score":0.37720000743865967},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.35989999771118164},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.3544999957084656},{"id":"https://openalex.org/C32833848","wikidata":"https://www.wikidata.org/wiki/Q4115054","display_name":"Extensibility","level":2,"score":0.3474000096321106},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.33899998664855957},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.336899995803833},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.3353999853134155},{"id":"https://openalex.org/C99613125","wikidata":"https://www.wikidata.org/wiki/Q165194","display_name":"Application programming interface","level":2,"score":0.32190001010894775},{"id":"https://openalex.org/C13164978","wikidata":"https://www.wikidata.org/wiki/Q600158","display_name":"Hardware acceleration","level":3,"score":0.3172999918460846},{"id":"https://openalex.org/C126831891","wikidata":"https://www.wikidata.org/wiki/Q221673","display_name":"Host (biology)","level":2,"score":0.3091999888420105},{"id":"https://openalex.org/C86111242","wikidata":"https://www.wikidata.org/wiki/Q859595","display_name":"Coprocessor","level":2,"score":0.30660000443458557},{"id":"https://openalex.org/C132525143","wikidata":"https://www.wikidata.org/wiki/Q141488","display_name":"Graph","level":2,"score":0.29159998893737793},{"id":"https://openalex.org/C55166926","wikidata":"https://www.wikidata.org/wiki/Q2892946","display_name":"Oracle","level":2,"score":0.2906000018119812},{"id":"https://openalex.org/C2777338717","wikidata":"https://www.wikidata.org/wiki/Q1762621","display_name":"Vendor","level":2,"score":0.28940001130104065},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.28369998931884766},{"id":"https://openalex.org/C2780870223","wikidata":"https://www.wikidata.org/wiki/Q1004415","display_name":"Runtime system","level":2,"score":0.2809000015258789},{"id":"https://openalex.org/C50630238","wikidata":"https://www.wikidata.org/wiki/Q971505","display_name":"General-purpose computing on graphics processing units","level":3,"score":0.2786000072956085},{"id":"https://openalex.org/C74197172","wikidata":"https://www.wikidata.org/wiki/Q1195339","display_name":"Directed acyclic graph","level":2,"score":0.27399998903274536},{"id":"https://openalex.org/C193702766","wikidata":"https://www.wikidata.org/wiki/Q1414548","display_name":"Concurrency","level":2,"score":0.2728999853134155},{"id":"https://openalex.org/C78766204","wikidata":"https://www.wikidata.org/wiki/Q555032","display_name":"Multi-core processor","level":2,"score":0.27090001106262207},{"id":"https://openalex.org/C3701939","wikidata":"https://www.wikidata.org/wiki/Q5159109","display_name":"Concurrent object-oriented programming","level":5,"score":0.26739999651908875},{"id":"https://openalex.org/C162319229","wikidata":"https://www.wikidata.org/wiki/Q175263","display_name":"Data structure","level":2,"score":0.2660999894142151},{"id":"https://openalex.org/C67953723","wikidata":"https://www.wikidata.org/wiki/Q192525","display_name":"Workstation","level":2,"score":0.25279998779296875},{"id":"https://openalex.org/C2776834041","wikidata":"https://www.wikidata.org/wiki/Q25346349","display_name":"Execution model","level":2,"score":0.250900000333786}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.21897","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.21897","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.21897","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.21897","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/9","display_name":"Industry, innovation and infrastructure","score":0.6628544330596924}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Heterogeneous":[0],"nodes":[1,233],"that":[2,203,241],"combine":[3],"multi-core":[4,160],"CPUs":[5],"with":[6,45,86,207],"diverse":[7],"accelerators":[8],"are":[9,115],"rapidly":[10],"becoming":[11],"the":[12,96,107,158,181,195,208],"norm":[13],"in":[14],"both":[15],"high-performance":[16],"computing":[17],"(HPC)":[18],"and":[19,36,52,64,76,94,126,139,169,184,222,234,250],"AI":[20,251],"infrastructures.":[21],"Exploiting":[22],"these":[23,92],"platforms,":[24],"however,":[25],"requires":[26],"orchestrating":[27],"several":[28],"low-level":[29],"accelerator":[30,101,147,218],"APIs":[31,88],"such":[32],"as":[33,117],"CUDA,":[34],"SYCL,":[35],"Triton.":[37],"In":[38],"some":[39],"occasions":[40],"they":[41,162],"can":[42],"be":[43],"combined":[44],"optimized":[46],"vendor":[47],"math":[48],"libraries:":[49],"e.g.,":[50],"cuBLAS":[51],"oneAPI.":[53],"Each":[54],"API":[55],"or":[56],"library":[57],"introduces":[58],"its":[59],"own":[60],"abstractions,":[61],"execution":[62],"semantics,":[63],"synchronization":[65],"mechanisms.":[66],"Combining":[67],"them":[68],"within":[69],"a":[70,81,118,191,212],"single":[71,213],"application":[72,214],"is":[73,227,235],"therefore":[74],"error-prone":[75],"labor-intensive.":[77],"We":[78,134],"propose":[79],"reusing":[80],"task-based":[82],"data-flow":[83],"methodology":[84,226],"together":[85],"Task-Aware":[87,136,141],"(TA-libs)":[89],"to":[90,149,167,187,215,230,238],"overcome":[91],"limitations":[93],"facilitate":[95],"seamless":[97],"integration":[98],"of":[99,123,194,246],"multiple":[100,153,217],"programming":[102,219],"models,":[103],"while":[104],"still":[105],"leveraging":[106],"best-in-class":[108],"kernels":[109,128],"offered":[110],"by":[111,130],"each":[112],"API.":[113],"Applications":[114],"expressed":[116],"directed":[119],"acyclic":[120],"graph":[121],"(DAG)":[122],"host":[124],"tasks":[125],"device":[127],"managed":[129],"an":[131],"OpenMP/OmpSs-2":[132],"runtime.":[133,199],"introduce":[135],"SYCL":[137],"(TASYCL)":[138],"leverage":[140],"CUDA":[142],"(TACUDA),":[143],"which":[144,188],"elevate":[145],"individual":[146],"invocations":[148],"first-class":[150],"tasks.":[151],"When":[152],"native":[154],"runtimes":[155],"coexist":[156],"on":[157],"same":[159],"CPU,":[161],"contend":[163],"for":[164],"threads,":[165],"leading":[166],"oversubscription":[168],"performance":[170],"variability.":[171],"To":[172],"address":[173],"this,":[174],"we":[175,189],"unify":[176],"their":[177],"thread":[178],"management":[179],"under":[180],"nOS-V":[182,209],"tasking":[183],"threading":[185],"library,":[186,210],"contribute":[190],"new":[192],"port":[193],"PoCL":[196],"(Portable":[197],"OpenCL)":[198],"These":[200],"results":[201],"demonstrate":[202],"task-aware":[204],"libraries,":[205],"coupled":[206],"enable":[211],"harness":[216],"models":[220],"transparently":[221],"efficiently.":[223],"The":[224],"proposed":[225],"immediately":[228],"applicable":[229],"current":[231],"heterogeneous":[232],"readily":[236],"extensible":[237],"future":[239],"systems":[240],"integrate":[242],"even":[243],"richer":[244],"combinations":[245],"CPUs,":[247],"GPUs,":[248],"FPGAs,":[249],"accelerators.":[252]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-02-27T00:00:00"}
