{"id":"https://openalex.org/W4387933214","doi":"https://doi.org/10.1145/3630108","title":"Fastensor: Optimise the Tensor I/O Path from SSD to GPU for Deep Learning Training","display_name":"Fastensor: Optimise the Tensor I/O Path from SSD to GPU for Deep Learning Training","publication_year":2023,"publication_date":"2023-10-25","ids":{"openalex":"https://openalex.org/W4387933214","doi":"https://doi.org/10.1145/3630108"},"language":"en","primary_location":{"id":"doi:10.1145/3630108","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3630108","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3630108","source":{"id":"https://openalex.org/S26056741","display_name":"ACM Transactions on Architecture and Code Optimization","issn_l":"1544-3566","issn":["1544-3566","1544-3973"],"is_oa":true,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Architecture and Code Optimization","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3630108","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5012128773","display_name":"Jia Wei","orcid":"https://orcid.org/0000-0002-2234-0378"},"institutions":[{"id":"https://openalex.org/I87445476","display_name":"Xi'an Jiaotong University","ror":"https://ror.org/017zhmm22","country_code":"CN","type":"education","lineage":["https://openalex.org/I87445476"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Jia Wei","raw_affiliation_strings":["Xi\u2019an Jiaotong University, China"],"raw_orcid":"https://orcid.org/0000-0002-2234-0378","affiliations":[{"raw_affiliation_string":"Xi\u2019an Jiaotong University, China","institution_ids":["https://openalex.org/I87445476"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5062123451","display_name":"Xingjun Zhang","orcid":"https://orcid.org/0000-0003-1434-7016"},"institutions":[{"id":"https://openalex.org/I87445476","display_name":"Xi'an Jiaotong University","ror":"https://ror.org/017zhmm22","country_code":"CN","type":"education","lineage":["https://openalex.org/I87445476"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xingjun Zhang","raw_affiliation_strings":["Xi\u2019an Jiaotong University, China"],"raw_orcid":"https://orcid.org/0000-0003-1434-7016","affiliations":[{"raw_affiliation_string":"Xi\u2019an Jiaotong University, China","institution_ids":["https://openalex.org/I87445476"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5078115749","display_name":"Longxiang Wang","orcid":"https://orcid.org/0000-0003-2005-114X"},"institutions":[{"id":"https://openalex.org/I87445476","display_name":"Xi'an Jiaotong University","ror":"https://ror.org/017zhmm22","country_code":"CN","type":"education","lineage":["https://openalex.org/I87445476"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Longxiang Wang","raw_affiliation_strings":["Xi\u2019an Jiaotong University, China"],"raw_orcid":"https://orcid.org/0000-0003-2005-114X","affiliations":[{"raw_affiliation_string":"Xi\u2019an Jiaotong University, China","institution_ids":["https://openalex.org/I87445476"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100611605","display_name":"Zheng Wei","orcid":"https://orcid.org/0000-0002-2293-5427"},"institutions":[{"id":"https://openalex.org/I87445476","display_name":"Xi'an Jiaotong University","ror":"https://ror.org/017zhmm22","country_code":"CN","type":"education","lineage":["https://openalex.org/I87445476"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zheng Wei","raw_affiliation_strings":["Xi\u2019an Jiaotong University, China"],"raw_orcid":"https://orcid.org/0000-0002-2293-5427","affiliations":[{"raw_affiliation_string":"Xi\u2019an Jiaotong University, China","institution_ids":["https://openalex.org/I87445476"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5012128773"],"corresponding_institution_ids":["https://openalex.org/I87445476"],"apc_list":null,"apc_paid":null,"fwci":0.683,"has_fulltext":true,"cited_by_count":6,"citation_normalized_percentile":{"value":0.7213525,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":94,"max":98},"biblio":{"volume":"20","issue":"4","first_page":"1","last_page":"25"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9944999814033508,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9944999814033508,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.991599977016449,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9871000051498413,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.88353431224823},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.5934661626815796},{"id":"https://openalex.org/keywords/tensor","display_name":"Tensor (intrinsic definition)","score":0.5266713500022888},{"id":"https://openalex.org/keywords/transfer-of-learning","display_name":"Transfer of learning","score":0.46588554978370667},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.4450760781764984},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.44454336166381836},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.43961015343666077},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.43064606189727783},{"id":"https://openalex.org/keywords/computer-engineering","display_name":"Computer engineering","score":0.35974210500717163}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.88353431224823},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.5934661626815796},{"id":"https://openalex.org/C155281189","wikidata":"https://www.wikidata.org/wiki/Q3518150","display_name":"Tensor (intrinsic definition)","level":2,"score":0.5266713500022888},{"id":"https://openalex.org/C150899416","wikidata":"https://www.wikidata.org/wiki/Q1820378","display_name":"Transfer of learning","level":2,"score":0.46588554978370667},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.4450760781764984},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.44454336166381836},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.43961015343666077},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.43064606189727783},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.35974210500717163},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C151730666","wikidata":"https://www.wikidata.org/wiki/Q7205","display_name":"Paleontology","level":1,"score":0.0},{"id":"https://openalex.org/C202444582","wikidata":"https://www.wikidata.org/wiki/Q837863","display_name":"Pure mathematics","level":1,"score":0.0},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3630108","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3630108","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3630108","source":{"id":"https://openalex.org/S26056741","display_name":"ACM Transactions on Architecture and Code Optimization","issn_l":"1544-3566","issn":["1544-3566","1544-3973"],"is_oa":true,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Architecture and Code Optimization","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1145/3630108","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3630108","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3630108","source":{"id":"https://openalex.org/S26056741","display_name":"ACM Transactions on Architecture and Code Optimization","issn_l":"1544-3566","issn":["1544-3566","1544-3973"],"is_oa":true,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Architecture and Code Optimization","raw_type":"journal-article"},"sustainable_development_goals":[{"score":0.9100000262260437,"id":"https://metadata.un.org/sdg/7","display_name":"Affordable and clean energy"}],"awards":[{"id":"https://openalex.org/G4983796525","display_name":null,"funder_award_id":"62172327","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4387933214.pdf","grobid_xml":"https://content.openalex.org/works/W4387933214.grobid-xml"},"referenced_works_count":35,"referenced_works":["https://openalex.org/W6908809","https://openalex.org/W2194775991","https://openalex.org/W2338908902","https://openalex.org/W2489529491","https://openalex.org/W2602856279","https://openalex.org/W2883283076","https://openalex.org/W2901994046","https://openalex.org/W2912574597","https://openalex.org/W2912759934","https://openalex.org/W2937574366","https://openalex.org/W2952370363","https://openalex.org/W2962739339","https://openalex.org/W2964137095","https://openalex.org/W2971843695","https://openalex.org/W2982083293","https://openalex.org/W3010830594","https://openalex.org/W3012479151","https://openalex.org/W3012514909","https://openalex.org/W3034429256","https://openalex.org/W3106772683","https://openalex.org/W3120086278","https://openalex.org/W3138303811","https://openalex.org/W3138516171","https://openalex.org/W3196732841","https://openalex.org/W3205803342","https://openalex.org/W3206867815","https://openalex.org/W3207014500","https://openalex.org/W3211937550","https://openalex.org/W4225854672","https://openalex.org/W4226012237","https://openalex.org/W4283079959","https://openalex.org/W4286374952","https://openalex.org/W4289533912","https://openalex.org/W4301361180","https://openalex.org/W6600284362"],"related_works":["https://openalex.org/W4206357785","https://openalex.org/W3192840557","https://openalex.org/W4281381188","https://openalex.org/W2951211570","https://openalex.org/W3167935049","https://openalex.org/W3023427754","https://openalex.org/W4375928479","https://openalex.org/W3131673289","https://openalex.org/W3198847674","https://openalex.org/W3103940333"],"abstract_inverted_index":{"In":[0,100],"recent":[1,48],"years,":[2],"benefiting":[3],"from":[4,41,161,175],"the":[5,56,118,132,151,158,185,197,216,228,233,241,268,299,307,319],"increase":[6,298],"in":[7,18,184,240,255],"model":[8,168,259,286],"size":[9,231,303],"and":[10,22,59,66,83,121,142,200,232,262,310],"complexity,":[11],"deep":[12,25,245],"learning":[13,26,246],"has":[14,50,60],"achieved":[15],"tremendous":[16],"success":[17],"computer":[19],"vision":[20],"(CV)":[21],"(NLP).":[23],"Training":[24],"models":[27],"using":[28,94],"accelerators":[29],"such":[30,63],"as":[31,64,80],"GPUs":[32],"often":[33,92],"requires":[34],"much":[35],"iterative":[36],"data":[37,53,76,115,127,134,141,154],"to":[38,44,71,195,226,280,318],"be":[39],"transferred":[40,93],"NVMe":[42,119],"SSD":[43],"GPU":[45,67],"memory.":[46],"Much":[47],"work":[49],"focused":[51],"on":[52,104],"transfer":[54,98,116,155,205,222,266],"during":[55,77,167],"pre-processing":[57],"phase":[58],"introduced":[61],"techniques":[62],"multiprocessing":[65],"Direct":[68],"Storage":[69],"(GDS)":[70],"accelerate":[72],"it.":[73],"However,":[74],"tensor":[75,114,126,145,160,211,221,229],"training":[78,188,301],"(such":[79],"Checkpoints,":[81],"logs,":[82],"intermediate":[84,263,292],"feature":[85,264,293],"maps),":[86],"which":[87],"is":[88,91,173,224,313],"also":[89,138],"time-consuming,":[90],"traditional":[95,133],"serial,":[96],"long-I/O-path":[97],"methods.":[99],"this":[101],"article,":[102],"based":[103],"GDS":[105],"technology,":[106],"we":[107,130],"built":[108],"Fastensor,":[109],"an":[110],"efficient":[111],"tool":[112,156,172],"for":[113,157,207,285,291],"between":[117],"SSDs":[120],"GPUs.":[122],"To":[123],"achieve":[124],"higher":[125],"I/O":[128,135,146,321],"throughput,":[129],"optimized":[131],"process.":[136],"We":[137,190,213,236,248],"proposed":[139],"a":[140,162,176,274],"runtime":[143,234],"context-aware":[144],"algorithm.":[147],"Fastensor":[148,239,251,272,296],"can":[149,297],"select":[150],"most":[152],"suitable":[153],"current":[159],"candidate":[163],"set":[164],"of":[165,203,210,219,258],"tools":[166,206,223],"training.":[169],"The":[170],"optimal":[171],"derived":[174],"dictionary":[177],"generated":[178],"by":[179,304,315],"our":[180],"adaptive":[181],"exploration":[182],"algorithm":[183],"first":[186],"few":[187],"iterations.":[189],"used":[191,284,290],"Fastensor\u2019s":[192],"unified":[193],"interface":[194],"test":[196],"read/write":[198],"bandwidth":[199],"energy":[201],"consumption":[202],"different":[204,208,220],"sizes":[209],"blocks.":[212],"found":[214],"that":[215,250],"execution":[217],"efficiency":[218],"related":[225],"both":[227],"block":[230],"context.":[235],"then":[237],"deployed":[238],"widely":[242],"applicable":[243],"Pytorch":[244],"framework.":[247],"showed":[249],"could":[252],"perform":[253],"superior":[254],"typical":[256],"scenarios":[257],"parameter":[260,287],"saving":[261],"map":[265,294],"with":[267],"same":[269],"hardware":[270],"configuration.":[271],"achieves":[273],"5.37x":[275],"read":[276,309],"performance":[277],"improvement":[278],"compared":[279,317],"torch.save":[281],"()":[282],"when":[283],"saving.":[288],"When":[289],"transfer,":[295],"supported":[300],"batch":[302],"20x,":[305],"while":[306],"total":[308],"write":[311],"speed":[312],"increased":[314],"2.96x":[316],"torch":[320],"API.":[322]},"counts_by_year":[{"year":2025,"cited_by_count":4},{"year":2024,"cited_by_count":2}],"updated_date":"2026-05-21T06:26:12.895304","created_date":"2025-10-10T00:00:00"}
