{"id":"https://openalex.org/W4417071251","doi":"https://doi.org/10.1145/3769780","title":"DiskJoin: Large-scale Vector Similarity Join with SSD","display_name":"DiskJoin: Large-scale Vector Similarity Join with SSD","publication_year":2025,"publication_date":"2025-12-04","ids":{"openalex":"https://openalex.org/W4417071251","doi":"https://doi.org/10.1145/3769780"},"language":"en","primary_location":{"id":"doi:10.1145/3769780","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3769780","pdf_url":null,"source":{"id":"https://openalex.org/S4387289859","display_name":"Proceedings of the ACM on Management of Data","issn_l":"2836-6573","issn":["2836-6573"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM on Management of Data","raw_type":"journal-article"},"type":"article","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2508.18494","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5032367624","display_name":"Yanqi Chen","orcid":"https://orcid.org/0009-0001-9494-4947"},"institutions":[{"id":"https://openalex.org/I24603500","display_name":"University of Massachusetts Amherst","ror":"https://ror.org/0072zz521","country_code":"US","type":"education","lineage":["https://openalex.org/I24603500"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Yanqi Chen","raw_affiliation_strings":["University of Massachusetts Amherst, Amherst, MA, USA"],"affiliations":[{"raw_affiliation_string":"University of Massachusetts Amherst, Amherst, MA, USA","institution_ids":["https://openalex.org/I24603500"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100367774","display_name":"Xiao Yan","orcid":"https://orcid.org/0000-0002-2122-915X"},"institutions":[{"id":"https://openalex.org/I37461747","display_name":"Wuhan University","ror":"https://ror.org/033vjfk17","country_code":"CN","type":"education","lineage":["https://openalex.org/I37461747"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiao Yan","raw_affiliation_strings":["Institute for Math &amp; AI, Wuhan, Wuhan University, Wuhan, China"],"affiliations":[{"raw_affiliation_string":"Institute for Math &amp; AI, Wuhan, Wuhan University, Wuhan, China","institution_ids":["https://openalex.org/I37461747"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5019048013","display_name":"Alexandra Meliou","orcid":"https://orcid.org/0000-0001-7346-6002"},"institutions":[{"id":"https://openalex.org/I24603500","display_name":"University of Massachusetts Amherst","ror":"https://ror.org/0072zz521","country_code":"US","type":"education","lineage":["https://openalex.org/I24603500"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Alexandra Meliou","raw_affiliation_strings":["University of Massachusetts Amherst, Amherst, MA, USA and Archimedes/Athena RC, Marousi, Greece"],"affiliations":[{"raw_affiliation_string":"University of Massachusetts Amherst, Amherst, MA, USA and Archimedes/Athena RC, Marousi, Greece","institution_ids":["https://openalex.org/I24603500"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5011148671","display_name":"Eric Lo","orcid":"https://orcid.org/0000-0003-2679-3945"},"institutions":[{"id":"https://openalex.org/I177725633","display_name":"Chinese University of Hong Kong","ror":"https://ror.org/00t33hh48","country_code":"HK","type":"education","lineage":["https://openalex.org/I177725633"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Eric Lo","raw_affiliation_strings":["The Chinese University of Hong Kong, Hong Kong, China"],"affiliations":[{"raw_affiliation_string":"The Chinese University of Hong Kong, Hong Kong, China","institution_ids":["https://openalex.org/I177725633"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5032367624"],"corresponding_institution_ids":["https://openalex.org/I24603500"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.43149804,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"3","issue":"6","first_page":"1","last_page":"27"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.41440001130104065,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.41440001130104065,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.11890000104904175,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.07639999687671661,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/cache","display_name":"Cache","score":0.7131999731063843},{"id":"https://openalex.org/keywords/join","display_name":"Join (topology)","score":0.5737000107765198},{"id":"https://openalex.org/keywords/pruning","display_name":"Pruning","score":0.5340999960899353},{"id":"https://openalex.org/keywords/probabilistic-logic","display_name":"Probabilistic logic","score":0.48399999737739563},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.47769999504089355},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.4471000134944916},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.4399000108242035},{"id":"https://openalex.org/keywords/cache-algorithms","display_name":"Cache algorithms","score":0.39969998598098755},{"id":"https://openalex.org/keywords/support-vector-machine","display_name":"Support vector machine","score":0.3855000138282776}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8493000268936157},{"id":"https://openalex.org/C115537543","wikidata":"https://www.wikidata.org/wiki/Q165596","display_name":"Cache","level":2,"score":0.7131999731063843},{"id":"https://openalex.org/C2776124973","wikidata":"https://www.wikidata.org/wiki/Q3183033","display_name":"Join (topology)","level":2,"score":0.5737000107765198},{"id":"https://openalex.org/C108010975","wikidata":"https://www.wikidata.org/wiki/Q500094","display_name":"Pruning","level":2,"score":0.5340999960899353},{"id":"https://openalex.org/C49937458","wikidata":"https://www.wikidata.org/wiki/Q2599292","display_name":"Probabilistic logic","level":2,"score":0.48399999737739563},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.48100000619888306},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.47769999504089355},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.4471000134944916},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.4399000108242035},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.4259999990463257},{"id":"https://openalex.org/C38556500","wikidata":"https://www.wikidata.org/wiki/Q13404475","display_name":"Cache algorithms","level":4,"score":0.39969998598098755},{"id":"https://openalex.org/C12267149","wikidata":"https://www.wikidata.org/wiki/Q282453","display_name":"Support vector machine","level":2,"score":0.3855000138282776},{"id":"https://openalex.org/C189783530","wikidata":"https://www.wikidata.org/wiki/Q352090","display_name":"CPU cache","level":3,"score":0.3149000108242035},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.3043000102043152},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.2978000044822693},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.2973000109195709},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.29100000858306885},{"id":"https://openalex.org/C202623185","wikidata":"https://www.wikidata.org/wiki/Q375176","display_name":"Disk buffer","level":3,"score":0.2784999907016754},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.27570000290870667},{"id":"https://openalex.org/C36340418","wikidata":"https://www.wikidata.org/wiki/Q7124288","display_name":"Page cache","level":5,"score":0.266400009393692},{"id":"https://openalex.org/C62611344","wikidata":"https://www.wikidata.org/wiki/Q1062658","display_name":"Node (physics)","level":2,"score":0.2655999958515167},{"id":"https://openalex.org/C162319229","wikidata":"https://www.wikidata.org/wiki/Q175263","display_name":"Data structure","level":2,"score":0.2624000012874603},{"id":"https://openalex.org/C2777851325","wikidata":"https://www.wikidata.org/wiki/Q7094102","display_name":"Online model","level":2,"score":0.25769999623298645},{"id":"https://openalex.org/C113174947","wikidata":"https://www.wikidata.org/wiki/Q2859736","display_name":"Tree (set theory)","level":2,"score":0.257099986076355},{"id":"https://openalex.org/C188805328","wikidata":"https://www.wikidata.org/wiki/Q4060691","display_name":"Hash join","level":3,"score":0.25290000438690186},{"id":"https://openalex.org/C2778692605","wikidata":"https://www.wikidata.org/wiki/Q4041866","display_name":"Joins","level":2,"score":0.2524999976158142}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1145/3769780","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3769780","pdf_url":null,"source":{"id":"https://openalex.org/S4387289859","display_name":"Proceedings of the ACM on Management of Data","issn_l":"2836-6573","issn":["2836-6573"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM on Management of Data","raw_type":"journal-article"},{"id":"pmh:oai:arXiv.org:2508.18494","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2508.18494","pdf_url":"https://arxiv.org/pdf/2508.18494","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2508.18494","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2508.18494","pdf_url":"https://arxiv.org/pdf/2508.18494","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":38,"referenced_works":["https://openalex.org/W1987225815","https://openalex.org/W2037562342","https://openalex.org/W2040977973","https://openalex.org/W2043224356","https://openalex.org/W2073167797","https://openalex.org/W2089347259","https://openalex.org/W2094662804","https://openalex.org/W2096598900","https://openalex.org/W2097117768","https://openalex.org/W2122646361","https://openalex.org/W2133845638","https://openalex.org/W2147717514","https://openalex.org/W2150916025","https://openalex.org/W2162659160","https://openalex.org/W2169795703","https://openalex.org/W2173213060","https://openalex.org/W2270660075","https://openalex.org/W2294518132","https://openalex.org/W2444127451","https://openalex.org/W2555648367","https://openalex.org/W2566913073","https://openalex.org/W2804542971","https://openalex.org/W2960484119","https://openalex.org/W2984806015","https://openalex.org/W2992678377","https://openalex.org/W3004538570","https://openalex.org/W3016712336","https://openalex.org/W3028864969","https://openalex.org/W3039554467","https://openalex.org/W4249742400","https://openalex.org/W4300175872","https://openalex.org/W4312356734","https://openalex.org/W4367046898","https://openalex.org/W4386123432","https://openalex.org/W4390647563","https://openalex.org/W4393183654","https://openalex.org/W4399163689","https://openalex.org/W4411403229"],"related_works":[],"abstract_inverted_index":{"Similarity":[0],"join-a":[1],"widely":[2],"used":[3],"operation":[4],"in":[5,76],"data":[6,32,117],"science-finds":[7],"all":[8],"pairs":[9,169],"of":[10,167],"items":[11],"that":[12,99,160,179],"have":[13],"distance":[14],"smaller":[15],"than":[16],"a":[17,38,59,84,107,133,156,164],"threshold.":[18],"Prior":[19],"work":[20],"has":[21],"explored":[22],"distributed":[23],"computation":[24],"methods":[25,36,78],"to":[26,30,120,141,188],"scale":[27],"similarity":[28,96],"join":[29,97],"large":[31,65,165],"volumes":[33],"but":[34,75],"these":[35,77],"require":[37],"cluster":[39],"deployment,":[40],"and":[41,62,124,136,146],"efficiency":[42],"suffers":[43],"from":[44,170,186],"expensive":[45],"inter-machine":[46],"communication.":[47],"On":[48],"the":[49,64,79,93,116],"other":[50],"hand,":[51],"disk-based":[52,95],"solutions":[53],"are":[54],"more":[55],"cost-effective":[56],"by":[57,114],"using":[58],"single":[60,108],"machine":[61],"storing":[63],"dataset":[66],"on":[67,106,174],"high-performance":[68],"external":[69],"storage,":[70],"such":[71],"as":[72,132],"NVMe":[73],"SSDs,":[74],"disk":[80,112,148],"I/O":[81,113],"time":[82],"is":[83],"serious":[85],"bottleneck.":[86],"In":[87],"this":[88],"paper,":[89],"we":[90,154],"propose":[91],"DiskJoin,":[92],"first":[94],"algorithm":[98],"can":[100,161],"process":[101],"billion-scale":[102],"vector":[103,168],"datasets":[104,177],"efficiently":[105],"machine.":[109],"DiskJoin":[110,180],"improves":[111],"tailoring":[115],"access":[118],"patterns":[119],"avoid":[121],"repetitive":[122],"accesses":[123],"read":[125],"amplification.":[126],"It":[127],"also":[128],"uses":[129],"main":[130],"memory":[131],"dynamic":[134],"cache":[135,139,143],"carefully":[137],"manages":[138],"eviction":[140],"improve":[142],"hit":[144],"rate":[145],"reduce":[147],"retrieval":[149],"time.":[150],"For":[151],"further":[152],"acceleration,":[153],"adopt":[155],"probabilistic":[157],"pruning":[158],"technique":[159],"effectively":[162],"prune":[163],"number":[166],"computation.":[171],"Our":[172],"evaluation":[173],"real-world,":[175],"large-scale":[176],"shows":[178],"significantly":[181],"outperforms":[182],"alternatives,":[183],"achieving":[184],"speedups":[185],"50\u00d7":[187],"1000\u00d7.":[189]},"counts_by_year":[],"updated_date":"2026-04-05T17:49:38.594831","created_date":"2025-10-14T00:00:00"}
