{"id":"https://openalex.org/W2086784677","doi":"https://doi.org/10.1145/2213836.2213976","title":"Fast sampling word correlations of high dimensional text data (abstract only)","display_name":"Fast sampling word correlations of high dimensional text data (abstract only)","publication_year":2012,"publication_date":"2012-05-20","ids":{"openalex":"https://openalex.org/W2086784677","doi":"https://doi.org/10.1145/2213836.2213976","mag":"2086784677"},"language":"en","primary_location":{"id":"doi:10.1145/2213836.2213976","is_oa":false,"landing_page_url":"https://doi.org/10.1145/2213836.2213976","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2012 ACM SIGMOD International Conference on Management of Data","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5073404883","display_name":"F. Rosner","orcid":null},"institutions":[{"id":"https://openalex.org/I68956291","display_name":"Martin Luther University Halle-Wittenberg","ror":"https://ror.org/05gqaka33","country_code":"DE","type":"education","lineage":["https://openalex.org/I68956291"]}],"countries":["DE"],"is_corresponding":true,"raw_author_name":"Frank Rosner","raw_affiliation_strings":["Martin-Luther-University Halle-Wittenberg, Halle, Germany","Martin Luther University Halle-Wittenberg, Halle, Germany;"],"affiliations":[{"raw_affiliation_string":"Martin-Luther-University Halle-Wittenberg, Halle, Germany","institution_ids":["https://openalex.org/I68956291"]},{"raw_affiliation_string":"Martin Luther University Halle-Wittenberg, Halle, Germany;","institution_ids":["https://openalex.org/I68956291"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5073994390","display_name":"Alexander Hinneburg","orcid":null},"institutions":[{"id":"https://openalex.org/I68956291","display_name":"Martin Luther University Halle-Wittenberg","ror":"https://ror.org/05gqaka33","country_code":"DE","type":"education","lineage":["https://openalex.org/I68956291"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Alexander Hinneburg","raw_affiliation_strings":["Martin-Luther-University Halle-Wittenberg, Halle, Germany","Martin Luther University Halle-Wittenberg, Halle, Germany;"],"affiliations":[{"raw_affiliation_string":"Martin-Luther-University Halle-Wittenberg, Halle, Germany","institution_ids":["https://openalex.org/I68956291"]},{"raw_affiliation_string":"Martin Luther University Halle-Wittenberg, Halle, Germany;","institution_ids":["https://openalex.org/I68956291"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5054521119","display_name":"Martin Gleditzsch","orcid":"https://orcid.org/0000-0001-5703-0129"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Martin Gleditzsch","raw_affiliation_strings":["Unister GmbH, Leipzig, Germany"],"affiliations":[{"raw_affiliation_string":"Unister GmbH, Leipzig, Germany","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5037700427","display_name":"Mathias Priebe","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mathias Priebe","raw_affiliation_strings":["Unister GmbH, Leipzig, Germany"],"affiliations":[{"raw_affiliation_string":"Unister GmbH, Leipzig, Germany","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5071736571","display_name":"Andreas Both","orcid":"https://orcid.org/0000-0002-9177-5463"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Andreas Both","raw_affiliation_strings":["Unister GmbH, Leipzig, Germany"],"affiliations":[{"raw_affiliation_string":"Unister GmbH, Leipzig, Germany","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5073404883"],"corresponding_institution_ids":["https://openalex.org/I68956291"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.11835113,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"866","last_page":"866"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11269","display_name":"Algorithms and Data Compression","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11269","display_name":"Algorithms and Data Compression","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9937000274658203,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11550","display_name":"Text and Document Classification Technologies","score":0.9854999780654907,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/word","display_name":"Word (group theory)","score":0.8200507164001465},{"id":"https://openalex.org/keywords/locality-sensitive-hashing","display_name":"Locality-sensitive hashing","score":0.6772899627685547},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6663345694541931},{"id":"https://openalex.org/keywords/hash-function","display_name":"Hash function","score":0.6215186715126038},{"id":"https://openalex.org/keywords/cosine-similarity","display_name":"Cosine similarity","score":0.5503097772598267},{"id":"https://openalex.org/keywords/cluster-analysis","display_name":"Cluster analysis","score":0.5154350996017456},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.4754990041255951},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.4366368055343628},{"id":"https://openalex.org/keywords/sorting","display_name":"Sorting","score":0.4115546941757202},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.33971554040908813},{"id":"https://openalex.org/keywords/hash-table","display_name":"Hash table","score":0.2653005123138428},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.2300299108028412},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.1055772602558136}],"concepts":[{"id":"https://openalex.org/C90805587","wikidata":"https://www.wikidata.org/wiki/Q10944557","display_name":"Word (group theory)","level":2,"score":0.8200507164001465},{"id":"https://openalex.org/C74270461","wikidata":"https://www.wikidata.org/wiki/Q1625299","display_name":"Locality-sensitive hashing","level":4,"score":0.6772899627685547},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6663345694541931},{"id":"https://openalex.org/C99138194","wikidata":"https://www.wikidata.org/wiki/Q183427","display_name":"Hash function","level":2,"score":0.6215186715126038},{"id":"https://openalex.org/C2780762811","wikidata":"https://www.wikidata.org/wiki/Q1784941","display_name":"Cosine similarity","level":3,"score":0.5503097772598267},{"id":"https://openalex.org/C73555534","wikidata":"https://www.wikidata.org/wiki/Q622825","display_name":"Cluster analysis","level":2,"score":0.5154350996017456},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.4754990041255951},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.4366368055343628},{"id":"https://openalex.org/C111696304","wikidata":"https://www.wikidata.org/wiki/Q2303697","display_name":"Sorting","level":2,"score":0.4115546941757202},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.33971554040908813},{"id":"https://openalex.org/C67388219","wikidata":"https://www.wikidata.org/wiki/Q207440","display_name":"Hash table","level":3,"score":0.2653005123138428},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.2300299108028412},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.1055772602558136},{"id":"https://openalex.org/C2524010","wikidata":"https://www.wikidata.org/wiki/Q8087","display_name":"Geometry","level":1,"score":0.0},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/2213836.2213976","is_oa":false,"landing_page_url":"https://doi.org/10.1145/2213836.2213976","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2012 ACM SIGMOD International Conference on Management of Data","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.4300000071525574,"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":6,"referenced_works":["https://openalex.org/W1736726159","https://openalex.org/W2012833704","https://openalex.org/W2089497633","https://openalex.org/W2113952909","https://openalex.org/W4236706032","https://openalex.org/W6637359826"],"related_works":["https://openalex.org/W4287326768","https://openalex.org/W2393322642","https://openalex.org/W3131198547","https://openalex.org/W2000601968","https://openalex.org/W2144265691","https://openalex.org/W2033383639","https://openalex.org/W3108918257","https://openalex.org/W3016124764","https://openalex.org/W2188250570","https://openalex.org/W4230735925"],"abstract_inverted_index":{"Finding":[0],"correlated":[1,31,62,169],"words":[2,26,96],"in":[3,230],"large":[4,260],"document":[5],"collections":[6],"is":[7,54,71,86,97],"an":[8,117,146],"important":[9],"ingredient":[10],"for":[11,29,227],"text":[12,261],"analytics.":[13],"The":[14,111],"na\u00efve":[15,185,232],"approach":[16],"computes":[17],"the":[18,57,90,98,101,104,127,130,136,140,183,190,193,207,225,231,244,248,253],"correlations":[19,201,229],"of":[20,47,59,82,100,107,129,139,182,192,247,255],"each":[21],"word":[22,32,63,67,109,122,141,154,170,194,211,249],"against":[23],"all":[24],"other":[25],"and":[27,49,160,199,209],"filters":[28],"highly":[30,61,168],"pairs.":[33,171],"Clearly,":[34],"this":[35,214],"quadratic":[36,218],"method":[37,85,174,215,223],"cannot":[38],"be":[39,114,240],"applied":[40],"to":[41,55,87,145,149,166,259],"real":[42],"world":[43],"scenarios":[44],"with":[45,234],"millions":[46],"documents":[48],"words.":[50],"Our":[51,172,221],"main":[52],"contribution":[53],"transform":[56],"task":[58],"finding":[60,235],"pairs":[64],"into":[65],"a":[66,157,163],"clustering":[68],"problem":[69],"that":[70,89],"efficiently":[72],"solved":[73],"by":[74,116,202,242],"locality":[75],"sensitive":[76],"hashing":[77],"(LSH).":[78],"A":[79],"key":[80],"insight":[81],"our":[83,256],"new":[84,173,222,257],"note":[88],"empirical":[91],"Pearson":[92],"correlation":[93],"between":[94,103,152],"two":[95],"cosine":[99,204],"angle":[102,112,159],"centered":[105,121,153,210],"versions":[106],"their":[108],"vectors.":[110,212,250],"can":[113,134,239],"approximated":[115],"LSH":[118,131,177],"scheme.":[119],"Although":[120],"vectors":[123,155,195],"are":[124],"not":[125],"sparse,":[126],"computation":[128],"hash":[132,236,245],"functions":[133],"exploit":[135],"inherent":[137],"sparsity":[138],"data.":[142],"This":[143,187],"leads":[144],"efficient":[147],"way":[148],"detect":[150],"collisions":[151],"having":[156],"small":[158],"therefore":[161],"provides":[162],"fast":[164],"algorithm":[165,188,233,258],"sample":[167],"based":[175],"on":[176,206],"improves":[178],"run":[179,219],"time":[180],"complexity":[181],"enhanced":[184],"algorithm.":[186],"reduces":[189],"dimensionality":[191],"using":[196],"random":[197],"projection":[198],"approximates":[200],"computing":[203],"similarity":[205],"reduced":[208],"However,":[213],"still":[216],"has":[217],"time.":[220],"replaces":[224],"filtering":[226],"high":[228],"collisions,":[237],"which":[238],"done":[241],"sorting":[243],"values":[246],"We":[251],"evaluate":[252],"scalability":[254],"collections.":[262]},"counts_by_year":[],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
