{"id":"https://openalex.org/W2120663175","doi":"https://doi.org/10.1109/tit.2017.2672725","title":"Preconditioned Data Sparsification for Big Data with Applications to PCA and K-means","display_name":"Preconditioned Data Sparsification for Big Data with Applications to PCA and K-means","publication_year":2017,"publication_date":"2017-01-01","ids":{"openalex":"https://openalex.org/W2120663175","doi":"https://doi.org/10.1109/tit.2017.2672725","mag":"2120663175"},"language":"en","primary_location":{"id":"doi:10.1109/tit.2017.2672725","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tit.2017.2672725","pdf_url":null,"source":{"id":"https://openalex.org/S4502562","display_name":"IEEE Transactions on Information Theory","issn_l":"0018-9448","issn":["0018-9448","1557-9654"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Information Theory","raw_type":"journal-article"},"type":"article","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/1511.00152","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Farhad Pourkamali-Anaraki","orcid":"https://orcid.org/0000-0003-4078-1676"},"institutions":[{"id":"https://openalex.org/I188538660","display_name":"University of Colorado Boulder","ror":"https://ror.org/02ttsq026","country_code":"US","type":"education","lineage":["https://openalex.org/I188538660"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Farhad Pourkamali-Anaraki","raw_affiliation_strings":["Department of Electrical, Computer, and Energy Engineering, University of Colorado Boulder, Boulder, CO, USA"],"raw_orcid":"https://orcid.org/0000-0003-4078-1676","affiliations":[{"raw_affiliation_string":"Department of Electrical, Computer, and Energy Engineering, University of Colorado Boulder, Boulder, CO, USA","institution_ids":["https://openalex.org/I188538660"]}]},{"author_position":"last","author":{"id":null,"display_name":"Stephen Becker","orcid":null},"institutions":[{"id":"https://openalex.org/I188538660","display_name":"University of Colorado Boulder","ror":"https://ror.org/02ttsq026","country_code":"US","type":"education","lineage":["https://openalex.org/I188538660"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Stephen Becker","raw_affiliation_strings":["Department of Applied Mathematics, University of Colorado Boulder, Boulder, CO, USA"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Department of Applied Mathematics, University of Colorado Boulder, Boulder, CO, USA","institution_ids":["https://openalex.org/I188538660"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":4.9569,"has_fulltext":false,"cited_by_count":37,"citation_normalized_percentile":{"value":0.96056099,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":90,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"1"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11612","display_name":"Stochastic Gradient Optimization Techniques","score":0.5720000267028809,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11612","display_name":"Stochastic Gradient Optimization Techniques","score":0.5720000267028809,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10500","display_name":"Sparse and Compressive Sensing Techniques","score":0.38609999418258667,"subfield":{"id":"https://openalex.org/subfields/2206","display_name":"Computational Mechanics"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11716","display_name":"Random Matrices and Applications","score":0.012900000438094139,"subfield":{"id":"https://openalex.org/subfields/2613","display_name":"Statistics and Probability"},"field":{"id":"https://openalex.org/fields/26","display_name":"Mathematics"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/principal-component-analysis","display_name":"Principal component analysis","score":0.6941999793052673},{"id":"https://openalex.org/keywords/estimator","display_name":"Estimator","score":0.6847000122070312},{"id":"https://openalex.org/keywords/big-data","display_name":"Big data","score":0.6617000102996826},{"id":"https://openalex.org/keywords/sampling","display_name":"Sampling (signal processing)","score":0.5527999997138977},{"id":"https://openalex.org/keywords/data-compression","display_name":"Data compression","score":0.5271999835968018},{"id":"https://openalex.org/keywords/sparse-pca","display_name":"Sparse PCA","score":0.4984000027179718},{"id":"https://openalex.org/keywords/covariance","display_name":"Covariance","score":0.46889999508857727},{"id":"https://openalex.org/keywords/sampling-scheme","display_name":"Sampling scheme","score":0.43630000948905945},{"id":"https://openalex.org/keywords/robust-principal-component-analysis","display_name":"Robust principal component analysis","score":0.4311000108718872}],"concepts":[{"id":"https://openalex.org/C27438332","wikidata":"https://www.wikidata.org/wiki/Q2873","display_name":"Principal component analysis","level":2,"score":0.6941999793052673},{"id":"https://openalex.org/C185429906","wikidata":"https://www.wikidata.org/wiki/Q1130160","display_name":"Estimator","level":2,"score":0.6847000122070312},{"id":"https://openalex.org/C75684735","wikidata":"https://www.wikidata.org/wiki/Q858810","display_name":"Big data","level":2,"score":0.6617000102996826},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6614999771118164},{"id":"https://openalex.org/C140779682","wikidata":"https://www.wikidata.org/wiki/Q210868","display_name":"Sampling (signal processing)","level":3,"score":0.5527999997138977},{"id":"https://openalex.org/C78548338","wikidata":"https://www.wikidata.org/wiki/Q2493","display_name":"Data compression","level":2,"score":0.5271999835968018},{"id":"https://openalex.org/C24252448","wikidata":"https://www.wikidata.org/wiki/Q7573786","display_name":"Sparse PCA","level":3,"score":0.4984000027179718},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.48559999465942383},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.48100000619888306},{"id":"https://openalex.org/C178650346","wikidata":"https://www.wikidata.org/wiki/Q201984","display_name":"Covariance","level":2,"score":0.46889999508857727},{"id":"https://openalex.org/C2985139394","wikidata":"https://www.wikidata.org/wiki/Q49908","display_name":"Sampling scheme","level":3,"score":0.43630000948905945},{"id":"https://openalex.org/C2777749129","wikidata":"https://www.wikidata.org/wiki/Q17148469","display_name":"Robust principal component analysis","level":3,"score":0.4311000108718872},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.4171999990940094},{"id":"https://openalex.org/C185142706","wikidata":"https://www.wikidata.org/wiki/Q1134404","display_name":"Covariance matrix","level":2,"score":0.3935999870300293},{"id":"https://openalex.org/C56372850","wikidata":"https://www.wikidata.org/wiki/Q1050404","display_name":"Sparse matrix","level":3,"score":0.3765999972820282},{"id":"https://openalex.org/C29265498","wikidata":"https://www.wikidata.org/wiki/Q7047719","display_name":"Noise measurement","level":3,"score":0.36809998750686646},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.3504999876022339},{"id":"https://openalex.org/C87007009","wikidata":"https://www.wikidata.org/wiki/Q210832","display_name":"Statistical hypothesis testing","level":2,"score":0.3212999999523163},{"id":"https://openalex.org/C180016635","wikidata":"https://www.wikidata.org/wiki/Q2712821","display_name":"Compression (physics)","level":2,"score":0.3160000145435333},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.2913999855518341},{"id":"https://openalex.org/C148764684","wikidata":"https://www.wikidata.org/wiki/Q621751","display_name":"Approximation algorithm","level":2,"score":0.2854999899864197},{"id":"https://openalex.org/C2985946229","wikidata":"https://www.wikidata.org/wiki/Q49908","display_name":"Data sampling","level":2,"score":0.28209999203681946},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.2816999852657318},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.2808000147342682},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.2782000005245209},{"id":"https://openalex.org/C16910744","wikidata":"https://www.wikidata.org/wiki/Q7705759","display_name":"Test data","level":2,"score":0.27320000529289246},{"id":"https://openalex.org/C160920958","wikidata":"https://www.wikidata.org/wiki/Q7662746","display_name":"Synthetic data","level":2,"score":0.2720000147819519},{"id":"https://openalex.org/C49898467","wikidata":"https://www.wikidata.org/wiki/Q1517706","display_name":"Stratified sampling","level":2,"score":0.2590000033378601},{"id":"https://openalex.org/C126255220","wikidata":"https://www.wikidata.org/wiki/Q141495","display_name":"Mathematical optimization","level":1,"score":0.2558000087738037},{"id":"https://openalex.org/C124851039","wikidata":"https://www.wikidata.org/wiki/Q2665459","display_name":"Compressed sensing","level":2,"score":0.2538999915122986},{"id":"https://openalex.org/C162319229","wikidata":"https://www.wikidata.org/wiki/Q175263","display_name":"Data structure","level":2,"score":0.2529999911785126}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1109/tit.2017.2672725","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tit.2017.2672725","pdf_url":null,"source":{"id":"https://openalex.org/S4502562","display_name":"IEEE Transactions on Information Theory","issn_l":"0018-9448","issn":["0018-9448","1557-9654"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Information Theory","raw_type":"journal-article"},{"id":"pmh:oai:arXiv.org:1511.00152","is_oa":true,"landing_page_url":"http://arxiv.org/abs/1511.00152","pdf_url":"https://arxiv.org/pdf/1511.00152","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"pmh:oai:cuscholar:8w32r634h","is_oa":false,"landing_page_url":"https://scholar.colorado.edu/concern/articles/8w32r634h","pdf_url":null,"source":{"id":"https://openalex.org/S4306401957","display_name":"CU Scholar (University of Colorado Boulder)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I188538660","host_organization_name":"University of Colorado Boulder","host_organization_lineage":["https://openalex.org/I188538660"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:1511.00152","is_oa":true,"landing_page_url":"http://arxiv.org/abs/1511.00152","pdf_url":"https://arxiv.org/pdf/1511.00152","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":49,"referenced_works":["https://openalex.org/W147615212","https://openalex.org/W602904462","https://openalex.org/W1488435683","https://openalex.org/W1661223625","https://openalex.org/W1874995714","https://openalex.org/W1965972569","https://openalex.org/W1970950689","https://openalex.org/W1991799212","https://openalex.org/W2003207663","https://openalex.org/W2012528387","https://openalex.org/W2037757210","https://openalex.org/W2041836310","https://openalex.org/W2047244756","https://openalex.org/W2051245758","https://openalex.org/W2086486316","https://openalex.org/W2088658556","https://openalex.org/W2091281360","https://openalex.org/W2093813380","https://openalex.org/W2112662017","https://openalex.org/W2117756735","https://openalex.org/W2119667497","https://openalex.org/W2120961178","https://openalex.org/W2126069482","https://openalex.org/W2129932701","https://openalex.org/W2133105246","https://openalex.org/W2133157266","https://openalex.org/W2142827986","https://openalex.org/W2149631607","https://openalex.org/W2159563318","https://openalex.org/W2161160262","https://openalex.org/W2170962921","https://openalex.org/W2440866115","https://openalex.org/W2544661862","https://openalex.org/W2547648546","https://openalex.org/W2949910245","https://openalex.org/W2962963658","https://openalex.org/W2963088512","https://openalex.org/W2963455674","https://openalex.org/W2964249322","https://openalex.org/W4206039841","https://openalex.org/W6616365564","https://openalex.org/W6636339257","https://openalex.org/W6650267568","https://openalex.org/W6668990524","https://openalex.org/W6677280552","https://openalex.org/W6679279041","https://openalex.org/W6681872947","https://openalex.org/W6684512210","https://openalex.org/W6686067075"],"related_works":[],"abstract_inverted_index":{"We":[0,84,112],"analyze":[1],"a":[2,12,29,49,66,77,109,130],"compression":[3],"scheme":[4],"for":[5,87,97],"large":[6],"data":[7,20,138],"sets":[8],"that":[9,25,119,126],"randomly":[10],"keeps":[11],"small":[13],"percentage":[14],"of":[15,18,68,91,101],"the":[16,26,53,73,82,92,102,105],"components":[17],"each":[19],"sample.":[21],"The":[22,62],"benefit":[23,132],"is":[24,28,44,55,65],"output":[27],"sparse":[30],"matrix,":[31,94],"and":[32,57,95,125],"therefore,":[33],"subsequent":[34],"processing,":[35],"such":[36],"as":[37,140,142],"principal":[38],"component":[39],"analysis":[40],"(PCA)":[41],"or":[42],"K-means,":[43],"significantly":[45],"faster,":[46],"especially":[47],"in":[48,72,89,99,104],"distributed-data":[50],"setting.":[51],"Furthermore,":[52],"sampling":[54,63,148],"single-pass":[56],"applicable":[58],"to":[59,80,116,135],"streaming":[60],"data.":[61,83],"mechanism":[64],"variant":[67],"previous":[69],"methods":[70],"proposed":[71],"literature":[74],"combined":[75],"with":[76],"randomized":[78],"preconditioning":[79],"smooth":[81],"provide":[85,129],"guarantees":[86,96],"PCA":[88],"terms":[90,100],"covariance":[93],"K-means":[98],"error":[103],"center":[106],"estimators":[107],"at":[108],"given":[110],"step.":[111],"present":[113],"numerical":[114],"evidence":[115],"show":[117],"both":[118],"our":[120,127],"bounds":[121],"are":[122],"nearly":[123],"tight":[124],"algorithms":[128],"real":[131],"when":[133],"applied":[134],"standard":[136],"test":[137],"sets,":[139],"well":[141],"providing":[143],"certain":[144],"benefits":[145],"over":[146],"related":[147],"approaches.":[149]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":3},{"year":2022,"cited_by_count":2},{"year":2021,"cited_by_count":6},{"year":2020,"cited_by_count":6},{"year":2019,"cited_by_count":8},{"year":2018,"cited_by_count":8},{"year":2017,"cited_by_count":2}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2016-06-24T00:00:00"}
