{"id":"https://openalex.org/W6977982540","doi":"https://doi.org/10.7907/ew2g-9986","title":"Data Complexity in Machine Learning and Novel Classification Algorithms","display_name":"Data Complexity in Machine Learning and Novel Classification Algorithms","publication_year":2006,"publication_date":"2006-05-30","ids":{"openalex":"https://openalex.org/W6977982540","doi":"https://doi.org/10.7907/ew2g-9986"},"language":"en","primary_location":{"id":"pmh:oai:thesis.library.caltech.edu:1361","is_oa":true,"landing_page_url":null,"pdf_url":"https://thesis.library.caltech.edu/1361/1/thesis.pdf","source":{"id":"https://openalex.org/S4306402402","display_name":"CaltechTHESIS (California Institute of Technology)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I122411786","host_organization_name":"California Institute of Technology","host_organization_lineage":["https://openalex.org/I122411786"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"Thesis"},"type":"article","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://thesis.library.caltech.edu/1361/1/thesis.pdf","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Li, Ling","orcid":null},"institutions":[{"id":"https://openalex.org/I122411786","display_name":"California Institute of Technology","ror":"https://ror.org/05dxps055","country_code":"US","type":"education","lineage":["https://openalex.org/I122411786"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Li, Ling","raw_affiliation_strings":["California Institute of Technology"],"affiliations":[{"raw_affiliation_string":"California Institute of Technology","institution_ids":["https://openalex.org/I122411786"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":1,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I122411786"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.48506386,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":89,"max":94},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":true,"primary_topic":{"id":"https://openalex.org/T12072","display_name":"Machine Learning and Algorithms","score":0.6589999794960022,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12072","display_name":"Machine Learning and Algorithms","score":0.6589999794960022,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12002","display_name":"Computability, Logic, AI Algorithms","score":0.08720000088214874,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12535","display_name":"Machine Learning and Data Classification","score":0.06790000200271606,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computational-complexity-theory","display_name":"Computational complexity theory","score":0.5142999887466431},{"id":"https://openalex.org/keywords/linear-classifier","display_name":"Linear classifier","score":0.4052000045776367},{"id":"https://openalex.org/keywords/time-complexity","display_name":"Time complexity","score":0.39879998564720154},{"id":"https://openalex.org/keywords/asymptotic-computational-complexity","display_name":"Asymptotic computational complexity","score":0.3977000117301941},{"id":"https://openalex.org/keywords/perceptron","display_name":"Perceptron","score":0.3955000042915344},{"id":"https://openalex.org/keywords/data-set","display_name":"Data set","score":0.3869999945163727},{"id":"https://openalex.org/keywords/external-data-representation","display_name":"External Data Representation","score":0.38659998774528503},{"id":"https://openalex.org/keywords/classifier","display_name":"Classifier (UML)","score":0.3653999865055084},{"id":"https://openalex.org/keywords/statistical-classification","display_name":"Statistical classification","score":0.3582000136375427},{"id":"https://openalex.org/keywords/kolmogorov-complexity","display_name":"Kolmogorov complexity","score":0.3529999852180481}],"concepts":[{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.6678000092506409},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6607999801635742},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6238999962806702},{"id":"https://openalex.org/C179799912","wikidata":"https://www.wikidata.org/wiki/Q205084","display_name":"Computational complexity theory","level":2,"score":0.5142999887466431},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.5109999775886536},{"id":"https://openalex.org/C139532973","wikidata":"https://www.wikidata.org/wiki/Q2679259","display_name":"Linear classifier","level":3,"score":0.4052000045776367},{"id":"https://openalex.org/C311688","wikidata":"https://www.wikidata.org/wiki/Q2393193","display_name":"Time complexity","level":2,"score":0.39879998564720154},{"id":"https://openalex.org/C164215192","wikidata":"https://www.wikidata.org/wiki/Q13414364","display_name":"Asymptotic computational complexity","level":4,"score":0.3977000117301941},{"id":"https://openalex.org/C60908668","wikidata":"https://www.wikidata.org/wiki/Q690207","display_name":"Perceptron","level":3,"score":0.3955000042915344},{"id":"https://openalex.org/C58489278","wikidata":"https://www.wikidata.org/wiki/Q1172284","display_name":"Data set","level":2,"score":0.3869999945163727},{"id":"https://openalex.org/C116409475","wikidata":"https://www.wikidata.org/wiki/Q1385056","display_name":"External Data Representation","level":2,"score":0.38659998774528503},{"id":"https://openalex.org/C95623464","wikidata":"https://www.wikidata.org/wiki/Q1096149","display_name":"Classifier (UML)","level":2,"score":0.3653999865055084},{"id":"https://openalex.org/C110083411","wikidata":"https://www.wikidata.org/wiki/Q1744628","display_name":"Statistical classification","level":2,"score":0.3582000136375427},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.3571999967098236},{"id":"https://openalex.org/C2779341405","wikidata":"https://www.wikidata.org/wiki/Q1456811","display_name":"Kolmogorov complexity","level":2,"score":0.3529999852180481},{"id":"https://openalex.org/C179717631","wikidata":"https://www.wikidata.org/wiki/Q2991667","display_name":"Multilayer perceptron","level":3,"score":0.34709998965263367},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.3463999927043915},{"id":"https://openalex.org/C52001869","wikidata":"https://www.wikidata.org/wiki/Q812530","display_name":"Naive Bayes classifier","level":3,"score":0.3278000056743622},{"id":"https://openalex.org/C66905080","wikidata":"https://www.wikidata.org/wiki/Q17005494","display_name":"Binary classification","level":3,"score":0.3264000117778778},{"id":"https://openalex.org/C45942800","wikidata":"https://www.wikidata.org/wiki/Q245652","display_name":"Ensemble learning","level":2,"score":0.32409998774528503},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.3237000107765198},{"id":"https://openalex.org/C50292564","wikidata":"https://www.wikidata.org/wiki/Q2462783","display_name":"Computational learning theory","level":3,"score":0.32010000944137573},{"id":"https://openalex.org/C21080849","wikidata":"https://www.wikidata.org/wiki/Q13611879","display_name":"Data point","level":2,"score":0.31450000405311584},{"id":"https://openalex.org/C136389625","wikidata":"https://www.wikidata.org/wiki/Q334384","display_name":"Supervised learning","level":3,"score":0.3034999966621399},{"id":"https://openalex.org/C2780724565","wikidata":"https://www.wikidata.org/wiki/Q5227256","display_name":"Data classification","level":2,"score":0.29409998655319214},{"id":"https://openalex.org/C75684735","wikidata":"https://www.wikidata.org/wiki/Q858810","display_name":"Big data","level":2,"score":0.2939999997615814},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.2937000095844269},{"id":"https://openalex.org/C107673813","wikidata":"https://www.wikidata.org/wiki/Q812534","display_name":"Bayesian probability","level":2,"score":0.28610000014305115},{"id":"https://openalex.org/C79337645","wikidata":"https://www.wikidata.org/wiki/Q779824","display_name":"Outlier","level":2,"score":0.2838999927043915},{"id":"https://openalex.org/C115903097","wikidata":"https://www.wikidata.org/wiki/Q7094097","display_name":"Online machine learning","level":3,"score":0.27549999952316284},{"id":"https://openalex.org/C96563100","wikidata":"https://www.wikidata.org/wiki/Q8037118","display_name":"Worst-case complexity","level":3,"score":0.2702000141143799},{"id":"https://openalex.org/C162319229","wikidata":"https://www.wikidata.org/wiki/Q175263","display_name":"Data structure","level":2,"score":0.26249998807907104},{"id":"https://openalex.org/C87465248","wikidata":"https://www.wikidata.org/wiki/Q1417790","display_name":"Minimum description length","level":2,"score":0.25200000405311584}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:thesis.library.caltech.edu:1361","is_oa":true,"landing_page_url":null,"pdf_url":"https://thesis.library.caltech.edu/1361/1/thesis.pdf","source":{"id":"https://openalex.org/S4306402402","display_name":"CaltechTHESIS (California Institute of Technology)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I122411786","host_organization_name":"California Institute of Technology","host_organization_lineage":["https://openalex.org/I122411786"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"Thesis"},{"id":"doi:10.7907/ew2g-9986","is_oa":true,"landing_page_url":"https://doi.org/10.7907/ew2g-9986","pdf_url":null,"source":{"id":"https://openalex.org/S7407050848","display_name":"Caltech Library","issn_l":null,"issn":[],"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"thesis"}],"best_oa_location":{"id":"pmh:oai:thesis.library.caltech.edu:1361","is_oa":true,"landing_page_url":null,"pdf_url":"https://thesis.library.caltech.edu/1361/1/thesis.pdf","source":{"id":"https://openalex.org/S4306402402","display_name":"CaltechTHESIS (California Institute of Technology)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I122411786","host_organization_name":"California Institute of Technology","host_organization_lineage":["https://openalex.org/I122411786"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"Thesis"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":false},"content_urls":{"pdf":"https://content.openalex.org/works/W6977982540.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"This":[0],"thesis":[1],"summarizes":[2],"four":[3],"of":[4,12,19,42,48,66,102,182,299,327,385,435,441,447,455,469,477,535],"my":[5],"research":[6],"projects":[7],"in":[8,45,83,106,280,317,347,412,548],"machine":[9,84],"learning.":[10],"One":[11],"them":[13,244],"is":[14,56,75,124,196,287,356,377,452,463,545,566,573],"on":[15,173,253,314,344,459],"a":[16,59,121,169,180,197,205,310,324,336,433,442,505,520],"theoretical":[17],"challenge":[18],"defining":[20],"and":[21,33,94,112,140,156,224,250,364,473,509,554,562],"exploring":[22],"complexity":[23,44,55,65,105,139,154,162,171],"measures":[24],"for":[25,58,159,192,246,291,500],"data":[26,43,54,60,72,104,110,113,116,122,145],"sets;":[27],"the":[28,40,46,63,67,90,95,100,103,138,141,161,189,221,227,305,315,351,374,382,439,448,460,467,470,474,478,494,501,527,532,536,550,570],"others":[29],"are":[30,132,237,256,400,497],"about":[31],"new":[32,521],"improved":[34],"classification":[35,50,293,427],"algorithms.":[36,234],"We":[37,98,177,270,334,539],"first":[38],"investigate":[39],"role":[41],"context":[47],"binary":[49,292,436,461,495],"problems.":[51,176,294],"The":[52,235,295,445,564],"universal":[53],"defined":[57],"set":[61,123,142],"as":[62,87,323,560],"Kolmogorov":[64],"mapping":[68],"enforced":[69],"by":[70,127,465],"that":[71,120,149,201,255,262,275,307,318,373,394,407,525,542],"set.":[73],"It":[74],"closely":[76],"related":[77],"to":[78,137,186,282,304,379,402,432,531],"several":[79,174],"existing":[80,556],"principles":[81],"used":[82],"learning":[85,108,184,210,511,533],"such":[86,289,559],"Occam's":[88],"razor,":[89],"minimum":[91],"description":[92],"length,":[93],"Bayesian":[96],"approach.":[97],"demonstrate":[99,406],"application":[101],"two":[107],"problems,":[109,462],"decomposition":[111],"pruning.":[114],"In":[115,144,515],"decomposition,":[117],"we":[118,147,387,518],"illustrate":[119],"best":[125],"approximated":[126],"its":[128],"principal":[129],"subsets":[130],"which":[131,212,371,451],"Pareto":[133],"optimal":[134,492],"with":[135,168,204,232,268,438,484],"respect":[136],"size.":[143],"pruning,":[146],"show":[148,261,540],"outliers":[150],"usually":[151,225],"have":[152,395,421],"high":[153],"contributions,":[155],"propose":[157,179,519],"methods":[158,274],"estimating":[160],"contribution.":[163],"Experiments":[164,260],"were":[165],"carried":[166],"out":[167],"practical":[170],"measure":[172],"toy":[175],"then":[178],"family":[181],"novel":[183],"algorithms":[185,218,236,264,558],"directly":[187,219],"minimize":[188,220],"0/1":[190,222],"loss":[191],"perceptrons.":[193],"A":[194,425,481],"perceptron":[195,209],"linear":[198],"threshold":[199],"classifier":[200],"separates":[202],"examples":[203],"hyperplane.":[206],"Unlike":[207],"most":[208],"algorithms,":[211],"require":[213],"smooth":[214],"cost":[215,311,354,363,376,392,413,417],"functions,":[216,418],"our":[217,263,543],"loss,":[223],"achieve":[226,283],"lowest":[228],"training":[229,365],"error":[230,366],"compared":[231],"other":[233],"also":[238,271],"computationally":[239],"efficient.":[240],"Such":[241],"advantages":[242],"make":[243],"favorable":[245],"both":[247,466],"standalone":[248],"use":[249],"ensemble":[251,273,454],"learning,":[252],"problems":[254,437,496],"not":[257,489,574],"linearly":[258],"separable.":[259],"work":[265],"very":[266,546,575],"well":[267],"AdaBoost.":[269],"study":[272],"aggregate":[276],"many":[277],"base":[278,456,471,502,510,537,571],"hypotheses":[279],"order":[281],"better":[284,396,422],"performance.":[285,424],"AdaBoost":[286,300,352],"one":[288],"method":[290],"superior":[296],"out-of-sample":[297,397,423],"performance":[298,398,468],"has":[301],"been":[302],"attributed":[303],"fact":[306],"it":[308,319],"minimizes":[309],"function":[312,348,355],"based":[313,343],"margin,":[316],"can":[320,388,420,429],"be":[321,430,490,513],"viewed":[322],"special":[325],"case":[326],"AnyBoost,":[328],"an":[329,453],"abstract":[330,339],"gradient":[331,346],"descent":[332],"algorithm.":[333],"provide":[335],"more":[337,390],"sophisticated":[338],"boosting":[340,523],"algorithm,":[341],"CGBoost,":[342,386],"conjugate":[345],"space.":[349],"When":[350],"exponential":[353,375],"optimized,":[357],"CGBoost":[358,408,419],"generally":[359,409],"yields":[360],"much":[361],"lower":[362],"but":[367,399],"higher":[368],"test":[369],"error,":[370],"implies":[372],"vulnerable":[378],"overfitting.":[380],"With":[381,415],"optimization":[383],"power":[384],"adopt":[389],"\"regularized\"":[391],"functions":[393],"difficult":[401],"optimize.":[403],"Our":[404],"experiments":[405],"outperforms":[410,555],"AnyBoost":[411],"reduction.":[414],"suitable":[416],"multiclass":[426,522,551,557],"problem":[428],"reduced":[431],"collection":[434],"aid":[440],"coding":[443,479,482,528],"matrix.":[444,480],"quality":[446],"final":[449],"solution,":[450],"classifiers":[457],"learned":[458],"affected":[464],"learner":[472,572],"error-correcting":[475,486,508],"ability":[476,487,534],"matrix":[483,529],"strong":[485],"may":[488],"overall":[491],"if":[493],"too":[498],"hard":[499],"learner.":[503,538],"Thus":[504],"trade-off":[506],"between":[507],"should":[512],"sought.":[514],"this":[516],"paper,":[517],"algorithm":[524,544],"modifies":[526],"according":[530],"experimentally":[541],"efficient":[547],"optimizing":[549],"margin":[552],"cost,":[553],"AdaBoost.ECC":[561],"one-vs-one.":[563],"improvement":[565],"especially":[567],"significant":[568],"when":[569],"powerful.":[576]},"counts_by_year":[{"year":2021,"cited_by_count":1},{"year":2020,"cited_by_count":1}],"updated_date":"2025-11-06T06:51:31.235846","created_date":"2025-10-10T00:00:00"}
