{"id":"https://openalex.org/W4283214474","doi":"https://doi.org/10.48550/arxiv.2206.08566","title":"Active Data Discovery: Mining Unknown Data using Submodular Information Measures","display_name":"Active Data Discovery: Mining Unknown Data using Submodular Information Measures","publication_year":2022,"publication_date":"2022-06-17","ids":{"openalex":"https://openalex.org/W4283214474","doi":"https://doi.org/10.48550/arxiv.2206.08566"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2206.08566","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2206.08566","pdf_url":"https://arxiv.org/pdf/2206.08566","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2206.08566","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5035865886","display_name":"Suraj Kothawade","orcid":"https://orcid.org/0000-0001-9405-6862"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Kothawade, Suraj","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5024017460","display_name":"Shivang Chopra","orcid":"https://orcid.org/0000-0002-3567-852X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chopra, Shivang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102314238","display_name":"Saikat Ghosh","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ghosh, Saikat","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5000529247","display_name":"Rishabh Iyer","orcid":"https://orcid.org/0000-0001-9851-463X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Iyer, Rishabh","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5035865886"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12072","display_name":"Machine Learning and Algorithms","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12072","display_name":"Machine Learning and Algorithms","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11652","display_name":"Imbalanced Data Classification Techniques","score":0.9907000064849854,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11754","display_name":"SARS-CoV-2 detection and testing","score":0.9767000079154968,"subfield":{"id":"https://openalex.org/subfields/2725","display_name":"Infectious Diseases"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/submodular-set-function","display_name":"Submodular set function","score":0.90613853931427},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6985192894935608},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.6131634712219238},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.5268092155456543},{"id":"https://openalex.org/keywords/active-learning","display_name":"Active learning (machine learning)","score":0.5240914225578308},{"id":"https://openalex.org/keywords/rare-events","display_name":"Rare events","score":0.5175597071647644},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.5082783102989197},{"id":"https://openalex.org/keywords/event","display_name":"Event (particle physics)","score":0.4916859269142151},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.4659605622291565},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4599186182022095},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.15114253759384155}],"concepts":[{"id":"https://openalex.org/C178621042","wikidata":"https://www.wikidata.org/wiki/Q7631710","display_name":"Submodular set function","level":2,"score":0.90613853931427},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6985192894935608},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.6131634712219238},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.5268092155456543},{"id":"https://openalex.org/C77967617","wikidata":"https://www.wikidata.org/wiki/Q4677561","display_name":"Active learning (machine learning)","level":2,"score":0.5240914225578308},{"id":"https://openalex.org/C2777317252","wikidata":"https://www.wikidata.org/wiki/Q18393516","display_name":"Rare events","level":2,"score":0.5175597071647644},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.5082783102989197},{"id":"https://openalex.org/C2779662365","wikidata":"https://www.wikidata.org/wiki/Q5416694","display_name":"Event (particle physics)","level":2,"score":0.4916859269142151},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.4659605622291565},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4599186182022095},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.15114253759384155},{"id":"https://openalex.org/C114614502","wikidata":"https://www.wikidata.org/wiki/Q76592","display_name":"Combinatorics","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2206.08566","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2206.08566","pdf_url":"https://arxiv.org/pdf/2206.08566","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2206.08566","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2206.08566","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2206.08566","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2206.08566","pdf_url":"https://arxiv.org/pdf/2206.08566","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W1595919516","https://openalex.org/W2945022594","https://openalex.org/W2966531942","https://openalex.org/W2194604332","https://openalex.org/W4379619607","https://openalex.org/W4386185001","https://openalex.org/W1989453388","https://openalex.org/W2922450688","https://openalex.org/W4313349366","https://openalex.org/W2969346978"],"abstract_inverted_index":{"Active":[0],"Learning":[1],"is":[2,97,129],"a":[3,20,52,57,77,121,169,176],"very":[4],"common":[5],"yet":[6],"powerful":[7],"framework":[8,146,172],"for":[9,66,124,218],"iteratively":[10],"and":[11,41,43,120,153,161,183,186,192,204,224],"adaptively":[12],"sampling":[13],"subsets":[14],"of":[15,28,46,59,80,92,178],"the":[16,23,26,47,90,116,125,157,197],"unlabeled":[17,198],"sets":[18],"with":[19,25,188,208],"human":[21],"in":[22,39,61,89,115,175,196],"loop":[24],"goal":[27],"achieving":[29],"labeling":[30,205],"efficiency.":[31],"Most":[32,72],"real":[33],"world":[34],"datasets":[35],"have":[36],"imbalance":[37],"either":[38],"classes":[40,107,154,191,223],"slices,":[42],"correspondingly,":[44],"parts":[45],"dataset":[48],"are":[49],"rare.":[50],"As":[51],"result,":[53],"there":[54],"has":[55],"been":[56],"lot":[58],"work":[60],"designing":[62],"active":[63,126,143,215],"learning":[64,127,216],"approaches":[65,73,217],"mining":[67],"these":[68,84,102,133,221],"rare":[69,85,103,134,190,193,222],"data":[70,86,104,135,144,151],"instances.":[71,87,136],"assume":[74,100],"access":[75],"to":[76,99,130,212],"seed":[78,117],"set":[79],"instances":[81,105],"which":[82,147,173],"contain":[83],"However,":[88],"event":[91],"more":[93],"extreme":[94],"rareness,":[95],"it":[96],"reasonable":[98],"that":[101],"(either":[106],"or":[108],"slices)":[109],"may":[110],"not":[111],"even":[112],"be":[113],"present":[114,195],"labeled":[118],"set,":[119],"critical":[122],"need":[123],"paradigm":[128],"efficiently":[131,155],"discover":[132],"In":[137],"this":[138],"work,":[139],"we":[140],"provide":[141,168],"an":[142],"discovery":[145],"can":[148],"mine":[149],"unknown":[150],"slices":[152,194],"using":[156],"submodular":[158,162],"conditional":[159,163],"gain":[160],"mutual":[164],"information":[165],"functions.":[166],"We":[167,200],"general":[170],"algorithmic":[171],"works":[174,187],"number":[177],"scenarios":[179],"including":[180],"image":[181],"classification":[182],"object":[184],"detection":[185],"both":[189],"set.":[199],"show":[201],"significant":[202],"accuracy":[203],"efficiency":[206],"gains":[207],"our":[209],"approach":[210],"compared":[211],"existing":[213],"state-of-the-art":[214],"actively":[219],"discovering":[220],"slices.":[225]},"counts_by_year":[],"updated_date":"2026-02-09T09:26:11.010843","created_date":"2025-10-10T00:00:00"}
