{"id":"https://openalex.org/W4403681535","doi":"https://doi.org/10.1145/3689904.3694702","title":"Who's in and who's out? A case study of multimodal CLIP-filtering in DataComp","display_name":"Who's in and who's out? A case study of multimodal CLIP-filtering in DataComp","publication_year":2024,"publication_date":"2024-10-23","ids":{"openalex":"https://openalex.org/W4403681535","doi":"https://doi.org/10.1145/3689904.3694702"},"language":"en","primary_location":{"id":"doi:10.1145/3689904.3694702","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3689904.3694702","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 4th ACM Conference on Equity and Access in Algorithms, Mechanisms, and Optimization","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3689904.3694702","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5052083701","display_name":"Rachel Hong","orcid":"https://orcid.org/0009-0005-4275-653X"},"institutions":[{"id":"https://openalex.org/I201448701","display_name":"University of Washington","ror":"https://ror.org/00cvxb145","country_code":"US","type":"education","lineage":["https://openalex.org/I201448701"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Rachel Hong","raw_affiliation_strings":["University of Washington, United States of America"],"raw_orcid":"https://orcid.org/0009-0005-4275-653X","affiliations":[{"raw_affiliation_string":"University of Washington, United States of America","institution_ids":["https://openalex.org/I201448701"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5011791808","display_name":"William Agnew","orcid":"https://orcid.org/0000-0002-1362-554X"},"institutions":[{"id":"https://openalex.org/I74973139","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33","country_code":"US","type":"education","lineage":["https://openalex.org/I74973139"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"William Agnew","raw_affiliation_strings":["Carnegie Mellon University, United States of America"],"raw_orcid":"https://orcid.org/0000-0002-1362-554X","affiliations":[{"raw_affiliation_string":"Carnegie Mellon University, United States of America","institution_ids":["https://openalex.org/I74973139"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5026130910","display_name":"Tadayoshi Kohno","orcid":null},"institutions":[{"id":"https://openalex.org/I201448701","display_name":"University of Washington","ror":"https://ror.org/00cvxb145","country_code":"US","type":"education","lineage":["https://openalex.org/I201448701"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Tadayoshi Kohno","raw_affiliation_strings":["University of Washington, United States of America"],"raw_orcid":"https://orcid.org/0000-0002-4899-226X","affiliations":[{"raw_affiliation_string":"University of Washington, United States of America","institution_ids":["https://openalex.org/I201448701"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5045592180","display_name":"Jamie Morgenstern","orcid":"https://orcid.org/0000-0003-3753-8405"},"institutions":[{"id":"https://openalex.org/I201448701","display_name":"University of Washington","ror":"https://ror.org/00cvxb145","country_code":"US","type":"education","lineage":["https://openalex.org/I201448701"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jamie Morgenstern","raw_affiliation_strings":["University of Washington, United States of America"],"raw_orcid":"https://orcid.org/0000-0003-3753-8405","affiliations":[{"raw_affiliation_string":"University of Washington, United States of America","institution_ids":["https://openalex.org/I201448701"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5052083701"],"corresponding_institution_ids":["https://openalex.org/I201448701"],"apc_list":null,"apc_paid":null,"fwci":2.9802,"has_fulltext":false,"cited_by_count":9,"citation_normalized_percentile":{"value":0.92384158,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":98,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"17"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9976000189781189,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9976000189781189,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10215","display_name":"Semantic Web and Ontologies","score":0.9923999905586243,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9853000044822693,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.591722309589386},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3417189419269562},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.3387465178966522}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.591722309589386},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3417189419269562},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.3387465178966522}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3689904.3694702","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3689904.3694702","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 4th ACM Conference on Equity and Access in Algorithms, Mechanisms, and Optimization","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3689904.3694702","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3689904.3694702","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 4th ACM Conference on Equity and Access in Algorithms, Mechanisms, and Optimization","raw_type":"proceedings-article"},"sustainable_development_goals":[{"display_name":"Decent work and economic growth","id":"https://metadata.un.org/sdg/8","score":0.47999998927116394}],"awards":[{"id":"https://openalex.org/G1255754416","display_name":null,"funder_award_id":"CNS-2205171, CCF-2045402, GRFP","funder_id":"https://openalex.org/F4320323817","funder_display_name":"Universitas Brawijaya"}],"funders":[{"id":"https://openalex.org/F4320323817","display_name":"Universitas Brawijaya","ror":"https://ror.org/01wk3d929"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":61,"referenced_works":["https://openalex.org/W2100960835","https://openalex.org/W2106652774","https://openalex.org/W2115072199","https://openalex.org/W2122651256","https://openalex.org/W2130850880","https://openalex.org/W2336188181","https://openalex.org/W2578321341","https://openalex.org/W2791170418","https://openalex.org/W2791213089","https://openalex.org/W2896457183","https://openalex.org/W2949678053","https://openalex.org/W3037831233","https://openalex.org/W3038611379","https://openalex.org/W3044564311","https://openalex.org/W3101447603","https://openalex.org/W3103585759","https://openalex.org/W3108655343","https://openalex.org/W3133702157","https://openalex.org/W3135367836","https://openalex.org/W3136672479","https://openalex.org/W3166873126","https://openalex.org/W3174220540","https://openalex.org/W3189849087","https://openalex.org/W3194157648","https://openalex.org/W3207830467","https://openalex.org/W3207941483","https://openalex.org/W3213241618","https://openalex.org/W4206070857","https://openalex.org/W4225109361","https://openalex.org/W4225591000","https://openalex.org/W4250212716","https://openalex.org/W4282026609","https://openalex.org/W4288058287","https://openalex.org/W4288083800","https://openalex.org/W4291220917","https://openalex.org/W4298036549","https://openalex.org/W4306820534","https://openalex.org/W4307020339","https://openalex.org/W4307106676","https://openalex.org/W4312282373","https://openalex.org/W4312933868","https://openalex.org/W4319654043","https://openalex.org/W4361805813","https://openalex.org/W4367365797","https://openalex.org/W4377010286","https://openalex.org/W4379959055","https://openalex.org/W4385270437","https://openalex.org/W4385449797","https://openalex.org/W4385573090","https://openalex.org/W4386065512","https://openalex.org/W4386246854","https://openalex.org/W4386246859","https://openalex.org/W4386249234","https://openalex.org/W4388514288","https://openalex.org/W4388858734","https://openalex.org/W4393321314","https://openalex.org/W4402672040","https://openalex.org/W4402727334","https://openalex.org/W6778883912","https://openalex.org/W6881145156","https://openalex.org/W6913058775"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052"],"abstract_inverted_index":{"As":[0],"training":[1],"datasets":[2,34],"become":[3],"increasingly":[4,19],"drawn":[5],"from":[6,203,258],"unstructured,":[7],"uncontrolled":[8],"environments":[9],"such":[10,136],"as":[11,87,137],"the":[12,28,41,61,102,161,196,213,225,250,305],"web,":[13],"researchers":[14],"and":[15,43,77,122,142,260,284,294],"industry":[16],"practitioners":[17],"have":[18,35],"relied":[20],"upon":[21],"data":[22,72,128,169,202],"filtering":[23,73,111,285],"techniques":[24,115],"to":[25,39,53,64,130,167,171,175,234,254,275,304],"\u201cfilter":[26],"out":[27],"noise\u201d":[29],"of":[30,45,57,83,98,110,119,151,157,177,184,266],"web-scraped":[31],"data.":[32,89],"While":[33],"been":[36],"widely":[37],"shown":[38],"reflect":[40],"biases":[42,76],"values":[44],"their":[46],"creators,":[47],"in":[48,195,212,224,281],"this":[49],"paper":[50,290],"we":[51,93,181,247],"contribute":[52],"an":[54],"emerging":[55],"body":[56],"research":[58],"that":[59,70,127,176,249,261,297],"assesses":[60],"filters":[62,231],"used":[63],"create":[65],"these":[66,204],"datasets.":[67],"We":[68,125,153],"show":[69,248],"image-text":[71,99],"also":[74,154],"has":[75],"is":[78,85,164],"value-laden,":[79],"encoding":[80],"specific":[81],"notions":[82],"what":[84],"counted":[86],"\u201chigh-quality\u201d":[88],"In":[90],"our":[91],"work,":[92],"audit":[94],"a":[95,236,276],"standard":[96],"approach":[97],"CLIP-filtering":[100,200,262],"on":[101],"academic":[103],"benchmark":[104],"DataComp\u2019s":[105],"CommonPool":[106],"by":[107],"analyzing":[108],"discrepancies":[109],"through":[112],"various":[113],"annotation":[114],"across":[116],"multiple":[117],"modalities":[118],"image,":[120],"text,":[121],"website":[123],"source.":[124],"find":[126,155],"relating":[129],"several":[131,264],"imputed":[132],"demographic":[133],"groups":[134,192,205],"\u2014":[135,145],"LGBTQ+":[138],"people,":[139],"older":[140],"women,":[141],"younger":[143],"men":[144],"are":[146,189,232],"associated":[147],"with":[148],"higher":[149,207],"rates":[150],"exclusion.":[152],"prevalence":[156],"Western":[158,172],"bias,":[159],"where":[160],"CLIP":[162],"filter":[163,252],"more":[165],"likely":[166],"include":[168],"related":[170],"countries":[173],"compared":[174],"non-Western":[178],"countries.":[179],"Moreover,":[180],"demonstrate":[182],"cases":[183],"exclusion":[185],"amplification:":[186],"not":[187],"only":[188],"certain":[190],"marginalized":[191],"already":[193,222],"underrepresented":[194],"unfiltered":[197],"data,":[198],"but":[199],"excludes":[201],"at":[206,269],"rates.":[208,271],"The":[209],"data-filtering":[210],"step":[211],"machine":[214],"learning":[215],"pipeline":[216],"can":[217],"therefore":[218],"exacerbate":[219],"representation":[220],"disparities":[221],"present":[223],"data-gathering":[226],"step,":[227],"especially":[228],"when":[229],"existing":[230],"designed":[233],"optimize":[235],"specifically-chosen":[237],"downstream":[238],"performance":[239],"metric":[240],"like":[241],"zero-shot":[242],"image":[243],"classification":[244],"accuracy.":[245],"Finally,":[246],"NSFW":[251],"fails":[253],"remove":[255],"sexually-explicit":[256,295],"content":[257,268],"CommonPool,":[259],"includes":[263],"categories":[265],"copyrighted":[267],"high":[270],"Our":[272],"conclusions":[273],"point":[274],"need":[277],"for":[278],"fundamental":[279],"changes":[280],"dataset":[282],"creation":[283],"practices.":[286],"Content":[287],"warning:":[288],"This":[289],"discusses":[291],"societal":[292],"stereotypes":[293],"material":[296],"may":[298],"be":[299],"disturbing,":[300],"distressing,":[301],"and/or":[302],"offensive":[303],"reader.":[306]},"counts_by_year":[{"year":2026,"cited_by_count":4},{"year":2025,"cited_by_count":5}],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-10T00:00:00"}
