{"id":"https://openalex.org/W4403487017","doi":"https://doi.org/10.3233/faia240855","title":"Revisiting the Dataset Bias Problem from a Statistical Perspective","display_name":"Revisiting the Dataset Bias Problem from a Statistical Perspective","publication_year":2024,"publication_date":"2024-10-16","ids":{"openalex":"https://openalex.org/W4403487017","doi":"https://doi.org/10.3233/faia240855"},"language":"en","primary_location":{"id":"doi:10.3233/faia240855","is_oa":false,"landing_page_url":"https://doi.org/10.3233/faia240855","pdf_url":null,"source":{"id":"https://openalex.org/S4210201731","display_name":"Frontiers in artificial intelligence and applications","issn_l":"0922-6389","issn":["0922-6389","1879-8314"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Frontiers in Artificial Intelligence and Applications","raw_type":"book-chapter"},"type":"book-chapter","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5001806269","display_name":"Kien Do","orcid":"https://orcid.org/0000-0002-0119-122X"},"institutions":[{"id":"https://openalex.org/I149704539","display_name":"Deakin University","ror":"https://ror.org/02czsnj07","country_code":"AU","type":"education","lineage":["https://openalex.org/I149704539"]}],"countries":["AU"],"is_corresponding":true,"raw_author_name":"Kien Do","raw_affiliation_strings":["Applied Artificial Intelligence Institute (A2I2), Deakin University, Australia, k.do@deakin.edu.au, dung.nguyen@deakin.edu.au, thai.le@deakin.edu.au, thao.le@deakin.edu.au, d.nguyen@deakin.edu.au, h.harikumar@deakin.edu.au, truyen.tran@deakin.edu.au, santu.rana@deakin.edu.au, svetha.venkatesh@deakin.edu.au"],"affiliations":[{"raw_affiliation_string":"Applied Artificial Intelligence Institute (A2I2), Deakin University, Australia, k.do@deakin.edu.au, dung.nguyen@deakin.edu.au, thai.le@deakin.edu.au, thao.le@deakin.edu.au, d.nguyen@deakin.edu.au, h.harikumar@deakin.edu.au, truyen.tran@deakin.edu.au, santu.rana@deakin.edu.au, svetha.venkatesh@deakin.edu.au","institution_ids":["https://openalex.org/I149704539"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101502392","display_name":"Dung Nguyen","orcid":"https://orcid.org/0000-0002-7726-7841"},"institutions":[{"id":"https://openalex.org/I149704539","display_name":"Deakin University","ror":"https://ror.org/02czsnj07","country_code":"AU","type":"education","lineage":["https://openalex.org/I149704539"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Dung Nguyen","raw_affiliation_strings":["Applied Artificial Intelligence Institute (A2I2), Deakin University, Australia, k.do@deakin.edu.au, dung.nguyen@deakin.edu.au, thai.le@deakin.edu.au, thao.le@deakin.edu.au, d.nguyen@deakin.edu.au, h.harikumar@deakin.edu.au, truyen.tran@deakin.edu.au, santu.rana@deakin.edu.au, svetha.venkatesh@deakin.edu.au"],"affiliations":[{"raw_affiliation_string":"Applied Artificial Intelligence Institute (A2I2), Deakin University, Australia, k.do@deakin.edu.au, dung.nguyen@deakin.edu.au, thai.le@deakin.edu.au, thao.le@deakin.edu.au, d.nguyen@deakin.edu.au, h.harikumar@deakin.edu.au, truyen.tran@deakin.edu.au, santu.rana@deakin.edu.au, svetha.venkatesh@deakin.edu.au","institution_ids":["https://openalex.org/I149704539"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101936199","display_name":"Hung L\u00ea","orcid":"https://orcid.org/0000-0002-3126-184X"},"institutions":[{"id":"https://openalex.org/I149704539","display_name":"Deakin University","ror":"https://ror.org/02czsnj07","country_code":"AU","type":"education","lineage":["https://openalex.org/I149704539"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Hung Le","raw_affiliation_strings":["Applied Artificial Intelligence Institute (A2I2), Deakin University, Australia, k.do@deakin.edu.au, dung.nguyen@deakin.edu.au, thai.le@deakin.edu.au, thao.le@deakin.edu.au, d.nguyen@deakin.edu.au, h.harikumar@deakin.edu.au, truyen.tran@deakin.edu.au, santu.rana@deakin.edu.au, svetha.venkatesh@deakin.edu.au"],"affiliations":[{"raw_affiliation_string":"Applied Artificial Intelligence Institute (A2I2), Deakin University, Australia, k.do@deakin.edu.au, dung.nguyen@deakin.edu.au, thai.le@deakin.edu.au, thao.le@deakin.edu.au, d.nguyen@deakin.edu.au, h.harikumar@deakin.edu.au, truyen.tran@deakin.edu.au, santu.rana@deakin.edu.au, svetha.venkatesh@deakin.edu.au","institution_ids":["https://openalex.org/I149704539"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5079045166","display_name":"Thao Minh Le","orcid":"https://orcid.org/0000-0002-8089-9962"},"institutions":[{"id":"https://openalex.org/I149704539","display_name":"Deakin University","ror":"https://ror.org/02czsnj07","country_code":"AU","type":"education","lineage":["https://openalex.org/I149704539"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Thao Le","raw_affiliation_strings":["Applied Artificial Intelligence Institute (A2I2), Deakin University, Australia, k.do@deakin.edu.au, dung.nguyen@deakin.edu.au, thai.le@deakin.edu.au, thao.le@deakin.edu.au, d.nguyen@deakin.edu.au, h.harikumar@deakin.edu.au, truyen.tran@deakin.edu.au, santu.rana@deakin.edu.au, svetha.venkatesh@deakin.edu.au"],"affiliations":[{"raw_affiliation_string":"Applied Artificial Intelligence Institute (A2I2), Deakin University, Australia, k.do@deakin.edu.au, dung.nguyen@deakin.edu.au, thai.le@deakin.edu.au, thao.le@deakin.edu.au, d.nguyen@deakin.edu.au, h.harikumar@deakin.edu.au, truyen.tran@deakin.edu.au, santu.rana@deakin.edu.au, svetha.venkatesh@deakin.edu.au","institution_ids":["https://openalex.org/I149704539"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101752860","display_name":"Dang H. Nguyen","orcid":"https://orcid.org/0000-0002-9007-8690"},"institutions":[{"id":"https://openalex.org/I149704539","display_name":"Deakin University","ror":"https://ror.org/02czsnj07","country_code":"AU","type":"education","lineage":["https://openalex.org/I149704539"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Dang Nguyen","raw_affiliation_strings":["Applied Artificial Intelligence Institute (A2I2), Deakin University, Australia, k.do@deakin.edu.au, dung.nguyen@deakin.edu.au, thai.le@deakin.edu.au, thao.le@deakin.edu.au, d.nguyen@deakin.edu.au, h.harikumar@deakin.edu.au, truyen.tran@deakin.edu.au, santu.rana@deakin.edu.au, svetha.venkatesh@deakin.edu.au"],"affiliations":[{"raw_affiliation_string":"Applied Artificial Intelligence Institute (A2I2), Deakin University, Australia, k.do@deakin.edu.au, dung.nguyen@deakin.edu.au, thai.le@deakin.edu.au, thao.le@deakin.edu.au, d.nguyen@deakin.edu.au, h.harikumar@deakin.edu.au, truyen.tran@deakin.edu.au, santu.rana@deakin.edu.au, svetha.venkatesh@deakin.edu.au","institution_ids":["https://openalex.org/I149704539"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5023077368","display_name":"Haripriya Harikumar","orcid":"https://orcid.org/0000-0001-9918-381X"},"institutions":[{"id":"https://openalex.org/I149704539","display_name":"Deakin University","ror":"https://ror.org/02czsnj07","country_code":"AU","type":"education","lineage":["https://openalex.org/I149704539"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Haripriya Harikumar","raw_affiliation_strings":["Applied Artificial Intelligence Institute (A2I2), Deakin University, Australia, k.do@deakin.edu.au, dung.nguyen@deakin.edu.au, thai.le@deakin.edu.au, thao.le@deakin.edu.au, d.nguyen@deakin.edu.au, h.harikumar@deakin.edu.au, truyen.tran@deakin.edu.au, santu.rana@deakin.edu.au, svetha.venkatesh@deakin.edu.au"],"affiliations":[{"raw_affiliation_string":"Applied Artificial Intelligence Institute (A2I2), Deakin University, Australia, k.do@deakin.edu.au, dung.nguyen@deakin.edu.au, thai.le@deakin.edu.au, thao.le@deakin.edu.au, d.nguyen@deakin.edu.au, h.harikumar@deakin.edu.au, truyen.tran@deakin.edu.au, santu.rana@deakin.edu.au, svetha.venkatesh@deakin.edu.au","institution_ids":["https://openalex.org/I149704539"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5085471517","display_name":"Truyen Tran","orcid":"https://orcid.org/0000-0001-6531-8907"},"institutions":[{"id":"https://openalex.org/I149704539","display_name":"Deakin University","ror":"https://ror.org/02czsnj07","country_code":"AU","type":"education","lineage":["https://openalex.org/I149704539"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Truyen Tran","raw_affiliation_strings":["Applied Artificial Intelligence Institute (A2I2), Deakin University, Australia, k.do@deakin.edu.au, dung.nguyen@deakin.edu.au, thai.le@deakin.edu.au, thao.le@deakin.edu.au, d.nguyen@deakin.edu.au, h.harikumar@deakin.edu.au, truyen.tran@deakin.edu.au, santu.rana@deakin.edu.au, svetha.venkatesh@deakin.edu.au"],"affiliations":[{"raw_affiliation_string":"Applied Artificial Intelligence Institute (A2I2), Deakin University, Australia, k.do@deakin.edu.au, dung.nguyen@deakin.edu.au, thai.le@deakin.edu.au, thao.le@deakin.edu.au, d.nguyen@deakin.edu.au, h.harikumar@deakin.edu.au, truyen.tran@deakin.edu.au, santu.rana@deakin.edu.au, svetha.venkatesh@deakin.edu.au","institution_ids":["https://openalex.org/I149704539"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5024215125","display_name":"Santu Rana","orcid":"https://orcid.org/0000-0003-2247-850X"},"institutions":[{"id":"https://openalex.org/I149704539","display_name":"Deakin University","ror":"https://ror.org/02czsnj07","country_code":"AU","type":"education","lineage":["https://openalex.org/I149704539"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Santu Rana","raw_affiliation_strings":["Applied Artificial Intelligence Institute (A2I2), Deakin University, Australia, k.do@deakin.edu.au, dung.nguyen@deakin.edu.au, thai.le@deakin.edu.au, thao.le@deakin.edu.au, d.nguyen@deakin.edu.au, h.harikumar@deakin.edu.au, truyen.tran@deakin.edu.au, santu.rana@deakin.edu.au, svetha.venkatesh@deakin.edu.au"],"affiliations":[{"raw_affiliation_string":"Applied Artificial Intelligence Institute (A2I2), Deakin University, Australia, k.do@deakin.edu.au, dung.nguyen@deakin.edu.au, thai.le@deakin.edu.au, thao.le@deakin.edu.au, d.nguyen@deakin.edu.au, h.harikumar@deakin.edu.au, truyen.tran@deakin.edu.au, santu.rana@deakin.edu.au, svetha.venkatesh@deakin.edu.au","institution_ids":["https://openalex.org/I149704539"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5045540854","display_name":"Svetha Venkatesh","orcid":"https://orcid.org/0000-0001-8675-6631"},"institutions":[{"id":"https://openalex.org/I149704539","display_name":"Deakin University","ror":"https://ror.org/02czsnj07","country_code":"AU","type":"education","lineage":["https://openalex.org/I149704539"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Svetha Venkatesh","raw_affiliation_strings":["Applied Artificial Intelligence Institute (A2I2), Deakin University, Australia, k.do@deakin.edu.au, dung.nguyen@deakin.edu.au, thai.le@deakin.edu.au, thao.le@deakin.edu.au, d.nguyen@deakin.edu.au, h.harikumar@deakin.edu.au, truyen.tran@deakin.edu.au, santu.rana@deakin.edu.au, svetha.venkatesh@deakin.edu.au"],"affiliations":[{"raw_affiliation_string":"Applied Artificial Intelligence Institute (A2I2), Deakin University, Australia, k.do@deakin.edu.au, dung.nguyen@deakin.edu.au, thai.le@deakin.edu.au, thao.le@deakin.edu.au, d.nguyen@deakin.edu.au, h.harikumar@deakin.edu.au, truyen.tran@deakin.edu.au, santu.rana@deakin.edu.au, svetha.venkatesh@deakin.edu.au","institution_ids":["https://openalex.org/I149704539"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5001806269"],"corresponding_institution_ids":["https://openalex.org/I149704539"],"apc_list":null,"apc_paid":null,"fwci":0.7863,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.75029878,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12535","display_name":"Machine Learning and Data Classification","score":0.24950000643730164,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12535","display_name":"Machine Learning and Data Classification","score":0.24950000643730164,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10538","display_name":"Data Mining Algorithms and Applications","score":0.23019999265670776,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/perspective","display_name":"Perspective (graphical)","score":0.7721990346908569},{"id":"https://openalex.org/keywords/econometrics","display_name":"Econometrics","score":0.4285876750946045},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.39329537749290466},{"id":"https://openalex.org/keywords/statistics","display_name":"Statistics","score":0.39252009987831116},{"id":"https://openalex.org/keywords/data-science","display_name":"Data science","score":0.3464411199092865},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.33506375551223755},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.2715338170528412},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.25431305170059204}],"concepts":[{"id":"https://openalex.org/C12713177","wikidata":"https://www.wikidata.org/wiki/Q1900281","display_name":"Perspective (graphical)","level":2,"score":0.7721990346908569},{"id":"https://openalex.org/C149782125","wikidata":"https://www.wikidata.org/wiki/Q160039","display_name":"Econometrics","level":1,"score":0.4285876750946045},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.39329537749290466},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.39252009987831116},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.3464411199092865},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.33506375551223755},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.2715338170528412},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.25431305170059204}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.3233/faia240855","is_oa":false,"landing_page_url":"https://doi.org/10.3233/faia240855","pdf_url":null,"source":{"id":"https://openalex.org/S4210201731","display_name":"Frontiers in artificial intelligence and applications","issn_l":"0922-6389","issn":["0922-6389","1879-8314"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Frontiers in Artificial Intelligence and Applications","raw_type":"book-chapter"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2149537132","https://openalex.org/W2018871932","https://openalex.org/W641279757","https://openalex.org/W370975646","https://openalex.org/W1670566515","https://openalex.org/W4242022592","https://openalex.org/W596972243"],"abstract_inverted_index":{"In":[0],"this":[1,87,166],"paper,":[2],"we":[3,89,136,168],"study":[4],"the":[5,15,19,22,36,52,56,98,126,154,191],"\u201cdataset":[6],"bias\u201d":[7],"problem":[8,20],"from":[9,44],"a":[10,26,31,62,66,113,138,176],"statistical":[11],"standpoint,":[12],"and":[13,30,131,144],"identify":[14],"main":[16],"cause":[17],"of":[18,51,100,193],"as":[21,49],"strong":[23],"correlation":[24,74],"between":[25,140],"class":[27],"attribute":[28,33],"u":[29],"non-class":[32],"b":[34],"in":[35,55,133,200],"input":[37],"x,":[38],"represented":[39],"by":[40,104],"p(u|b)":[41,47,160,174],"differing":[42],"significantly":[43],"p(u).":[45],"Since":[46],"appears":[48],"part":[50],"sampling":[53,109],"distributions":[54],"standard":[57],"maximum":[58],"log-likelihood":[59],"(MLL)":[60],"objective,":[61],"model":[63],"trained":[64,179],"on":[65,186],"biased":[67,177,188],"dataset":[68,93],"via":[69,95],"MLL":[70],"inherently":[71],"incorporates":[72],"such":[73],"into":[75],"its":[76],"parameters,":[77],"leading":[78],"to":[79,82,91,116,170],"poor":[80],"generalization":[81],"unbiased":[83],"test":[84],"data.":[85],"From":[86],"observation,":[88],"propose":[90,169],"mitigate":[92],"bias":[94,155],"either":[96],"weighting":[97],"objective":[99],"each":[101],"sample":[102,111],"n":[103],"1":[105,117,172],"/":[106,118,173],"p(un|bn)":[107],"or":[108],"that":[110],"with":[112,180],"weight":[114],"proportional":[115],"p(un|bn).":[119],"While":[120],"both":[121],"methods":[122],"are":[123],"statistically":[124],"equivalent,":[125],"former":[127],"proves":[128],"more":[129],"stable":[130],"effective":[132],"practice.":[134],"Additionally,":[135],"establish":[137],"connection":[139],"our":[141,148,194,204],"debiasing":[142,198],"approach":[143],"causal":[145],"reasoning,":[146],"reinforcing":[147],"method\u2019s":[149],"theoretical":[150,205],"foundation.":[151],"However,":[152],"when":[153],"label":[156],"is":[157,162],"unavailable,":[158],"computing":[159],"exactly":[161],"difficult.":[163],"To":[164],"overcome":[165],"challenge,":[167],"approximate":[171],"using":[175],"classifier":[178],"\u201cbias":[181],"amplification\u201d":[182],"losses.":[183],"Extensive":[184],"experiments":[185],"various":[187],"datasets":[189],"demonstrate":[190],"superiority":[192],"method":[195],"over":[196],"existing":[197],"techniques":[199],"most":[201],"settings,":[202],"validating":[203],"analysis.":[206]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-10T00:00:00"}
