2025
Dai, Kejun; Kim, Jonathan; Džeroski, Sašo; Wicker, Jörg; Dobbie, Gillian; Dost, Katharina
Assessing the risk of discriminatory bias in classification datasets Journal Article
In: Machine Learning, vol. 114, iss. 9, pp. 204, 2025, ISSN: 1573-0565.
@article{Dai2025assessing,
title = {Assessing the risk of discriminatory bias in classification datasets},
author = {Kejun Dai and Jonathan Kim and Sa\v{s}o D\v{z}eroski and J\"{o}rg Wicker and Gillian Dobbie and Katharina Dost},
doi = {10.1007/s10994-025-06843-9},
issn = {1573-0565},
year = {2025},
date = {2025-08-06},
journal = {Machine Learning},
volume = {114},
issue = {9},
pages = {204},
abstract = {Bias in machine learning models remains a critical challenge, particularly in datasets with numeric features where discrimination may be subtle and hard to detect. Existing fairness frameworks rely on expert knowledge of marginalized groups, such as specific racial groups, and categorical features defining them. Furthermore, most frameworks evaluate bias in models rather than datasets, despite the fact that model bias can often be traced back to dataset shortcomings. Our research aims to remedy this gap by capturing dataset flaws in a set of meta-features at the dataset level, and to warn practitioners of bias risk when using such datasets for model training. We neither restrict the feature type nor expect domain knowledge. To this end, we develop methods to synthesize biased datasets and extend current fairness metrics to continuous features in order to quantify dataset-level discrimination risks. Our approach constructs a meta-database of diverse datasets, from which we derive transferable meta-features that capture dataset properties indicative of bias risk. Our findings demonstrate that dataset-level characteristics can serve as cost-effective indicators of bias risk, providing a novel method for data auditing that does not rely on expert knowledge. This work lays the foundation for early-warning systems, moving beyond model-focused assessments toward a data-centric approach.},
keywords = {bias, fairness, machine learning, meta-learning, reliable machine learning},
pubstate = {published},
tppubtype = {article}
}
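
The abstract above mentions extending fairness metrics to continuous features. As a hedged illustration of that general idea (not the paper's actual formulation), the sketch below computes a demographic-parity-style gap for a continuous sensitive feature by splitting it at its median and comparing positive-label rates between the two halves; the function name, the median split, and the synthetic data are assumptions for illustration only.

# Hedged sketch (not the paper's formulation): one simple way to extend a
# demographic-parity-style check to a continuous feature is to split the
# feature at its median and compare positive-label rates in the two groups.
import numpy as np

def parity_gap_continuous(feature: np.ndarray, labels: np.ndarray) -> float:
    """Absolute difference in positive-label rates between the two halves
    of a median split on a continuous feature (0 = perfectly balanced)."""
    threshold = np.median(feature)
    low, high = labels[feature <= threshold], labels[feature > threshold]
    return abs(low.mean() - high.mean())

# Illustrative example: higher feature values receive positive labels more often
rng = np.random.default_rng(0)
x = rng.normal(size=1000)
y = (rng.random(1000) < np.where(x > 0, 0.7, 0.3)).astype(int)
print(f"parity gap: {parity_gap_continuous(x, y):.2f}")  # roughly 0.4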
2020
Chester, Andrew; Koh, Yun Sing; Wicker, Jörg; Sun, Quan; Lee, Junjae
Balancing Utility and Fairness against Privacy in Medical Data Proceedings Article
In: IEEE Symposium Series on Computational Intelligence (SSCI), pp. 1226-1233, IEEE, 2020.
@inproceedings{chester2020balancing,
title = {Balancing Utility and Fairness against Privacy in Medical Data},
author = {Andrew Chester and Yun Sing Koh and J\"{o}rg Wicker and Quan Sun and Junjae Lee},
url = {https://ieeexplore.ieee.org/abstract/document/9308226},
doi = {10.1109/SSCI47803.2020.9308226},
year = {2020},
date = {2020-12-01},
booktitle = {IEEE Symposium Series on Computational Intelligence (SSCI)},
pages = {1226-1233},
publisher = {IEEE},
abstract = {There are numerous challenges when designing algorithms that interact with sensitive data, such as medical or financial records. One of these challenges is privacy. However, there is a tension between privacy, utility (model accuracy), and fairness. While de-identification techniques, such as generalisation and suppression, have been proposed to enable privacy protection, they come at a cost, specifically to fairness and utility. Recent work on fairness in algorithm design defines fairness as a guarantee of similar outputs for "similar" input data. This notion is discussed in connection to de-identification. This research investigates the trade-off between privacy, fairness, and utility; in contrast, other work investigates only the trade-off between privacy and the utility of the data or the accuracy of the model overall. In this research, we investigate the effects of two standard de-identification techniques, k-anonymity and differential privacy, on both utility and fairness. We propose two measures to calculate the trade-off between privacy-utility and privacy-fairness. Although other research has provided guarantees for privacy regarding utility, this research focuses on the trade-offs at set de-identification levels and relies on the guarantees provided by the privacy preservation methods. We discuss the effects of de-identification on data with different characteristics, namely class imbalance and outcome imbalance. We evaluated this on synthetic datasets and standard real-world datasets. As a case study, we analysed the Medical Expenditure Panel Survey dataset.},
keywords = {accuracy, computational sustainability, data mining, fairness, imbalance, machine learning, medicine, privacy},
pubstate = {published},
tppubtype = {inproceedings}
}
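
The entry above proposes two measures for the privacy-utility and privacy-fairness trade-offs; those measures are not reproduced here. As a rough, hedged sketch of how such a trade-off could be summarised, the snippet below averages the relative drop in a utility (or fairness) score per step of de-identification level. The function name and the accuracy values are illustrative assumptions only.

# Hedged sketch, not the paper's proposed measures: summarise a
# privacy-utility trade-off as the average relative loss in a score per
# step of the de-identification level (e.g. increasing k in k-anonymity).
def tradeoff_per_level(scores: list[float]) -> float:
    """Average relative loss in a utility or fairness score per unit
    increase in the de-identification level."""
    baseline = scores[0]
    losses = [(baseline - s) / baseline for s in scores[1:]]
    return sum(losses) / len(losses)

# accuracy measured at k = 1 (no anonymisation), 5, 10, 20 -- placeholder values
accuracy_by_k = [0.86, 0.83, 0.79, 0.74]
print(f"mean relative utility loss per level: {tradeoff_per_level(accuracy_by_k):.3f}")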
Dost, Katharina; Taskova, Katerina; Riddle, Pat; Wicker, Jörg
Your Best Guess When You Know Nothing: Identification and Mitigation of Selection Bias Proceedings Article
In: 2020 IEEE International Conference on Data Mining (ICDM), pp. 996-1001, IEEE, 2020, ISSN: 2374-8486.
@inproceedings{dost2020your,
title = {Your Best Guess When You Know Nothing: Identification and Mitigation of Selection Bias},
author = {Katharina Dost and Katerina Taskova and Pat Riddle and J\"{o}rg Wicker},
url = {https://ieeexplore.ieee.org/document/9338355
https://github.com/KatDost/Imitate
https://pypi.org/project/imitatebias/},
doi = {10.1109/ICDM50108.2020.00115},
issn = {2374-8486},
year = {2020},
date = {2020-11-17},
urldate = {2020-11-17},
booktitle = {2020 IEEE International Conference on Data Mining (ICDM)},
pages = {996-1001},
publisher = {IEEE},
abstract = {Machine Learning typically assumes that training and test sets are independently drawn from the same distribution, but this assumption is often violated in practice, which creates a bias. Many attempts to identify and mitigate this bias have been proposed, but they usually rely on ground-truth information. But what if the researcher is not even aware of the bias?
In contrast to prior work, this paper introduces a new method, Imitate, to identify and mitigate Selection Bias in the case that we may not know if (and where) a bias is present, and hence no ground-truth information is available.
Imitate investigates the dataset's probability density, then adds generated points in order to smooth out the density and have it resemble a Gaussian, the most common density occurring in real-world applications. If the artificial points focus on certain areas and are not widespread, this could indicate a Selection Bias where these areas are underrepresented in the sample.
We demonstrate the effectiveness of the proposed method on both synthetic and real-world datasets. We also point out limitations and future research directions.},
keywords = {bias, data mining, fairness, machine learning},
pubstate = {published},
tppubtype = {inproceedings}
}
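
The abstract above describes Imitate's core idea: inspect the dataset's probability density, compare it with a fitted Gaussian, and treat regions that would need many artificial fill-in points as potentially underrepresented. The sketch below is a loose, hedged one-dimensional illustration of that idea; it is not the API of the released imitatebias package, and the bin count, threshold, and cut-off value are assumptions.

# Hedged sketch of the idea described in the abstract, not the imitatebias
# package API: fit a Gaussian to a 1-D sample, compare expected and observed
# histogram counts, and flag regions where the sample is thinner than the
# Gaussian suggests.
import numpy as np
from scipy.stats import norm

def underrepresented_bins(x: np.ndarray, bins: int = 20, factor: float = 0.5):
    """Return (lower, upper) edges of bins whose observed count falls below
    `factor` times the count expected under a Gaussian fitted to x."""
    mu, sigma = x.mean(), x.std()
    # histogram over +-3 sigma of the fitted Gaussian so that regions with
    # no observations at all still appear as (empty) bins
    counts, edges = np.histogram(x, bins=bins, range=(mu - 3 * sigma, mu + 3 * sigma))
    expected = len(x) * np.diff(norm.cdf(edges, mu, sigma))
    flagged = counts < factor * expected
    return [(edges[i], edges[i + 1]) for i in np.where(flagged)[0]]

# Example: a Gaussian sample with one side deliberately cut off (selection bias)
rng = np.random.default_rng(0)
sample = rng.normal(size=2000)
biased = sample[sample < 1.0]          # observations above 1.0 were never collected
print(underrepresented_bins(biased))   # the flagged intervals lie above the 1.0 cut-off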