Abstract
Machine learning typically assumes that training and test sets are independently drawn from the same distribution, but this assumption is often violated in practice, which creates a bias. Many attempts to identify and mitigate this bias have been proposed, but they usually rely on ground-truth information. But what if the researcher is not even aware of the bias?
In contrast to prior work, this paper introduces a new method, Imitate, to identify and mitigate Selection Bias in the case where we may not know whether (and where) a bias is present, and hence no ground-truth information is available.
Imitate investigates the dataset's probability density and adds generated points to smooth the density out so that it resembles a Gaussian, the most common density occurring in real-world applications. If the artificial points concentrate in certain areas rather than being spread widely, this can indicate a Selection Bias: those areas are underrepresented in the sample.
We demonstrate the effectiveness of the proposed method on both synthetic and real-world datasets. We also point out limitations and future research directions.
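To make the core idea concrete, here is a minimal, hypothetical sketch of the fill-towards-Gaussian step for a one-dimensional sample. It is not the authors' implementation (the Imitate repository linked below contains the real code); the function name, the binning scheme, and the crude Gaussian fit are illustrative assumptions.

```python
# Conceptual sketch of the fill-towards-Gaussian idea behind Imitate,
# for a 1D sample. NOT the authors' implementation (see the linked
# repository); all names below are illustrative.
import numpy as np
from scipy import stats

def fill_to_gaussian(x, n_bins=30, seed=None):
    """Generate synthetic points where the empirical density of `x`
    falls short of a Gaussian fitted to the sample."""
    rng = np.random.default_rng(seed)
    # Crude fit on the observed (possibly biased) sample; the real
    # method estimates the underlying Gaussian more carefully.
    mu, sigma = np.mean(x), np.std(x)
    counts, edges = np.histogram(x, bins=n_bins)
    width = edges[1] - edges[0]
    centers = edges[:-1] + width / 2
    # Expected bin counts under the fitted Gaussian, scaled to the sample size.
    expected = stats.norm.pdf(centers, mu, sigma) * len(x) * width
    deficit = np.maximum(expected - counts, 0).round().astype(int)
    # Place the missing points uniformly within each underfilled bin.
    return np.concatenate(
        [rng.uniform(lo, lo + width, size=d) for lo, d in zip(edges[:-1], deficit)]
    )

# Example: a Gaussian sample biased by removing most points above zero.
rng = np.random.default_rng(0)
full = rng.normal(0.0, 1.0, 2000)
biased = full[(full < 0) | (rng.random(2000) < 0.3)]
generated = fill_to_gaussian(biased, seed=1)
# If the generated points cluster (here: above zero), that region is
# likely underrepresented in the sample -- a hint of Selection Bias.
print(f"{len(generated)} points generated, mean {generated.mean():.2f}")
```

In the full method, the generated points serve both purposes described above: flagging concentrated regions as potentially biased and, as described in the paper, helping to mitigate the bias.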
Links
- https://ieeexplore.ieee.org/document/9338355
- https://github.com/KatDost/Imitate
- https://pypi.org/project/imitatebias/
- doi:10.1109/ICDM50108.2020.00115
BibTeX (Download)
@inproceedings{dost2020your,
  title     = {Your Best Guess When You Know Nothing: Identification and Mitigation of Selection Bias},
  author    = {Katharina Dost and Katerina Taskova and Pat Riddle and J\"{o}rg Wicker},
  url       = {https://ieeexplore.ieee.org/document/9338355 https://github.com/KatDost/Imitate https://pypi.org/project/imitatebias/},
  doi       = {10.1109/ICDM50108.2020.00115},
  issn      = {2374-8486},
  year      = {2020},
  date      = {2020-11-17},
  urldate   = {2020-11-17},
  booktitle = {2020 IEEE International Conference on Data Mining (ICDM)},
  pages     = {996-1001},
  publisher = {IEEE},
  abstract  = {Machine learning typically assumes that training and test sets are independently drawn from the same distribution, but this assumption is often violated in practice, which creates a bias. Many attempts to identify and mitigate this bias have been proposed, but they usually rely on ground-truth information. But what if the researcher is not even aware of the bias? In contrast to prior work, this paper introduces a new method, Imitate, to identify and mitigate Selection Bias in the case where we may not know whether (and where) a bias is present, and hence no ground-truth information is available. Imitate investigates the dataset's probability density and adds generated points to smooth the density out so that it resembles a Gaussian, the most common density occurring in real-world applications. If the artificial points concentrate in certain areas rather than being spread widely, this can indicate a Selection Bias: those areas are underrepresented in the sample. We demonstrate the effectiveness of the proposed method on both synthetic and real-world datasets. We also point out limitations and future research directions.},
  keywords  = {bias, data mining, fairness, machine learning},
  pubstate  = {published},
  tppubtype = {inproceedings}
}