Abstract
Machine Learning typically assumes that training and test set are independently drawn from the same distribution, but this assumption is often violated in practice which creates a bias. Many attempts to identify and mitigate this bias have been proposed, but they usually rely on ground-truth information. But what if the researcher is not even aware of the bias?
In contrast to prior work, this paper introduces a new method, Imitate, to identify and mitigate Selection Bias in the case that we may not know if (and where) a bias is present, and hence no ground-truth information is available.
Imitate investigates the dataset's probability density, then adds generated points in order to smooth out the density and have it resemble a Gaussian, the most common density occurring in real-world applications. If the artificial points focus on certain areas and are not widespread, this could indicate a Selection Bias where these areas are underrepresented in the sample.
We demonstrate the effectiveness of the proposed method in both synthetic and real-world datasets. We also point out limitations and future research directions.
Links
- https://ieeexplore.ieee.org/document/9338355
- https://github.com/KatDost/Imitate
- https://pypi.org/project/imitatebias/
- doi:10.1109/ICDM50108.2020.00115
BibTeX (Download)
@comment{dost2020your: ICDM 2020 conference paper introducing the Imitate method.
  The url field holds the publisher page only; the source repository and the
  PyPI package links are kept in the non-standard fields repository and package,
  which standard bibliography styles ignore.}
@inproceedings{dost2020your,
  title      = {Your Best Guess When You Know Nothing: Identification and Mitigation of Selection Bias},
  author     = {Dost, Katharina and Taskova, Katerina and Riddle, Pat and Wicker, J{\"o}rg},
  booktitle  = {2020 {IEEE} International Conference on Data Mining ({ICDM})},
  pages      = {996--1001},
  publisher  = {IEEE},
  year       = {2020},
  date       = {2020-11-17},
  doi        = {10.1109/ICDM50108.2020.00115},
  issn       = {2374-8486},
  url        = {https://ieeexplore.ieee.org/document/9338355},
  urldate    = {2020-11-17},
  repository = {https://github.com/KatDost/Imitate},
  package    = {https://pypi.org/project/imitatebias/},
  abstract   = {Machine Learning typically assumes that training and test set are independently drawn from the same distribution, but this assumption is often violated in practice which creates a bias. Many attempts to identify and mitigate this bias have been proposed, but they usually rely on ground-truth information. But what if the researcher is not even aware of the bias?
In contrast to prior work, this paper introduces a new method, Imitate, to identify and mitigate Selection Bias in the case that we may not know if (and where) a bias is present, and hence no ground-truth information is available.
Imitate investigates the dataset's probability density, then adds generated points in order to smooth out the density and have it resemble a Gaussian, the most common density occurring in real-world applications. If the artificial points focus on certain areas and are not widespread, this could indicate a Selection Bias where these areas are underrepresented in the sample.
We demonstrate the effectiveness of the proposed method in both, synthetic and real-world datasets. We also point out limitations and future research directions.},
  keywords   = {bias, data mining, fairness, machine learning},
  pubstate   = {published},
  tppubtype  = {inproceedings}
}

