2025
Park, Sean; Wicker, Jörg; Dost, Katharina
Resource-Constrained Binary Image Classification Proceedings Article
In: Pedreschi, Dino; Monreale, Anna; Pellungrini, Roberto; Naretto, Francesca (Ed.): Discovery Science, pp. 215-230, Springer Nature Switzerland, Cham, 2025, ISBN: 978-3-031-78980-9.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: machine learning
@inproceedings{park2024resource,
title = {Resource-Constrained Binary Image Classification},
author = {Sean Park and J\"{o}rg Wicker and Katharina Dost },
editor = {Dino Pedreschi and Anna Monreale and Roberto Pellungrini and Francesca Naretto},
doi = {10.1007/978-3-031-78980-9_14},
isbn = {978-3-031-78980-9},
year = {2025},
date = {2025-01-28},
urldate = {2024-09-30},
booktitle = {Discovery Science},
pages = {215-230},
publisher = {Springer Nature Switzerland},
address = {Cham},
abstract = {Deep convolutional neural networks (CNNs) have achieved state-of-the-art performance in image classification tasks by automatically learning discriminative features from raw pixel data. However, their success often relies on large labeled training datasets and substantial computational resources, which can be limiting in resource-constrained scenarios. This study explores alternative, lightweight approaches. In particular, we compare a lightweight CNN with a combination of randomly initialized convolutional layers with an ensemble of weak learners in a stacking framework for binary image classification. This method aims to leverage the feature extraction capabilities of convolutional layers while mitigating the need for large datasets and intensive computations. Extensive experiments on seven datasets show that under resource constraints, the decision as to which model to use is not straightforward and depends on a practitioner\'s prioritization of predictive performance vs. training and prediction time vs. memory requirements.},
keywords = {machine learning},
pubstate = {published},
tppubtype = {inproceedings}
}
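As a rough illustration of the approach described in the abstract above (not the authors' implementation), the sketch below pairs a randomly initialised, frozen convolutional feature extractor with a stacking ensemble of weak learners. The layer sizes, base learners, and the placeholder random "images" are assumptions for demonstration only.

# Hedged sketch: random (untrained) convolutional features feeding a stacking
# ensemble of weak learners. Dataset, layers, and learners are placeholders.
import numpy as np
import torch
import torch.nn as nn
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Randomly initialised, frozen convolutional feature extractor (never trained).
extractor = nn.Sequential(
    nn.Conv2d(1, 8, kernel_size=3), nn.ReLU(), nn.MaxPool2d(2),
    nn.Conv2d(8, 16, kernel_size=3), nn.ReLU(), nn.AdaptiveAvgPool2d(4),
    nn.Flatten(),
).eval()

def features(images: np.ndarray) -> np.ndarray:
    """Map (n, 1, 32, 32) images to fixed-length feature vectors."""
    with torch.no_grad():
        return extractor(torch.from_numpy(images).float()).numpy()

# Placeholder binary image data; replace with a real dataset.
X_img = np.random.rand(200, 1, 32, 32)
y = np.random.randint(0, 2, size=200)

stack = StackingClassifier(
    estimators=[("tree", DecisionTreeClassifier(max_depth=3)),
                ("nb", GaussianNB()),
                ("knn", KNeighborsClassifier())],
    final_estimator=LogisticRegression(),
)
stack.fit(features(X_img), y)
print("train accuracy:", stack.score(features(X_img), y))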
Dost, Katharina; Muraoka, Kohji; Ausseil, Anne-Gaelle; Benavidez, Rubianca; Blue, Brendan; Coland, Nic; Daughney, Chris; Semadeni-Davies, Annette; Hoang, Linh; Hooper, Anna; Kpodonu, Theodore Alfred; Marapara, Tapuwa; McDowell, Richard W.; Nguyen, Trung; Nguyet, Dang Anh; Norton, Ned; Özkundakci, Deniz; Pearson, Lisa; Rolinson, James; Smith, Ra; Stephens, Tom; Tamepo, Reina; Taylor, Ken; van Uitregt, Vincent; Jackson, Bethanna; Sarris, Theo; Elliott, Alexander; Wicker, Jörg
Freshwater Quality Modeling in Aotearoa New Zealand: Current Practice and Future Directions Unpublished Forthcoming
SSRN, Forthcoming.
Links | BibTeX | Altmetric | PlumX | Tags: best practice, Catchment modeling process, machine learning, model trustworthiness, Modelling platform design, reliable machine learning, root-cause analysis, water quality
@unpublished{dost2025freshwater,
title = {Freshwater Quality Modeling in Aotearoa New Zealand: Current Practice and Future Directions},
author = {Katharina Dost and Kohji Muraoka and Anne-Gaelle Ausseil and Rubianca Benavidez and Brendan Blue and Nic Coland and Chris Daughney and Annette Semadeni-Davies and Linh Hoang and Anna Hooper and Theodore Alfred Kpodonu and Tapuwa Marapara and Richard W. McDowell and Trung Nguyen and Dang Anh Nguyet and Ned Norton and Deniz \"{O}zkundakci and Lisa Pearson and James Rolinson and Ra Smith and Tom Stephens and Reina Tamepo and Ken Taylor and Vincent van Uitregt and Bethanna Jackson and Theo Sarris and Alexander Elliott and J\"{o}rg Wicker },
doi = {10.2139/ssrn.5105393},
year = {2025},
date = {2025-01-21},
urldate = {2025-01-21},
journal = {SSRN},
howpublished = {SSRN},
keywords = {best practice, Catchment modeling process, machine learning, model trustworthiness, Modelling platform design, reliable machine learning, root-cause analysis, water quality},
pubstate = {forthcoming},
tppubtype = {unpublished}
}
2024
Albrecht, Steffen; Broderick, David; Dost, Katharina; Cheung, Isabella; Nghiem, Nhung; Wu, Milton; Zhu, Johnny; Poonawala-Lohani, Nooriyan; Jamison, Sarah; Rasanathan, Damayanthi; Huang, Sue; Trenholme, Adrian; Stanley, Alicia; Lawrence, Shirley; Marsh, Samantha; Castelino, Lorraine; Paynter, Janine; Turner, Nikki; McIntyre, Peter; Riddle, Pat; Grant, Cameron; Dobbie, Gillian; Wicker, Jörg
Forecasting severe respiratory disease hospitalizations using machine learning algorithms Journal Article
In: BMC Medical Informatics and Decision Making, vol. 24, iss. 1, pp. 293, 2024, ISSN: 1472-6947.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: influenza, machine learning, time series, time series forecasting
@article{Albrecht2024forecasting,
title = {Forecasting severe respiratory disease hospitalizations using machine learning algorithms},
author = {Steffen Albrecht and David Broderick and Katharina Dost and Isabella Cheung and Nhung Nghiem and Milton Wu and Johnny Zhu and Nooriyan Poonawala-Lohani and Sarah Jamison and Damayanthi Rasanathan and Sue Huang and Adrian Trenholme and Alicia Stanley and Shirley Lawrence and Samantha Marsh and Lorraine Castelino and Janine Paynter and Nikki Turner and Peter McIntyre and Pat Riddle and Cameron Grant and Gillian Dobbie and J\"{o}rg Wicker},
url = {https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/s12911-024-02702-0},
doi = {10.1186/s12911-024-02702-0},
issn = {1472-6947},
year = {2024},
date = {2024-10-09},
urldate = {2024-10-07},
journal = {BMC Medical Informatics and Decision Making},
volume = {24},
issue = {1},
pages = {293},
abstract = {Forecasting models predicting trends in hospitalization rates have the potential to inform hospital management during seasonal epidemics of respiratory diseases and the associated surges caused by acute hospital admissions. Hospital bed requirements for elective surgery could be better planned if it were possible to foresee upcoming peaks in severe respiratory illness admissions. Forecasting models can also guide the use of intervention strategies to decrease the spread of respiratory pathogens and thus prevent local health system overload. In this study, we explore the capability of forecasting models to predict the number of hospital admissions in Auckland, New Zealand, within a three-week time horizon. Furthermore, we evaluate probabilistic forecasts and the impact on model performance when integrating laboratory data describing the circulation of respiratory viruses.},
keywords = {influenza, machine learning, time series, time series forecasting},
pubstate = {published},
tppubtype = {article}
}
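A minimal sketch of a direct three-week-ahead forecast from lagged weekly counts, in the spirit of the abstract above. The synthetic series, lag length, and regressor are illustrative assumptions, not the paper's models or data.

# Hedged sketch: direct 3-week-ahead forecasting from lagged weekly admissions.
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor

rng = np.random.default_rng(0)
weeks = np.arange(300)
admissions = 50 + 30 * np.sin(2 * np.pi * weeks / 52) + rng.normal(0, 5, weeks.size)

horizon, n_lags = 3, 8                                  # predict week 3 of the horizon
X, y = [], []
for i in range(len(admissions) - n_lags - horizon + 1):
    X.append(admissions[i:i + n_lags])                  # last 8 observed weeks
    y.append(admissions[i + n_lags + horizon - 1])      # value 3 weeks ahead
X, y = np.array(X), np.array(y)

split = int(0.8 * len(X))
model = GradientBoostingRegressor().fit(X[:split], y[:split])
print("held-out MAE:", np.mean(np.abs(model.predict(X[split:]) - y[split:])))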
Hafner, Jasmin; Lorsbach, Tim; Schmidt, Sebastian; Brydon, Liam; Dost, Katharina; Zhang, Kunyang; Fenner, Kathrin; Wicker, Jörg
Advancements in Biotransformation Pathway Prediction: Enhancements, Datasets, and Novel Functionalities in enviPath Journal Article
In: Journal of Cheminformatics, vol. 16, no. 1, pp. 93, 2024, ISSN: 1758-2946.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: applicability domain, biodegradation, bioinformatics, cheminformatics, computational sustainability, enviPath, linked data, machine learning, multi-label classification, Process-based modeling
@article{hafner2023advancements,
title = {Advancements in Biotransformation Pathway Prediction: Enhancements, Datasets, and Novel Functionalities in enviPath},
author = {Jasmin Hafner and Tim Lorsbach and Sebastian Schmidt and Liam Brydon and Katharina Dost and Kunyang Zhang and Kathrin Fenner and J\"{o}rg Wicker},
url = {https://jcheminf.biomedcentral.com/articles/10.1186/s13321-024-00881-6
https://envipath.org},
doi = {10.1186/s13321-024-00881-6},
issn = {1758-2946},
year = {2024},
date = {2024-08-06},
urldate = {2024-08-06},
journal = {Journal of Cheminformatics},
volume = {16},
number = {1},
pages = {93},
abstract = {enviPath is a widely used database and prediction system for microbial biotransformation pathways of primarily xenobiotic compounds. Data and prediction system are freely available both via a web interface and a public REST API. Since its initial release in 2016, we extended the data available in enviPath and improved the performance of the prediction system and usability of the overall system. We now provide three diverse data sets, covering microbial biotransformation in different environments and under different experimental conditions. This also enabled developing a pathway prediction model that is applicable to a more diverse set of chemicals. In the prediction engine, we implemented a new evaluation tailored towards pathway prediction, which returns a more honest and holistic view on the performance. We also implemented a novel applicability domain algorithm, which allows the user to estimate how well the model will perform on their data. Finally, we improved the implementation to speed up the overall system and provide new functionality via a plugin system.
},
keywords = {applicability domain, biodegradation, bioinformatics, cheminformatics, computational sustainability, enviPath, linked data, machine learning, multi-label classification, Process-based modeling},
pubstate = {published},
tppubtype = {article}
}
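The abstract notes that data and predictions are available via a public REST API. The snippet below is a hedged sketch of querying it with plain HTTP; the /package endpoint and the JSON response layout are assumptions on my part, so consult the documentation at https://envipath.org for the authoritative API.

# Hedged sketch: list enviPath packages via the public REST API (assumed endpoint/keys).
import requests

resp = requests.get("https://envipath.org/package",
                    headers={"Accept": "application/json"}, timeout=30)
resp.raise_for_status()
for pkg in resp.json().get("package", []):   # assumed response key; check the docs
    print(pkg.get("name"), "->", pkg.get("id"))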
Lyu, Jiachen; Dost, Katharina; Koh, Yun Sing; Wicker, Jörg
Regional Bias in Monolingual English Language Models Journal Article
In: Machine Learning, 2024, ISSN: 1573-0565.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: bias, large language models, machine learning, nlp, regional bias, reliable machine learning
@article{lyu2023regional,
title = {Regional Bias in Monolingual English Language Models},
author = {Jiachen Lyu and Katharina Dost and Yun Sing Koh and J\"{o}rg Wicker},
url = {https://link.springer.com/article/10.1007/s10994-024-06555-6
https://dx.doi.org/10.21203/rs.3.rs-3713494/v1},
doi = {10.1007/s10994-024-06555-6},
issn = {1573-0565},
year = {2024},
date = {2024-07-09},
urldate = {2024-07-09},
journal = {Machine Learning},
abstract = { In Natural Language Processing (NLP), pre-trained language models (LLMs) are widely employed and refined for various tasks. These models have shown considerable social and geographic biases creating skewed or even unfair representations of certain groups. Research focuses on biases toward L2 (English as a second language) regions but neglects bias within L1 (first language) regions. In this work, we ask if there is regional bias within L1 regions already inherent in pre-trained LLMs and, if so, what the consequences are in terms of downstream model performance. We contribute an investigation framework specifically tailored for low-resource regions, offering a method to identify bias without imposing strict requirements for labeled datasets. Our research reveals subtle geographic variations in the word embeddings of BERT, even in cultures traditionally perceived as similar. These nuanced features, once captured, have the potential to significantly impact downstream tasks. Generally, models exhibit comparable performance on datasets that share similarities, and conversely, performance may diverge when datasets differ in their nuanced features embedded within the language. It is crucial to note that estimating model performance solely based on standard benchmark datasets may not necessarily apply to the datasets with distinct features from the benchmark datasets. Our proposed framework plays a pivotal role in identifying and addressing biases detected in word embeddings, particularly evident in low-resource regions such as New Zealand.},
keywords = {bias, large language models, machine learning, nlp, regional bias, reliable machine learning},
pubstate = {published},
tppubtype = {article}
}
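A small sketch of the kind of probe the abstract describes: comparing BERT embeddings of regionally flavoured English. The example sentences are made up and this is not the paper's full investigation framework.

# Hedged sketch: cosine similarity between BERT embeddings of region-flavoured text.
import torch
from transformers import AutoModel, AutoTokenizer

tok = AutoTokenizer.from_pretrained("bert-base-uncased")
bert = AutoModel.from_pretrained("bert-base-uncased").eval()

def embed(text: str) -> torch.Tensor:
    """Mean-pooled last-layer embedding of a sentence."""
    with torch.no_grad():
        out = bert(**tok(text, return_tensors="pt")).last_hidden_state
    return out.mean(dim=1).squeeze(0)

nz = embed("We spent the weekend tramping near the bach with our jandals on.")
us = embed("We spent the weekend hiking near the cabin with our flip-flops on.")
print("cosine similarity:", torch.cosine_similarity(nz, us, dim=0).item())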
Long, Derek; Eade, Liam; Dost, Katharina; Meier-Menches, Samuel M; Goldstone, David C; Sullivan, Matthew P; Hartinger, Christian; Wicker, Jörg; Taskova, Katerina
AdductHunter: Identifying Protein-Metal Complex Adducts in Mass Spectra Journal Article
In: Journal of Cheminformatics, vol. 16, iss. 1, 2024, ISSN: 1758-2946.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: cheminformatics, computational sustainability, data mining, dynamic time warping, machine learning, mass spectrometry
@article{Long2023adducthunter,
title = {AdductHunter: Identifying Protein-Metal Complex Adducts in Mass Spectra},
author = {Derek Long and Liam Eade and Katharina Dost and Samuel M Meier-Menches and David C Goldstone and Matthew P Sullivan and Christian Hartinger and J\"{o}rg Wicker and Katerina Taskova},
url = {https://adducthunter.wickerlab.org
https://doi.org/10.21203/rs.3.rs-3322854/v1},
doi = {10.1186/s13321-023-00797-7},
issn = {1758-2946},
year = {2024},
date = {2024-02-06},
urldate = {2024-02-06},
journal = {Journal of Cheminformatics},
volume = {16},
issue = {1},
abstract = {Mass spectrometry (MS) is an analytical technique for molecule identification that can be used for investigating protein-metal complex interactions. Once the MS data is collected, the mass spectra are usually interpreted manually to identify the adducts formed as a result of the interactions between proteins and metal-based species. However, with increasing resolution, dataset size, and species complexity, the time required to identify adducts and the error-prone nature of manual assignment have become limiting factors in MS analysis. AdductHunter is an open-source web-based analysis tool that automates the peak identification process using constraint integer optimization to find feasible combinations of protein and fragments, and dynamic time warping to calculate the dissimilarity between the theoretical isotope pattern of a species and its experimental isotope peak distribution. Empirical evaluation on a collection of 22 unique MS datasets shows fast and accurate identification of protein-metal complex adducts in deconvoluted mass spectra.},
keywords = {cheminformatics, computational sustainability, data mining, dynamic time warping, machine learning, mass spectrometry},
pubstate = {published},
tppubtype = {article}
}
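To make the dynamic-time-warping idea from the abstract concrete, here is a minimal sketch that scores the dissimilarity between a theoretical isotope pattern and an observed peak distribution. The toy patterns are invented and AdductHunter's actual scoring may differ.

# Hedged sketch: classic O(n*m) dynamic time warping between two 1-D signals.
import numpy as np

def dtw_distance(a: np.ndarray, b: np.ndarray) -> float:
    n, m = len(a), len(b)
    cost = np.full((n + 1, m + 1), np.inf)
    cost[0, 0] = 0.0
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            d = abs(a[i - 1] - b[j - 1])
            cost[i, j] = d + min(cost[i - 1, j], cost[i, j - 1], cost[i - 1, j - 1])
    return float(cost[n, m])

theoretical = np.array([0.10, 0.55, 1.00, 0.60, 0.20])        # simulated isotope envelope
observed    = np.array([0.12, 0.50, 0.95, 0.65, 0.25, 0.05])  # measured peak heights
print("DTW dissimilarity:", dtw_distance(theoretical, observed))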
2023
Dost, Katharina; Tam, Jason; Lorsbach, Tim; Schmidt, Sebastian; Wicker, Jörg
Defining Applicability Domain in Biodegradation Pathway Prediction Unpublished Forthcoming
Forthcoming.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: applicability domain, biodegradation, cheminformatics, computational sustainability, enviPath, machine learning, metabolic pathways, reliable machine learning
@unpublished{dost2023defining,
title = {Defining Applicability Domain in Biodegradation Pathway Prediction},
author = {Katharina Dost and Jason Tam and Tim Lorsbach and Sebastian Schmidt and J\"{o}rg Wicker},
doi = {10.21203/rs.3.rs-3587632/v1},
year = {2023},
date = {2023-11-10},
urldate = {2023-11-10},
abstract = {When developing a new chemical, investigating its long-term influences on the environment is crucial to prevent harm. Unfortunately, these experiments are time-consuming. In silico methods can learn from already obtained data to predict biotransformation pathways, and thereby help focus all development efforts on only the most promising chemicals. As all data-based models, these predictors will output pathway predictions for all input compounds in a suitable format, however, these predictions will be faulty unless the model has seen similar compounds during the training process. A common approach to prevent this for other types of models is to define an Applicability Domain for the model that makes predictions only for in-domain compounds and rejects out-of-domain ones. Nonetheless, although exploration of the compound space is particularly interesting in the development of new chemicals, no Applicability Domain method has been tailored to the specific data structure of pathway predictions yet. In this paper, we are the first to define Applicability Domain specialized in biodegradation pathway prediction. Assessing a model’s reliability from different angles, we suggest a three-stage approach that checks for applicability, reliability, and decidability of the model for a queried compound and only allows it to output a prediction if all three stages are passed. Experiments confirm that our proposed technique reliably rejects unsuitable compounds and therefore improves the safety of the biotransformation pathway predictor. },
keywords = {applicability domain, biodegradation, cheminformatics, computational sustainability, enviPath, machine learning, metabolic pathways, reliable machine learning},
pubstate = {forthcoming},
tppubtype = {unpublished}
}
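As one plausible reading of the three-stage idea in the abstract (applicability, reliability, decidability), the sketch below gates predictions with three generic checks. These concrete checks are stand-ins, not the paper's definitions.

# Hedged sketch: a three-stage gate that only lets in-domain queries through.
import numpy as np
from sklearn.neighbors import NearestNeighbors

class ThreeStageGate:
    def __init__(self, X_train, proba_model, k=5, dist_q=0.95, min_conf=0.7):
        # proba_model: any fitted classifier exposing predict_proba.
        self.lo, self.hi = X_train.min(axis=0), X_train.max(axis=0)
        self.nn = NearestNeighbors(n_neighbors=k).fit(X_train)
        d, _ = self.nn.kneighbors(X_train)
        self.dist_thr = np.quantile(d.mean(axis=1), dist_q)
        self.model, self.min_conf = proba_model, min_conf

    def accept(self, x) -> bool:
        x = np.asarray(x).reshape(1, -1)
        applicable = bool(np.all((x >= self.lo) & (x <= self.hi)))      # stage 1
        reliable = self.nn.kneighbors(x)[0].mean() <= self.dist_thr     # stage 2
        decidable = self.model.predict_proba(x).max() >= self.min_conf  # stage 3
        return applicable and reliable and decidable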
Chang, Xinglong; Dost, Katharina; Dobbie, Gillian; Wicker, Jörg
Poison is Not Traceless: Fully-Agnostic Detection of Poisoning Attacks Unpublished Forthcoming
Forthcoming.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: adversarial defence, adversarial learning, machine learning, reliable machine learning
@unpublished{Chang2023poison,
title = {Poison is Not Traceless: Fully-Agnostic Detection of Poisoning Attacks },
author = {Xinglong Chang and Katharina Dost and Gillian Dobbie and J\"{o}rg Wicker},
url = {http://arxiv.org/abs/2310.16224},
doi = {10.48550/arXiv.2310.16224},
year = {2023},
date = {2023-10-23},
urldate = {2023-10-23},
abstract = {The performance of machine learning models depends on the quality of the underlying data. Malicious actors can attack the model by poisoning the training data. Current detectors are tied to either specific data types, models, or attacks, and therefore have limited applicability in real-world scenarios. This paper presents a novel fully-agnostic framework, Diva (Detecting InVisible Attacks), that detects attacks solely relying on analyzing the potentially poisoned data set. Diva is based on the idea that poisoning attacks can be detected by comparing the classifier’s accuracy on poisoned and clean data and pre-trains a meta-learner using Complexity Measures to estimate the otherwise unknown accuracy on a hypothetical clean dataset. The framework applies to generic poisoning attacks. For evaluation purposes, in this paper, we test Diva on label-flipping attacks.},
keywords = {adversarial defence, adversarial learning, machine learning, reliable machine learning},
pubstate = {forthcoming},
tppubtype = {unpublished}
}
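A hedged sketch of the detection idea behind Diva: a pre-trained meta-learner estimates the accuracy a classifier should reach from dataset complexity measures, and a large shortfall on the actual data suggests poisoning. The single complexity measure, classifier, and threshold below are simplifications.

# Hedged sketch: flag a dataset when observed accuracy falls short of the
# accuracy a meta-learner predicts from complexity measures.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

def fisher_ratio(X, y):
    """Simple complexity measure (binary labels 0/1 assumed)."""
    X0, X1 = X[y == 0], X[y == 1]
    num = (X0.mean(axis=0) - X1.mean(axis=0)) ** 2
    den = X0.var(axis=0) + X1.var(axis=0) + 1e-12
    return float((num / den).max())

def poison_suspected(X, y, meta_model, threshold=0.10):
    """meta_model: pre-trained regressor mapping [fisher_ratio] -> expected accuracy."""
    expected = float(meta_model.predict([[fisher_ratio(X, y)]])[0])
    observed = cross_val_score(LogisticRegression(max_iter=1000), X, y, cv=5).mean()
    return (expected - observed) > threshold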
Pullar-Strecker, Zac; Chang, Xinglong; Brydon, Liam; Ziogas, Ioannis; Dost, Katharina; Wicker, Jörg
Memento: Facilitating Effortless, Efficient, and Reliable ML Experiments Proceedings Article
In: Morales, Gianmarco De Francisci; Perlich, Claudia; Ruchansky, Natali; Kourtellis, Nicolas; Baralis, Elena; Bonchi, Francesco (Ed.): Machine Learning and Knowledge Discovery in Databases: Applied Data Science and Demo Track, pp. 310-314, Springer Nature Switzerland, Cham, 2023, ISBN: 978-3-031-43430-3.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: experimental pipeline, parallel computing, reliable machine learning
@inproceedings{Pullar-Strecker2023memento,
title = {Memento: Facilitating Effortless, Efficient, and Reliable ML Experiments},
author = {Zac Pullar-Strecker and Xinglong Chang and Liam Brydon and Ioannis Ziogas and Katharina Dost and J\"{o}rg Wicker},
editor = {Gianmarco De Francisci Morales and Claudia Perlich and Natali Ruchansky and Nicolas Kourtellis and Elena Baralis and Francesco Bonchi },
url = {https://arxiv.org/abs/2304.09175
https://github.com/wickerlab/memento},
doi = {10.1007/978-3-031-43430-3_21},
isbn = {978-3-031-43430-3},
year = {2023},
date = {2023-09-17},
urldate = {2023-09-17},
booktitle = {Machine Learning and Knowledge Discovery in Databases: Applied Data Science and Demo Track},
journal = {Lecture Notes in Computer Science},
pages = {310-314},
publisher = {Springer Nature Switzerland},
address = {Cham},
abstract = { Running complex sets of machine learning experiments is challenging and time-consuming due to the lack of a unified framework. This leaves researchers forced to spend time implementing necessary features such as parallelization, caching, and checkpointing themselves instead of focussing on their project. To simplify the process, in our paper, we introduce Memento, a Python package that is designed to aid researchers and data scientists in the efficient management and execution of computationally intensive experiments. Memento has the capacity to streamline any experimental pipeline by providing a straightforward configuration matrix and the ability to concurrently run experiments across multiple threads.
Code related to this paper is available at: https://github.com/wickerlab/memento.},
keywords = {experimental pipeline, parallel computing, reliable machine learning},
pubstate = {published},
tppubtype = {inproceedings}
}
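To illustrate the concept the abstract describes (a configuration matrix expanded into runs and executed concurrently), here is a generic sketch. This is not Memento's API; see https://github.com/wickerlab/memento for the package itself.

# Hedged sketch of the concept only: expand a config matrix and run it in parallel.
from concurrent.futures import ThreadPoolExecutor
from itertools import product

matrix = {"model": ["knn", "tree"], "dataset": ["iris", "wine"], "seed": [0, 1]}

def run_experiment(config: dict) -> str:
    # Placeholder for training/evaluation keyed by the configuration.
    return f"done: {config}"

configs = [dict(zip(matrix, values)) for values in product(*matrix.values())]
with ThreadPoolExecutor(max_workers=4) as pool:
    for result in pool.map(run_experiment, configs):
        print(result)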
Chang, Luke; Dost, Katharina; Zhai, Kaiqi; Demontis, Ambra; Roli, Fabio; Dobbie, Gillian; Wicker, Jörg
BAARD: Blocking Adversarial Examples by Testing for Applicability, Reliability and Decidability Proceedings Article
In: Kashima, Hisashi; Ide, Tsuyoshi; Peng, Wen-Chih (Ed.): The 27th Pacific-Asia Conference on Knowledge Discovery and Data Mining (PAKDD), pp. 3-14, Springer Nature Switzerland, Cham, 2023, ISBN: 978-3-031-33374-3.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: adversarial defence, adversarial learning, applicability domain, cheminformatics, evasion attacks, machine learning
@inproceedings{chang2021baard,
title = {BAARD: Blocking Adversarial Examples by Testing for Applicability, Reliability and Decidability},
author = {Luke Chang and Katharina Dost and Kaiqi Zhai and Ambra Demontis and Fabio Roli and Gillian Dobbie and J\"{o}rg Wicker},
editor = {Hisashi Kashima and Tsuyoshi Ide and Wen-Chih Peng},
url = {https://arxiv.org/abs/2105.00495
https://github.com/wickerlab/baard},
doi = {10.1007/978-3-031-33374-3_1},
isbn = {978-3-031-33374-3},
year = {2023},
date = {2023-05-27},
urldate = {2023-05-27},
booktitle = {The 27th Pacific-Asia Conference on Knowledge Discovery and Data Mining (PAKDD)},
journal = {The 27th Pacific-Asia Conference on Knowledge Discovery and Data Mining (PAKDD)},
pages = {3-14},
publisher = {Springer Nature Switzerland},
address = {Cham},
abstract = {Adversarial defenses protect machine learning models from adversarial attacks, but are often tailored to one type of model or attack. The lack of information on unknown potential attacks makes detecting adversarial examples challenging. Additionally, attackers do not need to follow the rules made by the defender. To address this problem, we take inspiration from the concept of Applicability Domain in cheminformatics. Cheminformatics models struggle to make accurate predictions because only a limited number of compounds are known and available for training. Applicability Domain defines a domain based on the known compounds and rejects any unknown compound that falls outside the domain. Similarly, adversarial examples start as harmless inputs, but can be manipulated to evade reliable classification by moving outside the domain of the classifier. We are the first to identify the similarity between Applicability Domain and adversarial detection. Instead of focusing on unknown attacks, we focus on what is known, the training data. We propose a simple yet robust triple-stage data-driven framework that checks the input globally and locally, and confirms that they are coherent with the model’s output. This framework can be applied to any classification model and is not limited to specific attacks. We demonstrate these three stages work as one unit, effectively detecting various attacks, even for a white-box scenario.},
keywords = {adversarial defence, adversarial learning, applicability domain, cheminformatics, evasion attacks, machine learning},
pubstate = {published},
tppubtype = {inproceedings}
}
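The abstract's point that inputs should be "coherent with the model's output" can be illustrated with a neighbour-agreement check: compare the classifier's predicted label against the labels of the input's nearest training neighbours. The threshold and the use of a single check are simplifications of BAARD's three stages (see https://github.com/wickerlab/baard).

# Hedged sketch: fraction of nearest training neighbours agreeing with the prediction.
import numpy as np
from sklearn.neighbors import NearestNeighbors

def label_agreement(x, clf, X_train, y_train, k=10):
    nn = NearestNeighbors(n_neighbors=k).fit(X_train)
    _, idx = nn.kneighbors(np.asarray(x).reshape(1, -1))
    predicted = clf.predict(np.asarray(x).reshape(1, -1))[0]
    return float(np.mean(y_train[idx[0]] == predicted))

# e.g. flag the input as potentially adversarial if label_agreement(...) < 0.5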
Chen, Zeyu; Dost, Katharina; Zhu, Xuan; Chang, Xinglong; Dobbie, Gillian; Wicker, Jörg
Targeted Attacks on Time Series Forecasting Proceedings Article
In: Kashima, Hisashi; Ide, Tsuyoshi; Peng, Wen-Chih (Ed.): The 27th Pacific-Asia Conference on Knowledge Discovery and Data Mining (PAKDD), pp. 314-327, Springer Nature Switzerland, Cham, 2023, ISBN: 978-3-031-33383-5.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: adversarial learning, forecasting, machine learning, time series
@inproceedings{Chen2023targeted,
title = {Targeted Attacks on Time Series Forecasting},
author = {Zeyu Chen and Katharina Dost and Xuan Zhu and Xinglong Chang and Gillian Dobbie and J\"{o}rg Wicker},
editor = {Hisashi Kashima and Tsuyoshi Ide and Wen-Chih Peng},
url = {https://github.com/wickerlab/nvita},
doi = {10.1007/978-3-031-33383-5_25},
isbn = {978-3-031-33383-5},
year = {2023},
date = {2023-05-26},
urldate = {2023-05-26},
booktitle = {The 27th Pacific-Asia Conference on Knowledge Discovery and Data Mining (PAKDD)},
pages = {314-327},
publisher = {Springer Nature Switzerland},
address = {Cham},
abstract = {Time Series Forecasting (TSF) is well established in domains dealing with temporal data to predict future events yielding the basis for strategic decision-making. Previous research indicated that forecasting models are vulnerable to adversarial attacks, that is, maliciously crafted perturbations of the original data with the goal of altering the model’s predictions. However, attackers targeting specific outcomes pose a substantially more severe threat as they could manipulate the model and bend it to their needs. Regardless, there is no systematic approach for targeted adversarial learning in the TSF domain yet. In this paper, we introduce targeted attacks on TSF in a systematic manner. We establish a new experimental design standard regarding attack goals and perturbation control for targeted adversarial learning on TSF. For this purpose, we present a novel indirect sparse black-box evasion attack on TSF, nVita. Additionally, we adapt the popular white-box attacks Fast Gradient Sign Method (FGSM) and Basic Iterative Method (BIM). Our experiments confirm not only that all three methods are effective but also that current state-of-the-art TSF models are indeed susceptible to attacks. These results motivate future research in this area to achieve higher reliability of forecasting models.},
keywords = {adversarial learning, forecasting, machine learning, time series},
pubstate = {published},
tppubtype = {inproceedings}
}
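As a hedged sketch of a targeted, FGSM-style perturbation on a differentiable forecaster (the white-box setting mentioned in the abstract, not the nVita attack itself; see https://github.com/wickerlab/nvita), the example below nudges an input window so the forecast moves toward an attacker-chosen value. The toy model, epsilon, and target are placeholders.

# Hedged sketch: one targeted signed-gradient step against a toy forecaster.
import torch
import torch.nn as nn

forecaster = nn.Sequential(nn.Linear(8, 16), nn.ReLU(), nn.Linear(16, 1))  # toy model

def targeted_fgsm(x_window: torch.Tensor, target: float, eps: float = 0.05):
    x = x_window.clone().requires_grad_(True)
    loss = ((forecaster(x) - target) ** 2).mean()   # pull the prediction toward target
    loss.backward()
    return (x - eps * x.grad.sign()).detach()       # one signed-gradient step

x = torch.randn(1, 8)
x_adv = targeted_fgsm(x, target=3.0)
print(forecaster(x).item(), "->", forecaster(x_adv).item())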
Dost, Katharina; Pullar-Strecker, Zac; Brydon, Liam; Zhang, Kunyang; Hafner, Jasmin; Riddle, Pat; Wicker, Jörg
Combatting over-specialization bias in growing chemical databases Journal Article
In: Journal of Cheminformatics, vol. 15, iss. 1, pp. 53, 2023, ISSN: 1758-2946.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: bias, biodegradation, cheminformatics, computational sustainability, data mining, enviPath, machine learning, metabolic pathways, multi-label classification, reliable machine learning
@article{Dost2023Combatting,
title = {Combatting over-specialization bias in growing chemical databases},
author = {Katharina Dost and Zac Pullar-Strecker and Liam Brydon and Kunyang Zhang and Jasmin Hafner and Pat Riddle and J\"{o}rg Wicker},
url = {https://jcheminf.biomedcentral.com/articles/10.1186/s13321-023-00716-w},
doi = {10.1186/s13321-023-00716-w},
issn = {1758-2946},
year = {2023},
date = {2023-05-19},
urldate = {2023-05-19},
journal = {Journal of Cheminformatics},
volume = {15},
issue = {1},
pages = {53},
abstract = {Background
Predicting in advance the behavior of new chemical compounds can support the design process of new products by directing the research toward the most promising candidates and ruling out others. Such predictive models can be data-driven using Machine Learning or based on researchers’ experience and depend on the collection of past results. In either case: models (or researchers) can only make reliable assumptions about compounds that are similar to what they have seen before. Therefore, consequent usage of these predictive models shapes the dataset and causes a continuous specialization shrinking the applicability domain of all trained models on this dataset in the future, and increasingly harming model-based exploration of the space.
Proposed solution
In this paper, we propose cancels (CounterActiNg Compound spEciaLization biaS), a technique that helps to break the dataset specialization spiral. Aiming for a smooth distribution of the compounds in the dataset, we identify areas in the space that fall short and suggest additional experiments that help bridge the gap. Thereby, we generally improve the dataset quality in an entirely unsupervised manner and create awareness of potential flaws in the data. cancels does not aim to cover the entire compound space and hence retains a desirable degree of specialization to a specified research domain.
Results
An extensive set of experiments on the use-case of biodegradation pathway prediction not only reveals that the bias spiral can indeed be observed but also that cancels produces meaningful results. Additionally, we demonstrate that mitigating the observed bias is crucial as it cannot only intervene with the continuous specialization process, but also significantly improves a predictor’s performance while reducing the number of required experiments. Overall, we believe that cancels can support researchers in their experimentation process to not only better understand their data and potential flaws, but also to grow the dataset in a sustainable way. All code is available under github.com/KatDost/Cancels.},
keywords = {bias, biodegradation, cheminformatics, computational sustainability, data mining, enviPath, machine learning, metabolic pathways, multi-label classification, reliable machine learning},
pubstate = {published},
tppubtype = {article}
}
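To illustrate the underlying idea of finding underrepresented areas and proposing extra samples there, the sketch below estimates density in a low-dimensional projection and ranks candidate locations by sparsity. This is a generic illustration, not the Cancels algorithm (see github.com/KatDost/Cancels).

# Hedged sketch: suggest sample locations in the sparsest regions of a projected dataset.
import numpy as np
from sklearn.decomposition import PCA
from sklearn.neighbors import KernelDensity

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 20))             # placeholder compound descriptors
Z = PCA(n_components=2).fit_transform(X)   # low-dimensional view of the dataset

kde = KernelDensity(bandwidth=0.5).fit(Z)
candidates = rng.uniform(Z.min(0), Z.max(0), size=(1000, 2))
density = kde.score_samples(candidates)
suggested = candidates[np.argsort(density)[:20]]   # the 20 sparsest spots to fill
print(suggested[:5])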
2022
Pullar-Strecker, Zac; Dost, Katharina; Frank, Eibe; Wicker, Jörg
Hitting the Target: Stopping Active Learning at the Cost-Based Optimum Journal Article
In: Machine Learning, 2022, ISSN: 1573-0565.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: active learning, data labelling, machine learning, stopping criteria
@article{Pullar-Strecker2022hitting,
title = {Hitting the Target: Stopping Active Learning at the Cost-Based Optimum},
author = {Zac Pullar-Strecker and Katharina Dost and Eibe Frank and J\"{o}rg Wicker},
editor = {Yu-Feng Li and Prateek Jain},
url = {https://arxiv.org/abs/2110.03802},
doi = {10.1007/s10994-022-06253-1},
issn = {1573-0565},
year = {2022},
date = {2022-10-14},
urldate = {2022-10-14},
journal = {Machine Learning},
abstract = {Active learning allows machine learning models to be trained using fewer labels while retaining similar performance to traditional supervised learning. An active learner selects the most informative data points, requests their labels, and retrains itself. While this approach is promising, it raises the question of how to determine when the model is ‘good enough’ without the additional labels required for traditional evaluation. Previously, different stopping criteria have been proposed aiming to identify the optimal stopping point. Yet, optimality can only be expressed as a domain-dependent trade-off between accuracy and the number of labels, and no criterion is superior in all applications. As a further complication, a comparison of criteria for a particular real-world application would require practitioners to collect additional labelled data they are aiming to avoid by using active learning in the first place. This work enables practitioners to employ active learning by providing actionable recommendations for which stopping criteria are best for a given real-world scenario. We contribute the first large-scale comparison of stopping criteria for pool-based active learning, using a cost measure to quantify the accuracy/label trade-off, public implementations of all stopping criteria we evaluate, and an open-source framework for evaluating stopping criteria. Our research enables practitioners to substantially reduce labeling costs by utilizing the stopping criterion which best suits their domain.},
keywords = {active learning, data labelling, machine learning, stopping criteria},
pubstate = {published},
tppubtype = {article}
}
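The cost measure in the abstract can be pictured as labelling cost plus expected misclassification cost, minimised over the learning curve. The curve and cost weights below are illustrative, not taken from the paper.

# Hedged sketch: pick the stopping point that minimises total cost.
import numpy as np

labels = np.arange(10, 510, 10)                   # labels acquired so far
accuracy = 0.95 - 0.45 * np.exp(-labels / 80)     # toy learning curve

label_cost, error_cost, n_predictions = 1.0, 5.0, 10_000
total_cost = label_cost * labels + error_cost * (1 - accuracy) * n_predictions
best = int(np.argmin(total_cost))
print(f"stop after {labels[best]} labels (cost {total_cost[best]:.0f})")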
Dost, Katharina; Duncanson, Hamish; Ziogas, Ioannis; Riddle, Pat; Wicker, Jörg
Divide and Imitate: Multi-Cluster Identification and Mitigation of Selection Bias Proceedings Article
In: 26th Pacific-Asia Conference on Knowledge Discovery and Data Mining (PAKDD2022), pp. 149-160, Springer-Verlag, Berlin, Heidelberg, 2022, ISBN: 978-3-031-05935-3.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: bias, clustering, machine learning
@inproceedings{dost2022divide,
title = {Divide and Imitate: Multi-Cluster Identification and Mitigation of Selection Bias},
author = {Katharina Dost and Hamish Duncanson and Ioannis Ziogas and Pat Riddle and J\"{o}rg Wicker},
url = {https://link.springer.com/chapter/10.1007/978-3-031-05936-0_12
https://github.com/KatDost/Mimic
https://pypi.org/project/imitatebias},
doi = {10.1007/978-3-031-05936-0_12},
isbn = {978-3-031-05935-3},
year = {2022},
date = {2022-05-16},
urldate = {2022-05-16},
booktitle = {26th Pacific-Asia Conference on Knowledge Discovery and Data Mining (PAKDD2022)},
pages = {149-160},
publisher = {Springer-Verlag},
address = {Berlin, Heidelberg},
abstract = {Machine Learning can help overcome human biases in decision making by focusing on purely logical conclusions based on the training data. If the training data is biased, however, that bias will be transferred to the model and remains undetected as the performance is validated on a test set drawn from the same biased distribution. Existing strategies for selection bias identification and mitigation generally rely on some sort of knowledge of the bias or the ground-truth. An exception is the Imitate algorithm that assumes no knowledge but comes with a strong limitation: It can only model datasets with one normally distributed cluster per class. In this paper, we introduce a novel algorithm, Mimic, which uses Imitate as a building block but relaxes this limitation. By allowing mixtures of multivariate Gaussians, our technique is able to model multi-cluster datasets and provide solutions for a substantially wider set of problems. Experiments confirm that Mimic not only identifies potential biases in multi-cluster datasets which can be corrected early on but also improves classifier performance.},
keywords = {bias, clustering, machine learning},
pubstate = {published},
tppubtype = {inproceedings}
}
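A minimal sketch of the multi-cluster modelling step the abstract highlights: selecting the number of Gaussian components (here by BIC) before any per-cluster bias analysis. This shows the mixture-fitting idea only, not the full Mimic algorithm, which is released in the imitatebias package linked above.

# Hedged sketch: choose the number of Gaussian components per class via BIC.
import numpy as np
from sklearn.mixture import GaussianMixture

def fit_mixture(X_class: np.ndarray, max_components: int = 5) -> GaussianMixture:
    fits = [GaussianMixture(n_components=k, random_state=0).fit(X_class)
            for k in range(1, max_components + 1)]
    return min(fits, key=lambda g: g.bic(X_class))   # lowest BIC wins

rng = np.random.default_rng(1)
X = np.vstack([rng.normal(0, 1, (150, 2)), rng.normal(5, 1, (150, 2))])
print("components chosen:", fit_mixture(X).n_components)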
2020
Dost, Katharina; Taskova, Katerina; Riddle, Pat; Wicker, Jörg
Your Best Guess When You Know Nothing: Identification and Mitigation of Selection Bias Proceedings Article
In: 2020 IEEE International Conference on Data Mining (ICDM), pp. 996-1001, IEEE, 2020, ISSN: 2374-8486.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: bias, data mining, fairness, machine learning
@inproceedings{dost2020your,
title = {Your Best Guess When You Know Nothing: Identification and Mitigation of Selection Bias},
author = {Katharina Dost and Katerina Taskova and Pat Riddle and J\"{o}rg Wicker},
url = {https://ieeexplore.ieee.org/document/9338355
https://github.com/KatDost/Imitate
https://pypi.org/project/imitatebias/},
doi = {10.1109/ICDM50108.2020.00115},
issn = {2374-8486},
year = {2020},
date = {2020-11-17},
urldate = {2020-11-17},
booktitle = {2020 IEEE International Conference on Data Mining (ICDM)},
pages = {996-1001},
publisher = {IEEE},
abstract = {Machine Learning typically assumes that training and test set are independently drawn from the same distribution, but this assumption is often violated in practice which creates a bias. Many attempts to identify and mitigate this bias have been proposed, but they usually rely on ground-truth information. But what if the researcher is not even aware of the bias?
In contrast to prior work, this paper introduces a new method, Imitate, to identify and mitigate Selection Bias in the case that we may not know if (and where) a bias is present, and hence no ground-truth information is available.
Imitate investigates the dataset\'s probability density, then adds generated points in order to smooth out the density and have it resemble a Gaussian, the most common density occurring in real-world applications. If the artificial points focus on certain areas and are not widespread, this could indicate a Selection Bias where these areas are underrepresented in the sample.
We demonstrate the effectiveness of the proposed method in both, synthetic and real-world datasets. We also point out limitations and future research directions.},
keywords = {bias, data mining, fairness, machine learning},
pubstate = {published},
tppubtype = {inproceedings}
}
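As a simplified, one-shot illustration of the Imitate idea (the released code is at github.com/KatDost/Imitate and on PyPI as imitatebias): fit a Gaussian to the observed data, sample points from it, and check where those points land in low-density regions, which would hint at underrepresented areas. Bandwidth and thresholds are illustrative assumptions.

# Hedged sketch: generated Gaussian samples that land in low-density regions
# point toward potentially underrepresented areas of the dataset.
import numpy as np
from sklearn.neighbors import KernelDensity

rng = np.random.default_rng(0)
data = rng.normal(0, 1, (400, 2))
data = data[data[:, 0] < 0.5]                     # simulate a selection bias

mean, cov = data.mean(axis=0), np.cov(data.T)
generated = rng.multivariate_normal(mean, cov, size=400)

kde = KernelDensity(bandwidth=0.4).fit(data)
threshold = np.quantile(kde.score_samples(data), 0.05)
sparse = generated[kde.score_samples(generated) < threshold]
print(f"{len(sparse)} generated points land in underrepresented regions")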