2025
Miller, Catriona J; Golovina, Evgenija; Gokuladhas, Sreemol; Wicker, Jörg; Jacobson, Jessie C; O'Sullivan, Justin M
Unraveling ADHD: genes, co-occurring traits, and developmental dynamics Journal Article
In: Life Science Alliance, vol. 8, no. 5, 2025.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: bioinformatics, Biological Sciences, biomarkers, computational sustainability, machine learning
@article{miller2025unraveling,
  title = {Unraveling {ADHD}: genes, co-occurring traits, and developmental dynamics},
  author = {Catriona J Miller and Evgenija Golovina and Sreemol Gokuladhas and J{\"o}rg Wicker and Jessie C Jacobson and Justin M O'Sullivan},
  doi = {10.26508/lsa.202403029},
  year = {2025},
  date = {2025-02-25},
  journal = {Life Science Alliance},
  volume = {8},
  number = {5},
  abstract = {Attention-deficit/hyperactivity disorder (ADHD) is a heterogeneous neurodevelopmental condition with a high prevalence of co-occurring conditions, contributing to increased difficulty in long-term management. Genome-wide association studies have identified variants shared between ADHD and co-occurring psychiatric disorders; however, the genetic mechanisms are not fully understood. We integrated gene expression and spatial organization data into a two-sample Mendelian randomization study for putatively causal ADHD genes in fetal and adult cortical tissues. We identified four genes putatively causal for ADHD in cortical tissues (fetal: ST3GAL3, PTPRF, PIDD1; adult: ST3GAL3, TIE1). Protein--protein interaction databases seeded with the causal ADHD genes identified biological pathways linking these genes with conditions (e.g., rheumatoid arthritis) and biomarkers (e.g., lymphocyte counts) known to be associated with ADHD, but without previously shown genetic relationships. The analysis was repeated on adult liver tissue, where putatively causal ADHD gene ST3GAL3 was linked to cholesterol traits. This analysis provides insight into the tissue-dependent temporal relationships between ADHD, co-occurring traits, and biomarkers. Importantly, it delivers evidence for the genetic interplay between co-occurring conditions, both previously studied and unstudied, with ADHD. The multimorbid3D pipeline was created and run in Python (version 3.8.8). All visualizations and data analysis were performed in R (version 4.2.0) through RStudio (version 2022.02.2). Table S16 lists the datasets and software that have been used in our analyses. All scripts are available on GitHub (https://github.com/Catriona-Miller/ADHD_Co-occurring_Traits). Table S16. Software and datasets used for this analysis. Ethics statement: Ethics approval was obtained from the University of Auckland Human Participants Ethics Committee (Decoding SNPs in context, UAHPEC19373).},
  keywords = {bioinformatics, Biological Sciences, biomarkers, computational sustainability, machine learning},
  pubstate = {published},
  tppubtype = {article}
}
Brydon, Liam; Zhang, Kunyang; Dobbie, Gillian; Taskova, Katerina; Wicker, Jörg
Predictive Modeling of Biodegradation Pathways Using Transformer Architectures Journal Article
In: Journal of Cheminformatics, vol. 17, no. 1, pp. 21, 2025, ISSN: 1758-2946.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: biodegradation, cheminformatics, enviPath, machine learning
@article{Brydon2024b,
  title = {Predictive Modeling of Biodegradation Pathways Using Transformer Architectures},
  author = {Liam Brydon and Kunyang Zhang and Gillian Dobbie and Katerina Taskova and J{\"o}rg Wicker},
  url = {https://doi.org/10.21203/rs.3.rs-5200860/v3},
  doi = {10.1186/s13321-025-00969-7},
  issn = {1758-2946},
  year = {2025},
  date = {2025-02-17},
  urldate = {2024-10-24},
  journal = {Journal of Cheminformatics},
  volume = {17},
  number = {1},
  pages = {21},
  abstract = {In recent years, the integration of machine learning techniques into chemical reaction product prediction has opened new avenues for understanding and predicting the behaviour of chemical substances. The necessity for such predictive methods stems from the growing regulatory and social awareness of the environmental consequences associated with the persistence and accumulation of chemical residues. Traditional biodegradation prediction methods rely on expert knowledge to perform predictions. However, creating this expert knowledge is becoming increasingly prohibitive due to the complexity and diversity of newer datasets, leaving existing methods unable to perform predictions on these datasets. We formulate the product prediction problem as a sequence-to-sequence generation task and take inspiration from natural language processing and other reaction prediction tasks. In doing so, we reduce the need for the expensive manual creation of expert-based rules.},
  howpublished = {ResearchSquare},
  keywords = {biodegradation, cheminformatics, enviPath, machine learning},
  pubstate = {published},
  tppubtype = {article}
}
Park, Sean; Wicker, Jörg; Dost, Katharina
Resource-Constrained Binary Image Classification Proceedings Article
In: Pedreschi, Dino; Monreale, Anna; Pellungrini, Roberto; Naretto, Francesca (Ed.): Discovery Science, pp. 215-230, Springer Nature Switzerland, Cham, 2025, ISBN: 978-3-031-78980-9.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: machine learning
@inproceedings{park2024resource,
  title = {Resource-Constrained Binary Image Classification},
  author = {Sean Park and J{\"o}rg Wicker and Katharina Dost},
  editor = {Dino Pedreschi and Anna Monreale and Roberto Pellungrini and Francesca Naretto},
  doi = {10.1007/978-3-031-78980-9_14},
  isbn = {978-3-031-78980-9},
  year = {2025},
  date = {2025-01-28},
  urldate = {2024-09-30},
  booktitle = {Discovery Science},
  pages = {215--230},
  publisher = {Springer Nature Switzerland},
  address = {Cham},
  abstract = {Deep convolutional neural networks (CNNs) have achieved state-of-the-art performance in image classification tasks by automatically learning discriminative features from raw pixel data. However, their success often relies on large labeled training datasets and substantial computational resources, which can be limiting in resource-constrained scenarios. This study explores alternative, lightweight approaches. In particular, we compare a lightweight CNN with a combination of randomly initialized convolutional layers with an ensemble of weak learners in a stacking framework for binary image classification. This method aims to leverage the feature extraction capabilities of convolutional layers while mitigating the need for large datasets and intensive computations. Extensive experiments on seven datasets show that under resource constraints, the decision as to which model to use is not straightforward and depends on a practitioner's prioritization of predictive performance vs. training and prediction time vs. memory requirements.},
  keywords = {machine learning},
  pubstate = {published},
  tppubtype = {inproceedings}
}
Dost, Katharina; Muraoka, Kohji; Ausseil, Anne-Gaelle; Benavidez, Rubianca; Blue, Brendan; Coland, Nic; Daughney, Chris; Semadeni-Davies, Annette; Hoang, Linh; Hooper, Anna; Kpodonu, Theodore Alfred; Marapara, Tapuwa; McDowell, Richard W.; Nguyen, Trung; Nguyet, Dang Anh; Norton, Ned; Özkundakci, Deniz; Pearson, Lisa; Rolinson, James; Smith, Ra; Stephens, Tom; Tamepo, Reina; Taylor, Ken; van Uitregt, Vincent; Jackson, Bethanna; Sarris, Theo; Elliott, Alexander; Wicker, Jörg
Freshwater Quality Modeling in Aotearoa New Zealand: Current Practice and Future Directions Unpublished Forthcoming
SSRN, Forthcoming.
Links | BibTeX | Altmetric | PlumX | Tags: best practice, Catchment modeling process, machine learning, model trustworthiness, Modelling platform design, reliable machine learning, root-cause analysis, water quality
@unpublished{dost2025freshwater,
  title = {Freshwater Quality Modeling in {Aotearoa} {New Zealand}: Current Practice and Future Directions},
  author = {Katharina Dost and Kohji Muraoka and Anne-Gaelle Ausseil and Rubianca Benavidez and Brendan Blue and Nic Coland and Chris Daughney and Annette Semadeni-Davies and Linh Hoang and Anna Hooper and Theodore Alfred Kpodonu and Tapuwa Marapara and Richard W. McDowell and Trung Nguyen and Dang Anh Nguyet and Ned Norton and Deniz {\"O}zkundakci and Lisa Pearson and James Rolinson and Ra Smith and Tom Stephens and Reina Tamepo and Ken Taylor and Vincent van Uitregt and Bethanna Jackson and Theo Sarris and Alexander Elliott and J{\"o}rg Wicker},
  doi = {10.2139/ssrn.5105393},
  year = {2025},
  date = {2025-01-21},
  urldate = {2025-01-21},
  journal = {SSRN},
  howpublished = {SSRN},
  note = {SSRN preprint},
  keywords = {best practice, Catchment modeling process, machine learning, model trustworthiness, Modelling platform design, reliable machine learning, root-cause analysis, water quality},
  pubstate = {forthcoming},
  tppubtype = {unpublished}
}
2024
Graffeuille, Olivier; Koh, Yun Sing; Wicker, Jörg; Lehmann, Moritz
Enabling Asymmetric Knowledge Transfer in Multi-Task Learning with Self-Auxiliaries Unpublished Forthcoming
arXiv, Forthcoming.
Abstract | Links | BibTeX | Tags: machine learning, multi-task learning
@unpublished{graffeuille2024enabling,
  title = {Enabling Asymmetric Knowledge Transfer in Multi-Task Learning with Self-Auxiliaries},
  author = {Olivier Graffeuille and Yun Sing Koh and J{\"o}rg Wicker and Moritz Lehmann},
  url = {https://arxiv.org/abs/2410.15875},
  eprint = {2410.15875},
  eprinttype = {arXiv},
  year = {2024},
  date = {2024-10-21},
  urldate = {2024-10-21},
  abstract = {Knowledge transfer in multi-task learning is typically viewed as a dichotomy; positive transfer, which improves the performance of all tasks, or negative transfer, which hinders the performance of all tasks. In this paper, we investigate the understudied problem of asymmetric task relationships, where knowledge transfer aids the learning of certain tasks while hindering the learning of others. We propose an optimisation strategy that includes additional cloned tasks named self-auxiliaries into the learning process to flexibly transfer knowledge between tasks asymmetrically. Our method can exploit asymmetric task relationships, benefiting from the positive transfer component while avoiding the negative transfer component. We demonstrate that asymmetric knowledge transfer provides substantial improvements in performance compared to existing multi-task optimisation strategies on benchmark computer vision problems.},
  howpublished = {arXiv},
  note = {arXiv preprint},
  keywords = {machine learning, multi-task learning},
  pubstate = {forthcoming},
  tppubtype = {unpublished}
}
Albrecht, Steffen; Broderick, David; Dost, Katharina; Cheung, Isabella; Nghiem, Nhung; Wu, Milton; Zhu, Johnny; Poonawala-Lohani, Nooriyan; Jamison, Sarah; Rasanathan, Damayanthi; Huang, Sue; Trenholme, Adrian; Stanley, Alicia; Lawrence, Shirley; Marsh, Samantha; Castelino, Lorraine; Paynter, Janine; Turner, Nikki; McIntyre, Peter; Riddle, Pat; Grant, Cameron; Dobbie, Gillian; Wicker, Jörg
Forecasting severe respiratory disease hospitalizations using machine learning algorithms Journal Article
In: BMC Medical Informatics and Decision Making, vol. 24, iss. 1, pp. 293, 2024, ISSN: 1472-6947.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: influenza, machine learning, time series, time series forecasting
@article{Albrecht2024forecasting,
  title = {Forecasting severe respiratory disease hospitalizations using machine learning algorithms},
  author = {Steffen Albrecht and David Broderick and Katharina Dost and Isabella Cheung and Nhung Nghiem and Milton Wu and Johnny Zhu and Nooriyan Poonawala-Lohani and Sarah Jamison and Damayanthi Rasanathan and Sue Huang and Adrian Trenholme and Alicia Stanley and Shirley Lawrence and Samantha Marsh and Lorraine Castelino and Janine Paynter and Nikki Turner and Peter McIntyre and Pat Riddle and Cameron Grant and Gillian Dobbie and J{\"o}rg Wicker},
  url = {https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/s12911-024-02702-0},
  doi = {10.1186/s12911-024-02702-0},
  issn = {1472-6947},
  year = {2024},
  date = {2024-10-09},
  urldate = {2024-10-07},
  journal = {BMC Medical Informatics and Decision Making},
  volume = {24},
  number = {1},
  pages = {293},
  abstract = {Forecasting models predicting trends in hospitalization rates have the potential to inform hospital management during seasonal epidemics of respiratory diseases and the associated surges caused by acute hospital admissions. Hospital bed requirements for elective surgery could be better planned if it were possible to foresee upcoming peaks in severe respiratory illness admissions. Forecasting models can also guide the use of intervention strategies to decrease the spread of respiratory pathogens and thus prevent local health system overload. In this study, we explore the capability of forecasting models to predict the number of hospital admissions in Auckland, New Zealand, within a three-week time horizon. Furthermore, we evaluate probabilistic forecasts and the impact on model performance when integrating laboratory data describing the circulation of respiratory viruses.},
  keywords = {influenza, machine learning, time series, time series forecasting},
  pubstate = {published},
  tppubtype = {article}
}
Hua, Yan Cathy; Denny, Paul; Wicker, Jörg; Taskova, Katerina
A Systematic Review of Aspect-based Sentiment Analysis: Domains, Methods, and Trends Journal Article
In: Artificial Intelligence Review, vol. 57, no. 11, pp. 296, 2024, ISSN: 1573-7462.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: aspect-based sentiment analysis, machine learning, nlp, review
@article{hua2023systematic,
  title = {A Systematic Review of Aspect-based Sentiment Analysis: Domains, Methods, and Trends},
  author = {Yan Cathy Hua and Paul Denny and J{\"o}rg Wicker and Katerina Taskova},
  url = {https://link.springer.com/article/10.1007/s10462-024-10906-z},
  eprint = {2311.10777},
  eprinttype = {arXiv},
  doi = {10.1007/s10462-024-10906-z},
  issn = {1573-7462},
  year = {2024},
  date = {2024-09-17},
  urldate = {2023-11-17},
  journal = {Artificial Intelligence Review},
  volume = {57},
  number = {11},
  pages = {296},
  abstract = {Aspect-based sentiment analysis (ABSA) is a fine-grained type of sentiment analysis that identifies aspects and their associated opinions from a given text. With the surge of digital opinionated text data, ABSA gained increasing popularity for its ability to mine more detailed and targeted insights. Many review papers on ABSA subtasks and solution methodologies exist, however, few focus on trends over time or systemic issues relating to research application domains, datasets, and solution approaches. To fill the gap, this paper presents a systematic literature review (SLR) of ABSA studies with a focus on trends and high-level relationships among these fundamental components. This review is one of the largest SLRs on ABSA. To our knowledge, it is also the first to systematically examine the interrelations among ABSA research and data distribution across domains, as well as trends in solution paradigms and approaches. Our sample includes 727 primary studies screened from 8550 search results without time constraints via an innovative automatic filtering process. Our quantitative analysis not only identifies trends in nearly two decades of ABSA research development but also unveils a systemic lack of dataset and domain diversity as well as domain mismatch that may hinder the development of future ABSA research. We discuss these findings and their implications and propose suggestions for future research.},
  keywords = {aspect-based sentiment analysis, machine learning, nlp, review},
  pubstate = {published},
  tppubtype = {article}
}
Hafner, Jasmin; Lorsbach, Tim; Schmidt, Sebastian; Brydon, Liam; Dost, Katharina; Zhang, Kunyang; Fenner, Kathrin; Wicker, Jörg
Advancements in Biotransformation Pathway Prediction: Enhancements, Datasets, and Novel Functionalities in enviPath Journal Article
In: Journal of Cheminformatics, vol. 16, no. 1, pp. 93, 2024, ISSN: 1758-2946.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: applicability domain, biodegradation, bioinformatics, cheminformatics, computational sustainability, enviPath, linked data, machine learning, multi-label classification, Process-based modeling
@article{hafner2023advancements,
  title = {Advancements in Biotransformation Pathway Prediction: Enhancements, Datasets, and Novel Functionalities in {enviPath}},
  author = {Jasmin Hafner and Tim Lorsbach and Sebastian Schmidt and Liam Brydon and Katharina Dost and Kunyang Zhang and Kathrin Fenner and J{\"o}rg Wicker},
  url = {https://jcheminf.biomedcentral.com/articles/10.1186/s13321-024-00881-6},
  doi = {10.1186/s13321-024-00881-6},
  issn = {1758-2946},
  year = {2024},
  date = {2024-08-06},
  urldate = {2024-08-06},
  journal = {Journal of Cheminformatics},
  volume = {16},
  number = {1},
  pages = {93},
  abstract = {enviPath is a widely used database and prediction system for microbial biotransformation pathways of primarily xenobiotic compounds. Data and prediction system are freely available both via a web interface and a public REST API. Since its initial release in 2016, we extended the data available in enviPath and improved the performance of the prediction system and usability of the overall system. We now provide three diverse data sets, covering microbial biotransformation in different environments and under different experimental conditions. This also enabled developing a pathway prediction model that is applicable to a more diverse set of chemicals. In the prediction engine, we implemented a new evaluation tailored towards pathway prediction, which returns a more honest and holistic view on the performance. We also implemented a novel applicability domain algorithm, which allows the user to estimate how well the model will perform on their data. Finally, we improved the implementation to speed up the overall system and provide new functionality via a plugin system.},
  note = {Project website: https://envipath.org},
  keywords = {applicability domain, biodegradation, bioinformatics, cheminformatics, computational sustainability, enviPath, linked data, machine learning, multi-label classification, Process-based modeling},
  pubstate = {published},
  tppubtype = {article}
}
Graffeuille, Olivier; Koh, Yun Sing; Wicker, Jörg; Lehmann, Moritz
Remote Sensing for Water Quality: A Multi-Task, Metadata-Driven Hypernetwork Approach Proceedings Article
In: Larson, Kate (Ed.): Proceedings of the Thirty-Third International Joint Conference on Artificial Intelligence (IJCAI-24), pp. 7287-7295, 2024, (AI for Good).
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: computational sustainability, machine learning, water quality
@inproceedings{graffeuille2024remote,
  title = {Remote Sensing for Water Quality: A Multi-Task, Metadata-Driven Hypernetwork Approach},
  author = {Olivier Graffeuille and Yun Sing Koh and J{\"o}rg Wicker and Moritz Lehmann},
  editor = {Kate Larson},
  doi = {10.24963/ijcai.2024/806},
  year = {2024},
  date = {2024-08-05},
  urldate = {2024-08-05},
  booktitle = {Proceedings of the Thirty-Third International Joint Conference on Artificial Intelligence (IJCAI-24)},
  pages = {7287--7295},
  abstract = {Inland water quality monitoring is vital for clean water access and aquatic ecosystem management. Remote sensing machine learning models enable large-scale observations, but are difficult to train due to data scarcity and variability across many lakes. Multi-task learning approaches enable learning of lake differences by learning multiple lake functions simultaneously. However, they suffer from a trade-off between parameter efficiency and the ability to model task differences flexibly, and struggle to model many diverse lakes with few samples per task. We propose Multi-Task Hypernetworks, a novel multi-task learning architecture which circumvents this trade-off using a shared hypernetwork to generate different network weights for each task from small task-specific embeddings. Our approach stands out from existing works by providing the added capacity to leverage task-level metadata, such as lake depth and temperature, explicitly. We show empirically that Multi-Task Hypernetworks outperform existing multi-task learning architectures for water quality remote sensing and other tabular data problems, and leverages metadata more effectively than existing methods.},
  note = {AI for Good},
  keywords = {computational sustainability, machine learning, water quality},
  pubstate = {published},
  tppubtype = {inproceedings}
}
Lyu, Jiachen; Dost, Katharina; Koh, Yun Sing; Wicker, Jörg
Regional Bias in Monolingual English Language Models Journal Article
In: Machine Learning, 2024, ISSN: 1573-0565.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: bias, large language models, machine learning, nlp, regional bias, reliable machine learning
@article{lyu2023regional,
  title = {Regional Bias in Monolingual {English} Language Models},
  author = {Jiachen Lyu and Katharina Dost and Yun Sing Koh and J{\"o}rg Wicker},
  url = {https://link.springer.com/article/10.1007/s10994-024-06555-6},
  doi = {10.1007/s10994-024-06555-6},
  issn = {1573-0565},
  year = {2024},
  date = {2024-07-09},
  urldate = {2024-07-09},
  journal = {Machine Learning},
  abstract = {In Natural Language Processing (NLP), pre-trained language models (LLMs) are widely employed and refined for various tasks. These models have shown considerable social and geographic biases creating skewed or even unfair representations of certain groups. Research focuses on biases toward L2 (English as a second language) regions but neglects bias within L1 (first language) regions. In this work, we ask if there is regional bias within L1 regions already inherent in pre-trained LLMs and, if so, what the consequences are in terms of downstream model performance. We contribute an investigation framework specifically tailored for low-resource regions, offering a method to identify bias without imposing strict requirements for labeled datasets. Our research reveals subtle geographic variations in the word embeddings of BERT, even in cultures traditionally perceived as similar. These nuanced features, once captured, have the potential to significantly impact downstream tasks. Generally, models exhibit comparable performance on datasets that share similarities, and conversely, performance may diverge when datasets differ in their nuanced features embedded within the language. It is crucial to note that estimating model performance solely based on standard benchmark datasets may not necessarily apply to the datasets with distinct features from the benchmark datasets. Our proposed framework plays a pivotal role in identifying and addressing biases detected in word embeddings, particularly evident in low-resource regions such as New Zealand.},
  note = {Preprint: https://dx.doi.org/10.21203/rs.3.rs-3713494/v1},
  keywords = {bias, large language models, machine learning, nlp, regional bias, reliable machine learning},
  pubstate = {published},
  tppubtype = {article}
}
Lorsbach, Tim; Wicker, Jörg
enviPath-python: v0.2.3 Miscellaneous
Zenodo, 2024.
Links | BibTeX | Altmetric | PlumX | Tags: biodegradation, cheminformatics, enviPath, machine learning
@misc{lorsbach2024envipath,
  title = {{enviPath-python}: v0.2.3},
  author = {Tim Lorsbach and J{\"o}rg Wicker},
  url = {https://github.com/enviPath/enviPath-python/tree/v0.2.3},
  doi = {10.5281/zenodo.10929408},
  year = {2024},
  date = {2024-04-05},
  urldate = {2024-04-05},
  howpublished = {Zenodo},
  keywords = {biodegradation, cheminformatics, enviPath, machine learning},
  pubstate = {published},
  tppubtype = {misc}
}
Chang, Xinglong; Brydon, Liam; Wicker, Jörg
Memento: v1.1.1 Miscellaneous
Zenodo, 2024.
Links | BibTeX | Altmetric | PlumX | Tags: machine learning, reliable machine learning
@misc{chang2024memento,
  title = {Memento: v1.1.1},
  author = {Xinglong Chang and Liam Brydon and J{\"o}rg Wicker},
  url = {https://github.com/wickerlab/memento/tree/v1.1.1},
  doi = {10.5281/zenodo.10929406},
  year = {2024},
  date = {2024-04-05},
  urldate = {2024-04-05},
  howpublished = {Zenodo},
  keywords = {machine learning, reliable machine learning},
  pubstate = {published},
  tppubtype = {misc}
}
Graffeuille, Olivier; Lehmann, Moritz; Allan, Matthew; Wicker, Jörg; Koh, Yun Sing
Lake by Lake, Globally: Enhancing Water Quality Remote Sensing with Multi-Task Learning Models Unpublished Forthcoming
Forthcoming, ISSN: 1556-5068.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: inland and coastal waters, machine learning, multi-task learning, remote sensing, water quality
@unpublished{graffeuille2024lake,
  title = {Lake by Lake, Globally: Enhancing Water Quality Remote Sensing with Multi-Task Learning Models},
  author = {Olivier Graffeuille and Moritz Lehmann and Matthew Allan and J{\"o}rg Wicker and Yun Sing Koh},
  doi = {10.2139/ssrn.4762429},
  issn = {1556-5068},
  year = {2024},
  date = {2024-03-17},
  urldate = {2024-03-17},
  abstract = {The estimation of water quality from satellite remote sensing data in inland and coastal waters is an important yet challenging problem. Recent collaborative efforts have produced large global datasets with sufficient data to train machine learning models with high accuracy. In this work, we investigate global water quality remote sensing models at the granularity of individual water bodies. We introduce Multi-Task Learning (MTL), a machine learning technique that learns a distinct model for each water body in the dataset from few data points by sharing knowledge between models. This approach allows MTL to learn water body differences, leading to more accurate predictions. We train and validate our model on the GLORIA dataset of in situ measured remote sensing reflectance and three water quality indicators: chlorophyll-$a$, total suspended solids and coloured dissolved organic matter. MTL outperforms other machine learning models by 8--31\% in Root Mean Squared Error (RMSE) and 12--34\% in Mean Absolute Percentage Error (MAPE). Training on a smaller dataset of chlorophyll-$a$ measurements from New Zealand lakes with simultaneous Sentinel-3 OLCI remote sensing reflectance further demonstrates the effectiveness of our model when applied regionally. Additionally, we investigate the performance of machine learning models at estimating the variation in water quality indicators within individual water bodies. Our results reveal that overall performance metrics overestimate the quality of model fit of models trained on a large number of water bodies due to the large between-water body variability of water quality indicators. In our experiments, when estimating TSS or CDOM, all models excluding multi-task learning fail to learn within-water body variability, and fail to outperform a naive baseline approach, suggesting that these models may be of limited usefulness to practitioners monitoring water quality. Overall, our research highlights the importance of considering water body differences in water quality remote sensing research for both model design and evaluation.},
  note = {SSRN preprint},
  keywords = {inland and coastal waters, machine learning, multi-task learning, remote sensing, water quality},
  pubstate = {forthcoming},
  tppubtype = {unpublished}
}
Kim, Jonathan; Urschler, Martin; Riddle, Pat; Wicker, Jörg
Attacking the Loop: Adversarial Attacks on Graph-based Loop Closure Detection Proceedings Article
In: Proceedings of the 19th International Joint Conference on Computer Vision, Imaging and Computer Graphics Theory and Applications, pp. 90-97, 2024.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: adversarial defence, adversarial learning, machine learning, SLAM
@inproceedings{kim2024attacking,
  title = {Attacking the Loop: Adversarial Attacks on Graph-based Loop Closure Detection},
  author = {Jonathan Kim and Martin Urschler and Pat Riddle and J{\"o}rg Wicker},
  url = {http://arxiv.org/abs/2312.06991},
  eprint = {2312.06991},
  eprinttype = {arXiv},
  doi = {10.5220/0012313100003660},
  year = {2024},
  date = {2024-02-27},
  urldate = {2024-02-27},
  booktitle = {Proceedings of the 19th International Joint Conference on Computer Vision, Imaging and Computer Graphics Theory and Applications},
  volume = {4},
  pages = {90--97},
  abstract = {With the advancement in robotics, it is becoming increasingly common for large factories and warehouses to incorporate visual SLAM (vSLAM) enabled automated robots that operate closely next to humans. This makes any adversarial attacks on vSLAM components potentially detrimental to humans working alongside them. Loop Closure Detection (LCD) is a crucial component in vSLAM that minimizes the accumulation of drift in mapping, since even a small drift can accumulate into a significant drift over time. Previous work by Kim et al. unified visual features and semantic objects into a single graph structure for finding loop closure candidates. While this provided a performance improvement over visual feature-based LCD, it also created a single point of vulnerability for potential graph-based adversarial attacks. Unlike previously reported visual-patch based attacks, small graph perturbations are far more challenging to detect, making them a more significant threat. In this paper, we present Adversarial-LCD, a novel black-box evasion attack framework that employs an eigencentrality-based perturbation method and an SVM-RBF surrogate model with a Weisfeiler-Lehman feature extractor for attacking graph-based LCD. Our evaluation shows that the attack performance of Adversarial-LCD was superior to that of other machine learning surrogate algorithms, including SVM-linear, SVM-polynomial, and Bayesian classifier, demonstrating the effectiveness of our attack framework. Furthermore, we show that our eigencentrality-based perturbation method outperforms other algorithms, such as Random-walk and Shortest-path, highlighting the efficiency of Adversarial-LCD's perturbation selection method.},
  keywords = {adversarial defence, adversarial learning, machine learning, SLAM},
  pubstate = {published},
  tppubtype = {inproceedings}
}
Long, Derek; Eade, Liam; Dost, Katharina; Meier-Menches, Samuel M; Goldstone, David C; Sullivan, Matthew P; Hartinger, Christian; Wicker, Jörg; Taskova, Katerina
AdductHunter: Identifying Protein-Metal Complex Adducts in Mass Spectra Journal Article
In: Journal of Cheminformatics, vol. 16, iss. 1, 2024, ISSN: 1758-2946.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: cheminformatics, computational sustainability, data mining, dynamic time warping, machine learning, mass spectrometry
@article{Long2023adducthunter,
  title = {{AdductHunter}: Identifying Protein-Metal Complex Adducts in Mass Spectra},
  author = {Derek Long and Liam Eade and Katharina Dost and Samuel M Meier-Menches and David C Goldstone and Matthew P Sullivan and Christian Hartinger and J{\"o}rg Wicker and Katerina Taskova},
  url = {https://adducthunter.wickerlab.org},
  doi = {10.1186/s13321-023-00797-7},
  issn = {1758-2946},
  year = {2024},
  date = {2024-02-06},
  urldate = {2024-02-06},
  journal = {Journal of Cheminformatics},
  volume = {16},
  number = {1},
  abstract = {Mass spectrometry (MS) is an analytical technique for molecule identification that can be used for investigating protein-metal complex interactions. Once the MS data is collected, the mass spectra are usually interpreted manually to identify the adducts formed as a result of the interactions between proteins and metal-based species. However, with increasing resolution, dataset size, and species complexity, the time required to identify adducts and the error-prone nature of manual assignment have become limiting factors in MS analysis. AdductHunter is an open-source web-based analysis tool that automates the peak identification process using constraint integer optimization to find feasible combinations of protein and fragments, and dynamic time warping to calculate the dissimilarity between the theoretical isotope pattern of a species and its experimental isotope peak distribution. Empirical evaluation on a collection of 22 unique MS datasets shows fast and accurate identification of protein-metal complex adducts in deconvoluted mass spectra.},
  note = {Preprint: https://doi.org/10.21203/rs.3.rs-3322854/v1},
  keywords = {cheminformatics, computational sustainability, data mining, dynamic time warping, machine learning, mass spectrometry},
  pubstate = {published},
  tppubtype = {article}
}
2023
Dost, Katharina; Tam, Jason; Lorsbach, Tim; Schmidt, Sebastian; Wicker, Jörg
Defining Applicability Domain in Biodegradation Pathway Prediction Unpublished Forthcoming
Forthcoming.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: applicability domain, biodegradation, cheminformatics, computational sustainability, enviPath, machine learning, metabolic pathways, reliable machine learning
@unpublished{dost2023defining,
title = {Defining Applicability Domain in Biodegradation Pathway Prediction},
author = {Katharina Dost and Jason Tam and Tim Lorsbach and Sebastian Schmidt and J{\"o}rg Wicker},
doi = {10.21203/rs.3.rs-3587632/v1},
year = {2023},
date = {2023-11-10},
urldate = {2023-11-10},
abstract = {When developing a new chemical, investigating its long-term influences on the environment is crucial to prevent harm. Unfortunately, these experiments are time-consuming. In silico methods can learn from already obtained data to predict biotransformation pathways, and thereby help focus all development efforts on only the most promising chemicals. As all data-based models, these predictors will output pathway predictions for all input compounds in a suitable format, however, these predictions will be faulty unless the model has seen similar compounds during the training process. A common approach to prevent this for other types of models is to define an Applicability Domain for the model that makes predictions only for in-domain compounds and rejects out-of-domain ones. Nonetheless, although exploration of the compound space is particularly interesting in the development of new chemicals, no Applicability Domain method has been tailored to the specific data structure of pathway predictions yet. In this paper, we are the first to define Applicability Domain specialized in biodegradation pathway prediction. Assessing a model’s reliability from different angles, we suggest a three-stage approach that checks for applicability, reliability, and decidability of the model for a queried compound and only allows it to output a prediction if all three stages are passed. Experiments confirm that our proposed technique reliably rejects unsuitable compounds and therefore improves the safety of the biotransformation pathway predictor.},
note = {Forthcoming},
keywords = {applicability domain, biodegradation, cheminformatics, computational sustainability, enviPath, machine learning, metabolic pathways, reliable machine learning},
pubstate = {forthcoming},
tppubtype = {unpublished}
}
Chang, Xinglong; Dost, Katharina; Dobbie, Gillian; Wicker, Jörg
Poison is Not Traceless: Fully-Agnostic Detection of Poisoning Attacks Unpublished Forthcoming
Forthcoming.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: adversarial defence, adversarial learning, machine learning, reliable machine learning
@unpublished{Chang2023poison,
title = {Poison is Not Traceless: Fully-Agnostic Detection of Poisoning Attacks},
author = {Xinglong Chang and Katharina Dost and Gillian Dobbie and J{\"o}rg Wicker},
url = {https://arxiv.org/abs/2310.16224},
doi = {10.48550/arXiv.2310.16224},
eprint = {2310.16224},
eprinttype = {arXiv},
year = {2023},
date = {2023-10-23},
urldate = {2023-10-23},
abstract = {The performance of machine learning models depends on the quality of the underlying data. Malicious actors can attack the model by poisoning the training data. Current detectors are tied to either specific data types, models, or attacks, and therefore have limited applicability in real-world scenarios. This paper presents a novel fully-agnostic framework, Diva (Detecting InVisible Attacks), that detects attacks solely relying on analyzing the potentially poisoned data set. Diva is based on the idea that poisoning attacks can be detected by comparing the classifier’s accuracy on poisoned and clean data and pre-trains a meta-learner using Complexity Measures to estimate the otherwise unknown accuracy on a hypothetical clean dataset. The framework applies to generic poisoning attacks. For evaluation purposes, in this paper, we test Diva on label-flipping attacks.},
note = {Forthcoming},
keywords = {adversarial defence, adversarial learning, machine learning, reliable machine learning},
pubstate = {forthcoming},
tppubtype = {unpublished}
}
Chang, Xinglong; Dobbie, Gillian; Wicker, Jörg
Fast Adversarial Label-Flipping Attack on Tabular Data Unpublished Forthcoming
Forthcoming.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: adversarial learning, machine learning, reliable machine learning
@unpublished{Chang2023fast,
title = {Fast Adversarial Label-Flipping Attack on Tabular Data},
author = {Xinglong Chang and Gillian Dobbie and J{\"o}rg Wicker},
url = {https://arxiv.org/abs/2310.10744},
doi = {10.48550/arXiv.2310.10744},
eprint = {2310.10744},
eprinttype = {arXiv},
year = {2023},
date = {2023-10-16},
urldate = {2023-10-16},
abstract = {Machine learning models are increasingly used in fields that require high reliability such as cybersecurity. However, these models remain vulnerable to various attacks, among which the adversarial label-flipping attack poses significant threats. In label-flipping attacks, the adversary maliciously flips a portion of training labels to compromise the machine learning model. This paper raises significant concerns as these attacks can camouflage a highly skewed dataset as an easily solvable classification problem, often misleading machine learning practitioners into lower defenses and miscalculations of potential risks. This concern amplifies in tabular data settings, where identifying true labels requires expertise, allowing malicious label-flipping attacks to easily slip under the radar. To demonstrate this risk is inherited in the adversary's objective, we propose FALFA (Fast Adversarial Label-Flipping Attack), a novel efficient attack for crafting adversarial labels. FALFA is based on transforming the adversary's objective and employs linear programming to reduce computational complexity. Using ten real-world tabular datasets, we demonstrate FALFA's superior attack potential, highlighting the need for robust defenses against such threats.},
note = {Forthcoming},
keywords = {adversarial learning, machine learning, reliable machine learning},
pubstate = {forthcoming},
tppubtype = {unpublished}
}
Chang, Luke; Dost, Katharina; Zhai, Kaiqi; Demontis, Ambra; Roli, Fabio; Dobbie, Gillian; Wicker, Jörg
BAARD: Blocking Adversarial Examples by Testing for Applicability, Reliability and Decidability Proceedings Article
In: Kashima, Hisashi; Ide, Tsuyoshi; Peng, Wen-Chih (Ed.): The 27th Pacific-Asia Conference on Knowledge Discovery and Data Mining (PAKDD), pp. 3-14, Springer Nature Switzerland, Cham, 2023, ISBN: 978-3-031-33374-3.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: adversarial defence, adversarial learning, applicability domain, cheminformatics, evasion attacks, machine learning
@inproceedings{chang2021baard,
title = {{BAARD}: Blocking Adversarial Examples by Testing for Applicability, Reliability and Decidability},
author = {Luke Chang and Katharina Dost and Kaiqi Zhai and Ambra Demontis and Fabio Roli and Gillian Dobbie and J{\"o}rg Wicker},
editor = {Hisashi Kashima and Tsuyoshi Ide and Wen-Chih Peng},
url = {https://github.com/wickerlab/baard},
doi = {10.1007/978-3-031-33374-3_1},
eprint = {2105.00495},
eprinttype = {arXiv},
isbn = {978-3-031-33374-3},
year = {2023},
date = {2023-05-27},
urldate = {2023-05-27},
booktitle = {The 27th Pacific-Asia Conference on Knowledge Discovery and Data Mining (PAKDD)},
pages = {3--14},
publisher = {Springer Nature Switzerland},
address = {Cham},
abstract = {Adversarial defenses protect machine learning models from adversarial attacks, but are often tailored to one type of model or attack. The lack of information on unknown potential attacks makes detecting adversarial examples challenging. Additionally, attackers do not need to follow the rules made by the defender. To address this problem, we take inspiration from the concept of Applicability Domain in cheminformatics. Cheminformatics models struggle to make accurate predictions because only a limited number of compounds are known and available for training. Applicability Domain defines a domain based on the known compounds and rejects any unknown compound that falls outside the domain. Similarly, adversarial examples start as harmless inputs, but can be manipulated to evade reliable classification by moving outside the domain of the classifier. We are the first to identify the similarity between Applicability Domain and adversarial detection. Instead of focusing on unknown attacks, we focus on what is known, the training data. We propose a simple yet robust triple-stage data-driven framework that checks the input globally and locally, and confirms that they are coherent with the model’s output. This framework can be applied to any classification model and is not limited to specific attacks. We demonstrate these three stages work as one unit, effectively detecting various attacks, even for a white-box scenario.},
keywords = {adversarial defence, adversarial learning, applicability domain, cheminformatics, evasion attacks, machine learning},
pubstate = {published},
tppubtype = {inproceedings}
}
Chen, Zeyu; Dost, Katharina; Zhu, Xuan; Chang, Xinglong; Dobbie, Gillian; Wicker, Jörg
Targeted Attacks on Time Series Forecasting Proceedings Article
In: Kashima, Hisashi; Ide, Tsuyoshi; Peng, Wen-Chih (Ed.): The 27th Pacific-Asia Conference on Knowledge Discovery and Data Mining (PAKDD), pp. 314-327, Springer Nature Switzerland, Cham, 2023, ISBN: 978-3-031-33383-5.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: adversarial learning, forecasting, machine learning, time series
@inproceedings{Chen2023targeted,
title = {Targeted Attacks on Time Series Forecasting},
author = {Zeyu Chen and Katharina Dost and Xuan Zhu and Xinglong Chang and Gillian Dobbie and J{\"o}rg Wicker},
editor = {Hisashi Kashima and Tsuyoshi Ide and Wen-Chih Peng},
url = {https://github.com/wickerlab/nvita},
doi = {10.1007/978-3-031-33383-5_25},
isbn = {978-3-031-33383-5},
year = {2023},
date = {2023-05-26},
urldate = {2023-05-26},
booktitle = {The 27th Pacific-Asia Conference on Knowledge Discovery and Data Mining (PAKDD)},
pages = {314--327},
publisher = {Springer Nature Switzerland},
address = {Cham},
abstract = {Time Series Forecasting (TSF) is well established in domains dealing with temporal data to predict future events yielding the basis for strategic decision-making. Previous research indicated that forecasting models are vulnerable to adversarial attacks, that is, maliciously crafted perturbations of the original data with the goal of altering the model’s predictions. However, attackers targeting specific outcomes pose a substantially more severe threat as they could manipulate the model and bend it to their needs. Regardless, there is no systematic approach for targeted adversarial learning in the TSF domain yet. In this paper, we introduce targeted attacks on TSF in a systematic manner. We establish a new experimental design standard regarding attack goals and perturbation control for targeted adversarial learning on TSF. For this purpose, we present a novel indirect sparse black-box evasion attack on TSF, nVita. Additionally, we adapt the popular white-box attacks Fast Gradient Sign Method (FGSM) and Basic Iterative Method (BIM). Our experiments confirm not only that all three methods are effective but also that current state-of-the-art TSF models are indeed susceptible to attacks. These results motivate future research in this area to achieve higher reliability of forecasting models.},
keywords = {adversarial learning, forecasting, machine learning, time series},
pubstate = {published},
tppubtype = {inproceedings}
}
Wicker, Jörg; Krauter, Nicolas; Derstorff, Bettina; Stönner, Christof; Bourtsoukidis, Efstratios; Klüpfel, Thomas; Williams, Jonathan; Kramer, Stefan
Cinema Experiments 2013 Miscellaneous
2023.
Links | BibTeX | Altmetric | PlumX | Tags: atmospheric chemistry, cinema data mining, data mining, machine learning, smell of fear, sof
@misc{Wicker2023cinema,
title = {Cinema Experiments 2013},
author = {J{\"o}rg Wicker and Nicolas Krauter and Bettina Derstorff and Christof St{\"o}nner and Efstratios Bourtsoukidis and Thomas Kl{\"u}pfel and Jonathan Williams and Stefan Kramer},
url = {https://auckland.figshare.com/articles/dataset/Cinema_Experiments_2013/22777364},
doi = {10.17608/k6.auckland.22777364.v3},
year = {2023},
date = {2023-05-23},
keywords = {atmospheric chemistry, cinema data mining, data mining, machine learning, smell of fear, sof},
pubstate = {published},
tppubtype = {misc}
}
Stönner, Christof; Edtbauer, Achim; Derstorff, Bettina; Bourtsoukidis, Efstratios; Klüpfel, Thomas; Wicker, Jörg; Williams, Jonathan
Cinema Experiments 2015 Miscellaneous
2023.
Links | BibTeX | Altmetric | PlumX | Tags: cinema data mining, data mining, machine learning, smell of fear, sof
@misc{Stoenner2023cinema,
title = {Cinema Experiments 2015},
author = {Christof St{\"o}nner and Achim Edtbauer and Bettina Derstorff and Efstratios Bourtsoukidis and Thomas Kl{\"u}pfel and J{\"o}rg Wicker and Jonathan Williams},
url = {https://auckland.figshare.com/articles/dataset/Cinema_Experiments_2015/22777352},
doi = {10.17608/k6.auckland.22777352.v2},
year = {2023},
date = {2023-05-23},
keywords = {cinema data mining, data mining, machine learning, smell of fear, sof},
pubstate = {published},
tppubtype = {misc}
}
Dost, Katharina; Pullar-Strecker, Zac; Brydon, Liam; Zhang, Kunyang; Hafner, Jasmin; Riddle, Pat; Wicker, Jörg
Combatting over-specialization bias in growing chemical databases Journal Article
In: Journal of Cheminformatics, vol. 15, iss. 1, pp. 53, 2023, ISSN: 1758-2946.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: bias, biodegradation, cheminformatics, computational sustainability, data mining, enviPath, machine learning, metabolic pathways, multi-label classification, reliable machine learning
@article{Dost2023Combatting,
title = {Combatting over-specialization bias in growing chemical databases},
author = {Katharina Dost and Zac Pullar-Strecker and Liam Brydon and Kunyang Zhang and Jasmin Hafner and Pat Riddle and J{\"o}rg Wicker},
url = {https://jcheminf.biomedcentral.com/articles/10.1186/s13321-023-00716-w},
doi = {10.1186/s13321-023-00716-w},
issn = {1758-2946},
year = {2023},
date = {2023-05-19},
urldate = {2023-05-19},
journal = {Journal of Cheminformatics},
volume = {15},
number = {1},
pages = {53},
abstract = {Background
Predicting in advance the behavior of new chemical compounds can support the design process of new products by directing the research toward the most promising candidates and ruling out others. Such predictive models can be data-driven using Machine Learning or based on researchers’ experience and depend on the collection of past results. In either case: models (or researchers) can only make reliable assumptions about compounds that are similar to what they have seen before. Therefore, consequent usage of these predictive models shapes the dataset and causes a continuous specialization shrinking the applicability domain of all trained models on this dataset in the future, and increasingly harming model-based exploration of the space.
Proposed solution
In this paper, we propose cancels (CounterActiNg Compound spEciaLization biaS), a technique that helps to break the dataset specialization spiral. Aiming for a smooth distribution of the compounds in the dataset, we identify areas in the space that fall short and suggest additional experiments that help bridge the gap. Thereby, we generally improve the dataset quality in an entirely unsupervised manner and create awareness of potential flaws in the data. cancels does not aim to cover the entire compound space and hence retains a desirable degree of specialization to a specified research domain.
Results
An extensive set of experiments on the use-case of biodegradation pathway prediction not only reveals that the bias spiral can indeed be observed but also that cancels produces meaningful results. Additionally, we demonstrate that mitigating the observed bias is crucial as it cannot only intervene with the continuous specialization process, but also significantly improves a predictor’s performance while reducing the number of required experiments. Overall, we believe that cancels can support researchers in their experimentation process to not only better understand their data and potential flaws, but also to grow the dataset in a sustainable way. All code is available under github.com/KatDost/Cancels.},
keywords = {bias, biodegradation, cheminformatics, computational sustainability, data mining, enviPath, machine learning, metabolic pathways, multi-label classification, reliable machine learning},
pubstate = {published},
tppubtype = {article}
}
Predicting in advance the behavior of new chemical compounds can support the design process of new products by directing the research toward the most promising candidates and ruling out others. Such predictive models can be data-driven using Machine Learning or based on researchers’ experience and depend on the collection of past results. In either case: models (or researchers) can only make reliable assumptions about compounds that are similar to what they have seen before. Therefore, consequent usage of these predictive models shapes the dataset and causes a continuous specialization shrinking the applicability domain of all trained models on this dataset in the future, and increasingly harming model-based exploration of the space.
Proposed solution
In this paper, we propose cancels (CounterActiNg Compound spEciaLization biaS), a technique that helps to break the dataset specialization spiral. Aiming for a smooth distribution of the compounds in the dataset, we identify areas in the space that fall short and suggest additional experiments that help bridge the gap. Thereby, we generally improve the dataset quality in an entirely unsupervised manner and create awareness of potential flaws in the data. cancels does not aim to cover the entire compound space and hence retains a desirable degree of specialization to a specified research domain.
Results
An extensive set of experiments on the use-case of biodegradation pathway prediction not only reveals that the bias spiral can indeed be observed but also that cancels produces meaningful results. Additionally, we demonstrate that mitigating the observed bias is crucial as it cannot only intervene with the continuous specialization process, but also significantly improves a predictor’s performance while reducing the number of required experiments. Overall, we believe that cancels can support researchers in their experimentation process to not only better understand their data and potential flaws, but also to grow the dataset in a sustainable way. All code is available under github.com/KatDost/Cancels.
Bensemann, Joshua; Cheena, Hasnain; Huang, David Tse Jung; Broadbent, Elizabeth; Williams, Jonathan; Wicker, Jörg
From What You See to What We Smell: Linking Human Emotions to Bio-markers in Breath Journal Article
In: IEEE Transactions on Affective Computing, pp. 1-13, 2023, ISSN: 1949-3045.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: biomarkers, breath analysis, cheminformatics, cinema data mining, emotional response analysis, machine learning, smell of fear
@article{bensemann2023from,
title = {From What You See to What We Smell: Linking Human Emotions to Bio-markers in Breath},
author = {Joshua Bensemann and Hasnain Cheena and David Tse Jung Huang and Elizabeth Broadbent and Jonathan Williams and J{\"o}rg Wicker},
url = {https://ieeexplore.ieee.org/document/10123109},
doi = {10.1109/TAFFC.2023.3275216},
issn = {1949-3045},
year = {2023},
date = {2023-05-11},
urldate = {2023-05-11},
journal = {IEEE Transactions on Affective Computing},
pages = {1--13},
abstract = {Research has shown that the composition of breath can differ based on the human’s behavioral patterns and mental and physical states immediately before being collected. These breath-collection techniques have also been extended to observe the general processes occurring in groups of humans and can link them to what those groups are collectively experiencing. In this research, we applied machine learning techniques to the breath data collected from cinema audiences. These techniques included XGBOOST Regression, Hierarchical Clustering, and Item Basket analyses created using the Apriori algorithm. They were conducted to find associations between the biomarkers in the crowd’s breath and the movie’s audio-visual stimuli and thematic events. This analysis enabled us to directly link what the group was experiencing and their biological response to that experience. We first extracted visual and auditory features from a movie to achieve this. We compared it to the biomarkers in the crowd’s breath using regression and pattern mining techniques. Our results supported the theory that a crowd’s collective experience directly correlates to the biomarkers in the crowd’s breath. Consequently, these findings suggest that visual and auditory experiences have predictable effects on the human body that can be monitored without requiring expensive or invasive neuroimaging techniques.},
note = {Datasets: doi:10.17608/k6.auckland.22777364 and doi:10.17608/k6.auckland.22777352},
keywords = {biomarkers, breath analysis, cheminformatics, cinema data mining, emotional response analysis, machine learning, smell of fear},
pubstate = {published},
tppubtype = {article}
}
body that can be monitored without requiring expensive or invasive neuroimaging techniques.
Roeslin, Samuel; Ma, Quincy; Chigullapally, Pavan; Wicker, Jörg; Wotherspoon, Liam
Development of a Seismic Loss Prediction Model for Residential Buildings using Machine Learning – Christchurch, New Zealand Journal Article
In: Natural Hazards and Earth System Sciences, vol. 23, no. 3, pp. 1207-1226, 2023.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: computational sustainability, earthquakes, machine learning
@article{Roeslin2023development,
title = {Development of a Seismic Loss Prediction Model for Residential Buildings using Machine Learning {\textendash} {Christchurch}, {New Zealand}},
author = {Samuel Roeslin and Quincy Ma and Pavan Chigullapally and J{\"o}rg Wicker and Liam Wotherspoon},
url = {https://nhess.copernicus.org/articles/23/1207/2023/},
doi = {10.5194/nhess-23-1207-2023},
year = {2023},
date = {2023-03-22},
urldate = {2023-03-22},
journal = {Natural Hazards and Earth System Sciences},
volume = {23},
number = {3},
pages = {1207--1226},
abstract = {This paper presents a new framework for the seismic loss prediction of residential buildings in Christchurch, New Zealand. It employs data science techniques, geospatial tools, and machine learning (ML) trained on insurance claims data from the Earthquake Commission (EQC) collected following the 2010--2011 Canterbury Earthquake Sequence (CES). The seismic loss prediction obtained from the ML model is shown to outperform the output from existing risk analysis tools for New Zealand for each of the main earthquakes of the CES. In addition to the prediction capabilities, the ML model delivered useful insights into the most important features contributing to losses during the CES. ML correctly highlighted that liquefaction significantly influenced buildings losses for the 22 February 2011 earthquake. The results are consistent with observations, engineering knowledge, and previous studies, confirming the potential of data science and ML in the analysis of insurance claims data and the development of seismic loss prediction models using empirical loss data.},
keywords = {computational sustainability, earthquakes, machine learning},
pubstate = {published},
tppubtype = {article}
}
2022
Kim, Jonathan; Urschler, Martin; Riddle, Pat; Wicker, Jörg
Closing the Loop: Graph Networks to Unify Semantic Objects and Visual Features for Multi-object Scenes Proceedings Article
In: 2022 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS 2022), pp. 4352-4358, 2022, ISBN: 978-1-6654-7927-1.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: machine learning, SLAM
@inproceedings{Kim2022closing,
title = {Closing the Loop: Graph Networks to Unify Semantic Objects and Visual Features for Multi-object Scenes},
author = {Jonathan Kim and Martin Urschler and Pat Riddle and J{\"o}rg Wicker},
url = {https://ieeexplore.ieee.org/abstract/document/9981542},
doi = {10.1109/IROS47612.2022.9981542},
isbn = {978-1-6654-7927-1},
year = {2022},
date = {2022-10-20},
urldate = {2022-10-20},
booktitle = {2022 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS 2022)},
pages = {4352--4358},
abstract = {In Simultaneous Localization and Mapping (SLAM), Loop Closure Detection (LCD) is essential to minimize drift when recognizing previously visited places. Visual Bag-of-Words (vBoW) has been an LCD algorithm of choice for many state-of-the-art SLAM systems. It uses a set of visual features to provide robust place recognition but fails to perceive the semantics or spatial relationship between feature points. Previous work has mainly focused on addressing these issues by combining vBoW with semantic and spatial information from objects in the scene. However, they are unable to exploit spatial information of local visual features and lack a structure that unifies semantic objects and visual features, therefore limiting the symbiosis between the two components. This paper proposes SymbioLCD2, which creates a unified graph structure to integrate semantic objects and visual features symbiotically. Our novel graph-based LCD system utilizes the unified graph structure by applying a Weisfeiler-Lehman graph kernel with temporal constraints to robustly predict loop closure candidates. Evaluation of the proposed system shows that having a unified graph structure incorporating semantic objects and visual features improves LCD prediction accuracy, illustrating that the proposed graph structure provides a strong symbiosis between these two complementary components. It also outperforms other Machine Learning algorithms - such as SVM, Decision Tree, Random Forest, Neural Network and GNN based Graph Matching Networks. Furthermore, it has shown good performance in detecting loop closure candidates earlier than state-of-the-art SLAM systems, demonstrating that extended semantic and spatial awareness from the unified graph structure significantly impacts LCD performance.},
keywords = {machine learning, SLAM},
pubstate = {published},
tppubtype = {inproceedings}
}
Pullar-Strecker, Zac; Dost, Katharina; Frank, Eibe; Wicker, Jörg
Hitting the Target: Stopping Active Learning at the Cost-Based Optimum Journal Article
In: Machine Learning, 2022, ISSN: 1573-0565.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: active learning, data labelling, machine learning, stopping criteria
@article{Pullar-Strecker2022hitting,
  author     = {Zac Pullar-Strecker and Katharina Dost and Eibe Frank and J\"{o}rg Wicker},
  title      = {Hitting the Target: Stopping Active Learning at the Cost-Based Optimum},
  journal    = {Machine Learning},
  editor     = {Yu-Feng Li and Prateek Jain},
  year       = {2022},
  date       = {2022-10-14},
  urldate    = {2022-10-14},
  doi        = {10.1007/s10994-022-06253-1},
  issn       = {1573-0565},
  url        = {https://arxiv.org/abs/2110.03802},
  abstract   = {Active learning allows machine learning models to be trained using fewer labels while retaining similar performance to traditional supervised learning. An active learner selects the most informative data points, requests their labels, and retrains itself. While this approach is promising, it raises the question of how to determine when the model is ‘good enough’ without the additional labels required for traditional evaluation. Previously, different stopping criteria have been proposed aiming to identify the optimal stopping point. Yet, optimality can only be expressed as a domain-dependent trade-off between accuracy and the number of labels, and no criterion is superior in all applications. As a further complication, a comparison of criteria for a particular real-world application would require practitioners to collect additional labelled data they are aiming to avoid by using active learning in the first place. This work enables practitioners to employ active learning by providing actionable recommendations for which stopping criteria are best for a given real-world scenario. We contribute the first large-scale comparison of stopping criteria for pool-based active learning, using a cost measure to quantify the accuracy/label trade-off, public implementations of all stopping criteria we evaluate, and an open-source framework for evaluating stopping criteria. Our research enables practitioners to substantially reduce labeling costs by utilizing the stopping criterion which best suits their domain.},
  keywords   = {active learning, data labelling, machine learning, stopping criteria},
  pubstate   = {published},
  tppubtype  = {article}
}
Poonawala-Lohani, Nooriyan; Riddle, Pat; Adnan, Mehnaz; Wicker, Jörg
Geographic Ensembles of Observations using Randomised Ensembles of Autoregression Chains: Ensemble methods for spatio-temporal Time Series Forecasting of Influenza-like Illness Proceedings Article
In: pp. 1-7, Association for Computing Machinery, New York, NY, USA, 2022, ISBN: 9781450393867.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: bioinformatics, computational sustainability, dynamic time warping, forecasting, influenza, machine learning, medicine, time series
@inproceedings{Poonawala-Lohani2022geographic,
title = {Geographic Ensembles of Observations using Randomised Ensembles of Autoregression Chains: Ensemble methods for spatio-temporal Time Series Forecasting of Influenza-like Illness},
author = {Nooriyan Poonawala-Lohani and Pat Riddle and Mehnaz Adnan and J{\"o}rg Wicker},
doi = {10.1145/3535508.3545562},
isbn = {9781450393867},
year = {2022},
date = {2022-08-07},
pages = {1--7},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
abstract = {Influenza is a communicable respiratory illness that can cause serious public health hazards. Flu surveillance in New Zealand tracks case counts from various District health boards (DHBs) in the country to monitor the spread of influenza in different geographic locations. Many factors contribute to the spread of the influenza across a geographic region, and it can be challenging to forecast cases in one region without taking into account case numbers in another region. This paper proposes a novel ensemble method called Geographic Ensembles of Observations using Randomised Ensembles of Autoregression Chains (GEO-Reach). GEO-Reach is an ensemble technique that uses a two layer approach to utilise interdependence of historical case counts between geographic regions in New Zealand. This work extends a previously published method by the authors called Randomized Ensembles of Auto-regression chains (Reach). State-of-the-art forecasting models look at studying the spread of the virus. They focus on accurate forecasting of cases for a location using historical case counts for the same location and other data sources based on human behaviour such as movement of people across cities/geographic regions. This new approach is evaluated using Influenza like illness (ILI) case counts in 7 major regions in New Zealand from the years 2015-2019 and compares its performance with other standard methods such as Dante, ARIMA, Autoregression and Random Forests. The results demonstrate that the proposed method performed better than baseline methods when applied to this multi-variate time series forecasting problem.},
internal-note = {NOTE(review): the booktitle field required for inproceedings is missing from this record; look up the proceedings title via the DOI and add it},
keywords = {bioinformatics, computational sustainability, dynamic time warping, forecasting, influenza, machine learning, medicine, time series},
pubstate = {published},
tppubtype = {inproceedings}
}
Graffeuille, Olivier; Koh, Yun Sing; Wicker, Jörg; Lehmann, Moritz
Semi-Supervised Conditional Density Estimation with Wasserstein Laplacian Regularisation Proceedings Article
In: Proceedings of the Thirty-Sixth AAAI Conference on Artificial Intelligence, pp. 6746-6754, 2022.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: classification, computational sustainability, machine learning, semi-supervised learning
@inproceedings{graffeuille2022semi,
title = {Semi-Supervised Conditional Density Estimation with {Wasserstein} {Laplacian} Regularisation},
author = {Olivier Graffeuille and Yun Sing Koh and J\"{o}rg Wicker and Moritz Lehmann},
url = {https://ojs.aaai.org/index.php/AAAI/article/view/20630},
doi = {10.1609/aaai.v36i6.20630},
year = {2022},
date = {2022-06-28},
urldate = {2022-06-28},
booktitle = {Proceedings of the Thirty-Sixth AAAI Conference on Artificial Intelligence},
volume = {36},
number = {6},
pages = {6746--6754},
abstract = {Conditional Density Estimation (CDE) has wide-reaching applicability to various real-world problems, such as spatial density estimation and environmental modelling. CDE estimates the probability density of a random variable rather than a single value and can thus model uncertainty and inverse problems. This task is inherently more complex than regression, and many algorithms suffer from overfitting, particularly when modelled with few labelled data points. For applications where unlabelled data is abundant but labelled data is scarce, we propose Wasserstein Laplacian Regularisation, a semi-supervised learning framework that allows CDE algorithms to leverage these unlabelled data. The framework minimises an objective function which ensures that the learned model is smooth along the manifold of the underlying data, as measured by Wasserstein distance. When applying our framework to Mixture Density Networks, the resulting semi-supervised algorithm can achieve similar performance to a supervised model with up to three times as many labelled data points on baseline datasets. We additionally apply our technique to the problem of remote sensing for chlorophyll-a estimation in inland waters.},
keywords = {classification, computational sustainability, machine learning, semi-supervised learning},
pubstate = {published},
tppubtype = {inproceedings}
}
Dost, Katharina; Duncanson, Hamish; Ziogas, Ioannis; Riddle, Pat; Wicker, Jörg
Divide and Imitate: Multi-Cluster Identification and Mitigation of Selection Bias Proceedings Article
In: 26th Pacific-Asia Conference on Knowledge Discovery and Data Mining (PAKDD2022), pp. 149-160, Springer-Verlag, Berlin, Heidelberg, 2022, ISBN: 978-3-031-05935-3.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: bias, clustering, machine learning
@inproceedings{dost2022divide,
title = {Divide and Imitate: Multi-Cluster Identification and Mitigation of Selection Bias},
author = {Katharina Dost and Hamish Duncanson and Ioannis Ziogas and Pat Riddle and J\"{o}rg Wicker},
url = {https://link.springer.com/chapter/10.1007/978-3-031-05936-0_12
https://github.com/KatDost/Mimic
https://pypi.org/project/imitatebias},
doi = {10.1007/978-3-031-05936-0_12},
isbn = {978-3-031-05935-3},
year = {2022},
date = {2022-05-16},
urldate = {2022-05-16},
booktitle = {26th Pacific-Asia Conference on Knowledge Discovery and Data Mining (PAKDD2022)},
pages = {149--160},
publisher = {Springer-Verlag},
address = {Berlin, Heidelberg},
abstract = {Machine Learning can help overcome human biases in decision making by focusing on purely logical conclusions based on the training data. If the training data is biased, however, that bias will be transferred to the model and remains undetected as the performance is validated on a test set drawn from the same biased distribution. Existing strategies for selection bias identification and mitigation generally rely on some sort of knowledge of the bias or the ground-truth. An exception is the Imitate algorithm that assumes no knowledge but comes with a strong limitation: It can only model datasets with one normally distributed cluster per class. In this paper, we introduce a novel algorithm, Mimic, which uses Imitate as a building block but relaxes this limitation. By allowing mixtures of multivariate Gaussians, our technique is able to model multi-cluster datasets and provide solutions for a substantially wider set of problems. Experiments confirm that Mimic not only identifies potential biases in multi-cluster datasets which can be corrected early on but also improves classifier performance.},
keywords = {bias, clustering, machine learning},
pubstate = {published},
tppubtype = {inproceedings}
}
Kim, Jonathan; Urschler, Martin; Riddle, Pat; Wicker, Jörg
SymbioLCD – Datasets Miscellaneous
data set, 2022.
Links | BibTeX | Altmetric | PlumX | Tags: machine learning, SLAM
@misc{Kim2022,
title = {{SymbioLCD} - Datasets},
author = {Jonathan Kim and Martin Urschler and Pat Riddle and J\"{o}rg Wicker},
url = {https://auckland.figshare.com/articles/dataset/SymbioLCD_-_Datasets/14958228},
doi = {10.17608/k6.auckland.14958228.v1},
year = {2022},
date = {2022-01-18},
urldate = {2022-01-18},
howpublished = {data set},
keywords = {machine learning, SLAM},
pubstate = {published},
tppubtype = {misc}
}
Poonawala-Lohani, Nooriyan; Riddle, Pat; Adnan, Mehnaz; Wicker, Jörg
A Novel Approach for Time Series Forecasting of Influenza-like Illness Using a Regression Chain Method Proceedings Article
In: Altman, Russ; Dunker, Keith; Hunter, Lawrence; Ritchie, Marylyn; Murray, Tiffany; Klein, Teri (Ed.): Pacific Symposium on Biocomputing, pp. 301-312, 2022.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: computational sustainability, forecasting, influenza, machine learning, time series
@inproceedings{poonawala-lohani2022novel,
title = {A Novel Approach for Time Series Forecasting of Influenza-like Illness Using a Regression Chain Method},
author = {Nooriyan Poonawala-Lohani and Pat Riddle and Mehnaz Adnan and J\"{o}rg Wicker},
editor = {Russ Altman and Keith Dunker and Lawrence Hunter and Marylyn Ritchie and Tiffany Murray and Teri Klein},
url = {https://www.worldscientific.com/doi/abs/10.1142/9789811250477_0028
http://psb.stanford.edu/psb-online/proceedings/psb22/poorawala-lohani.pdf},
doi = {10.1142/9789811250477_0028},
year = {2022},
date = {2022-01-03},
urldate = {2022-01-03},
booktitle = {Pacific Symposium on Biocomputing},
volume = {27},
pages = {301--312},
abstract = {Influenza is a communicable respiratory illness that can cause serious public health hazards. Due to its huge threat to the community, accurate forecasting of Influenza-like-illness (ILI) can diminish the impact of an influenza season by enabling early public health interventions. Current forecasting models are limited in their performance, particularly when using a longer forecasting window. To support better forecasts over a longer forecasting window, we propose to use additional features such as weather data. Commonly used methods to forecast ILI, including statistical methods such as ARIMA, limit prediction performance when using additional data sources that might have complex non-linear associations with ILI incidence. This paper proposes a novel time series forecasting method, Randomized Ensembles of Auto-regression chains (Reach). Reach implements an ensemble of random chains for multi-step time series forecasting. This new approach is evaluated on ILI case counts in Auckland, New Zealand from the years 2015-2018 and compared to other standard methods. The results demonstrate that the proposed method performed better than baseline methods when applied to this multi-variate time series forecasting problem.},
keywords = {computational sustainability, forecasting, influenza, machine learning, time series},
pubstate = {published},
tppubtype = {inproceedings}
}
2021
Kim, Jonathan; Urschler, Martin; Riddle, Pat; Wicker, Jörg
SymbioLCD: Ensemble-Based Loop Closure Detection using CNN-Extracted Objects and Visual Bag-of-Words Proceedings Article
In: 2021 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS), pp. 5425-5425, 2021.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: machine learning, SLAM
@inproceedings{kim2021symbiolcd,
title = {{SymbioLCD}: Ensemble-Based Loop Closure Detection using {CNN}-Extracted Objects and Visual Bag-of-Words},
author = {Jonathan Kim and Martin Urschler and Pat Riddle and J\"{o}rg Wicker},
url = {https://ieeexplore.ieee.org/abstract/document/9636622
http://arxiv.org/abs/2110.11491},
doi = {10.1109/IROS51168.2021.9636622},
year = {2021},
date = {2021-09-27},
urldate = {2021-09-27},
booktitle = {2021 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)},
pages = {5425--5425},
internal-note = {NOTE(review): the page range has identical start and end pages; the end page is presumably wrong -- verify against the publisher record.},
abstract = {Loop closure detection is an essential tool of Simultaneous Localization and Mapping (SLAM) to minimize drift in its localization. Many state-of-the-art loop closure detection (LCD) algorithms use visual Bag-of-Words (vBoW), which is robust against partial occlusions in a scene but cannot perceive the semantics or spatial relationships between feature points. CNN object extraction can address those issues, by providing semantic labels and spatial relationships between objects in a scene. Previous work has mainly focused on replacing vBoW with CNN derived features.
In this paper we propose SymbioLCD, a novel ensemble-based LCD that utilizes both CNN-extracted objects and vBoW features for LCD candidate prediction. When used in tandem, the added elements of object semantics and spatial-awareness creates a more robust and symbiotic loop closure detection system. The proposed SymbioLCD uses scale-invariant spatial and semantic matching, Hausdorff distance with temporal constraints, and a Random Forest that utilizes combined information from both CNN-extracted objects and vBoW features for predicting accurate loop closure candidates. Evaluation of the proposed method shows it outperforms other Machine Learning (ML) algorithms - such as SVM, Decision Tree and Neural Network, and demonstrates that there is a strong symbiosis between CNN-extracted object information and vBoW features which assists accurate LCD candidate prediction. Furthermore, it is able to perceive loop closure candidates earlier than state-of-the-art SLAM algorithms, utilizing added spatial and semantic information from CNN-extracted objects.},
keywords = {machine learning, SLAM},
pubstate = {published},
tppubtype = {inproceedings}
}
In this paper we propose SymbioLCD, a novel ensemble-based LCD that utilizes both CNN-extracted objects and vBoW features for LCD candidate prediction. When used in tandem, the added elements of object semantics and spatial-awareness creates a more robust and symbiotic loop closure detection system. The proposed SymbioLCD uses scale-invariant spatial and semantic matching, Hausdorff distance with temporal constraints, and a Random Forest that utilizes combined information from both CNN-extracted objects and vBoW features for predicting accurate loop closure candidates. Evaluation of the proposed method shows it outperforms other Machine Learning (ML) algorithms – such as SVM, Decision Tree and Neural Network, and demonstrates that there is a strong symbiosis between CNN-extracted object information and vBoW features which assists accurate LCD candidate prediction. Furthermore, it is able to perceive loop closure candidates earlier than state-of-the-art SLAM algorithms, utilizing added spatial and semantic information from CNN-extracted objects.
Tam, Jason; Lorsbach, Tim; Schmidt, Sebastian; Wicker, Jörg
Holistic Evaluation of Biodegradation Pathway Prediction: Assessing Multi-Step Reactions and Intermediate Products Journal Article
In: Journal of Cheminformatics, vol. 13, no. 1, pp. 63, 2021.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: biodegradation, cheminformatics, computational sustainability, data mining, enviPath, machine learning, metabolic pathways
@article{tam2021holisticb,
  author     = {Jason Tam and Tim Lorsbach and Sebastian Schmidt and J\"{o}rg Wicker},
  title      = {Holistic Evaluation of Biodegradation Pathway Prediction: Assessing Multi-Step Reactions and Intermediate Products},
  journal    = {Journal of Cheminformatics},
  volume     = {13},
  number     = {1},
  pages      = {63},
  year       = {2021},
  date       = {2021-09-03},
  urldate    = {2021-09-03},
  doi        = {10.1186/s13321-021-00543-x},
  url        = {https://jcheminf.biomedcentral.com/articles/10.1186/s13321-021-00543-x
https://chemrxiv.org/articles/preprint/Holistic_Evaluation_of_Biodegradation_Pathway_Prediction_Assessing_Multi-Step_Reactions_and_Intermediate_Products/14315963
https://dx.doi.org/10.26434/chemrxiv.14315963},
  abstract   = {The prediction of metabolism and biotransformation pathways of xenobiotics is a highly desired tool in environmental sciences, drug discovery, and (eco)toxicology. Several systems predict single transformation steps or complete pathways as series of parallel and subsequent steps. Their performance is commonly evaluated on the level of a single transformation step. Such an approach cannot account for some specific challenges that are caused by specific properties of biotransformation experiments. That is, missing transformation products in the reference data that occur only in low concentrations, e.g. transient intermediates or higher-generation metabolites. Furthermore, some rule-based prediction systems evaluate the performance only based on the defined set of transformation rules. Therefore, the performance of these models cannot be directly compared. In this paper, we introduce a new evaluation framework that extends the evaluation of biotransformation prediction from single transformations to whole pathways, taking into account multiple generations of metabolites. We introduce a procedure to address transient intermediates and propose a weighted scoring system that acknowledges the uncertainty of higher-generation metabolites. We implemented this framework in enviPath and demonstrate its strict performance metrics on predictions of in vitro biotransformation and degradation of xenobiotics in soil. Our approach is model-agnostic and can be transferred to other prediction systems. It is also capable of revealing knowledge gaps in terms of incompletely defined sets of transformation rules.},
  keywords   = {biodegradation, cheminformatics, computational sustainability, data mining, enviPath, machine learning, metabolic pathways},
  pubstate   = {published},
  tppubtype  = {article}
}
Stepišnik, Tomaž; Škrlj, Blaž; Wicker, Jörg; Kocev, Dragi
A comprehensive comparison of molecular feature representations for use in predictive modeling Journal Article
In: Computers in Biology and Medicine, vol. 130, pp. 104197, 2021, ISSN: 0010-4825.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: biodegradation, cheminformatics, computational sustainability, data mining, enviPath, machine learning, metabolic pathways, molecular feature representation, toxicity
@article{stepisnik2021comprehensive,
  author     = {Toma\v{z} Stepi\v{s}nik and Bla\v{z} \v{S}krlj and J\"{o}rg Wicker and Dragi Kocev},
  title      = {A comprehensive comparison of molecular feature representations for use in predictive modeling},
  journal    = {Computers in Biology and Medicine},
  volume     = {130},
  pages      = {104197},
  year       = {2021},
  date       = {2021-03-01},
  issn       = {0010-4825},
  doi        = {10.1016/j.compbiomed.2020.104197},
  url        = {http://www.sciencedirect.com/science/article/pii/S001048252030528X},
  abstract   = {Machine learning methods are commonly used for predicting molecular properties to accelerate material and drug design. An important part of this process is deciding how to represent the molecules. Typically, machine learning methods expect examples represented by vectors of values, and many methods for calculating molecular feature representations have been proposed. In this paper, we perform a comprehensive comparison of different molecular features, including traditional methods such as fingerprints and molecular descriptors, and recently proposed learnable representations based on neural networks. Feature representations are evaluated on 11 benchmark datasets, used for predicting properties and measures such as mutagenicity, melting points, activity, solubility, and IC50. Our experiments show that several molecular features work similarly well over all benchmark datasets. The ones that stand out most are Spectrophores, which give significantly worse performance than other features on most datasets. Molecular descriptors from the PaDEL library seem very well suited for predicting physical properties of molecules. Despite their simplicity, MACCS fingerprints performed very well overall. The results show that learnable representations achieve competitive performance compared to expert based representations. However, task-specific representations (graph convolutions and Weave methods) rarely offer any benefits, even though they are computationally more demanding. Lastly, combining different molecular feature representations typically does not give a noticeable improvement in performance compared to individual feature representations.},
  keywords   = {biodegradation, cheminformatics, computational sustainability, data mining, enviPath, machine learning, metabolic pathways, molecular feature representation, toxicity},
  pubstate   = {published},
  tppubtype  = {article}
}
2020
Chester, Andrew; Koh, Yun Sing; Wicker, Jörg; Sun, Quan; Lee, Junjae
Balancing Utility and Fairness against Privacy in Medical Data Proceedings Article
In: IEEE Symposium Series on Computational Intelligence (SSCI), pp. 1226-1233, IEEE, 2020.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: accuracy, computational sustainability, data mining, fairness, imbalance, machine learning, medicine, privacy
@inproceedings{chester2020balancing,
title = {Balancing Utility and Fairness against Privacy in Medical Data},
author = {Andrew Chester and Yun Sing Koh and J\"{o}rg Wicker and Quan Sun and Junjae Lee},
url = {https://ieeexplore.ieee.org/abstract/document/9308226},
doi = {10.1109/SSCI47803.2020.9308226},
year = {2020},
date = {2020-12-01},
booktitle = {IEEE Symposium Series on Computational Intelligence (SSCI)},
pages = {1226--1233},
publisher = {IEEE},
abstract = {There are numerous challenges when designing algorithms that interact with sensitive data, such as, medical or financial records. One of these challenges is privacy. However, there is a tension between privacy, utility (model accuracy), and fairness. While de-identification techniques, such as generalisation and suppression, have been proposed to enable privacy protection, it comes with a cost, specifically to fairness and utility. Recent work on fairness in algorithm design defines fairness as a guarantee of similar outputs for "similar" input data. This notion is discussed in connection to de-identification. This research investigates the trade-off between privacy, fairness, and utility. In contrast, other work investigates the trade-off between privacy and utility of the data or accuracy of the model overall. In this research, we investigate the effects of two standard de-identification techniques, k-anonymity and differential privacy, on both utility and fairness. We propose two measures to calculate the trade-off between privacy-utility and privacy-fairness. Although other research has provided guarantees for privacy regarding utility, this research focuses on the trade-offs given set de-identification levels and relies on guarantees provided by the privacy preservation methods. We discuss the effects of de-identification on data of different characteristics, class imbalance and outcome imbalance. We evaluated this is on synthetic datasets and standard real-world datasets. As a case study, we analysed the Medical Expenditure Panel Survey dataset.},
keywords = {accuracy, computational sustainability, data mining, fairness, imbalance, machine learning, medicine, privacy},
pubstate = {published},
tppubtype = {inproceedings}
}
Dost, Katharina; Taskova, Katerina; Riddle, Pat; Wicker, Jörg
Your Best Guess When You Know Nothing: Identification and Mitigation of Selection Bias Proceedings Article
In: 2020 IEEE International Conference on Data Mining (ICDM), pp. 996-1001, IEEE, 2020, ISSN: 2374-8486.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: bias, data mining, fairness, machine learning
@inproceedings{dost2020your,
title = {Your Best Guess When You Know Nothing: Identification and Mitigation of Selection Bias},
author = {Katharina Dost and Katerina Taskova and Pat Riddle and J\"{o}rg Wicker},
url = {https://ieeexplore.ieee.org/document/9338355
https://github.com/KatDost/Imitate
https://pypi.org/project/imitatebias/},
doi = {10.1109/ICDM50108.2020.00115},
issn = {2374-8486},
year = {2020},
date = {2020-11-17},
urldate = {2020-11-17},
booktitle = {2020 IEEE International Conference on Data Mining (ICDM)},
pages = {996--1001},
publisher = {IEEE},
abstract = {Machine Learning typically assumes that training and test set are independently drawn from the same distribution, but this assumption is often violated in practice which creates a bias. Many attempts to identify and mitigate this bias have been proposed, but they usually rely on ground-truth information. But what if the researcher is not even aware of the bias?
In contrast to prior work, this paper introduces a new method, Imitate, to identify and mitigate Selection Bias in the case that we may not know if (and where) a bias is present, and hence no ground-truth information is available.
Imitate investigates the dataset's probability density, then adds generated points in order to smooth out the density and have it resemble a Gaussian, the most common density occurring in real-world applications. If the artificial points focus on certain areas and are not widespread, this could indicate a Selection Bias where these areas are underrepresented in the sample.
We demonstrate the effectiveness of the proposed method in both, synthetic and real-world datasets. We also point out limitations and future research directions.},
keywords = {bias, data mining, fairness, machine learning},
pubstate = {published},
tppubtype = {inproceedings}
}
In contrast to prior work, this paper introduces a new method, Imitate, to identify and mitigate Selection Bias in the case that we may not know if (and where) a bias is present, and hence no ground-truth information is available.
Imitate investigates the dataset's probability density, then adds generated points in order to smooth out the density and have it resemble a Gaussian, the most common density occurring in real-world applications. If the artificial points focus on certain areas and are not widespread, this could indicate a Selection Bias where these areas are underrepresented in the sample.
We demonstrate the effectiveness of the proposed method in both, synthetic and real-world datasets. We also point out limitations and future research directions.
Roeslin, Samuel; Ma, Quincy; Chigullapally, Pavan; Wicker, Jörg; Wotherspoon, Liam
Feature Engineering for a Seismic Loss Prediction Model using Machine Learning, Christchurch Experience Proceedings Article
In: 17th World Conference on Earthquake Engineering, 2020.
Abstract | Links | BibTeX | Tags: computational sustainability, data mining, earthquakes, machine learning
@inproceedings{roeslin2020feature,
title = {Feature Engineering for a Seismic Loss Prediction Model using Machine Learning, {Christchurch} Experience},
author = {Samuel Roeslin and Quincy Ma and Pavan Chigullapally and J\"{o}rg Wicker and Liam Wotherspoon},
url = {https://www.researchgate.net/profile/Samuel_Roeslin/publication/344503593_Feature_Engineering_for_a_Seismic_Loss_Prediction_Model_using_Machine_Learning_Christchurch_Experience/links/5f7d015a92851c14bcb36ed7/Feature-Engineering-for-a-Seismic-Loss-Prediction-Model-using-Machine-Learning-Christchurch-Experience.pdf},
year = {2020},
date = {2020-09-17},
booktitle = {17th World Conference on Earthquake Engineering},
abstract = {The city of Christchurch, New Zealand experienced four major earthquakes (MW $>$ 5.9) and multiple aftershocks between 4 September 2010 and 23 December 2011. This series of earthquakes, commonly known as the Canterbury Earthquake Sequence (CES), induced over NZ\$40 billion in total economic losses. Liquefaction alone led to building damage in 51,000 of the 140,000 residential buildings, with around 15,000 houses left unpractical to repair. Widespread damage to residential buildings highlighted the need for improved seismic prediction tools and to better understand factors influencing damage. Fortunately, due to New Zealand unique insurance setting, up to 80\% of the losses were insured. Over the entire CES, insurers received more than 650,000 claims. This research project employs multi-disciplinary empirical data gathered during and prior to the CES to develop a seismic loss prediction model for residential buildings in Christchurch using machine learning. The intent is to develop a procedure for developing insights from post-earthquake data that is subjected to continuous updating, to enable identification of critical parameters affecting losses, and to apply such a model to establish priority building stock for risk mitigation measures. The following paper describes the complex data preparation process required for the application of machine learning techniques. The paper covers the production of a merged dataset with information from the Earthquake Commission (EQC) claim database, building characteristics from RiskScape, seismic demand interpolated from GeoNet strong motion records, liquefaction occurrence from the New Zealand Geotechnical Database (NZGD) and soil conditions from Land Resource Information Systems (LRIS).},
keywords = {computational sustainability, data mining, earthquakes, machine learning},
pubstate = {published},
tppubtype = {inproceedings}
}
Roeslin, Samuel; Ma, Quincy; Juárez-Garcia, Hugon; Gómez-Bernal, Alonso; Wicker, Jörg; Wotherspoon, Liam
A machine learning damage prediction model for the 2017 Puebla-Morelos, Mexico, earthquake Journal Article
In: Earthquake Spectra, vol. 36, no. 2, pp. 314-339, 2020.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: computational sustainability, data mining, earthquakes, machine learning
@article{roeslin2020machine,
title = {A machine learning damage prediction model for the 2017 {Puebla-Morelos}, {Mexico}, earthquake},
author = {Samuel Roeslin and Quincy Ma and Hugon Ju\'{a}rez-Garcia and Alonso G\'{o}mez-Bernal and J\"{o}rg Wicker and Liam Wotherspoon},
doi = {10.1177/8755293020936714},
year = {2020},
date = {2020-07-30},
journal = {Earthquake Spectra},
volume = {36},
number = {2},
pages = {314--339},
abstract = {The 2017 Puebla, Mexico, earthquake event led to significant damage in many buildings in Mexico City. In the months following the earthquake, civil engineering students conducted detailed building assessments throughout the city. They collected building damage information and structural characteristics for 340 buildings in the Mexico City urban area, with an emphasis on the Roma and Condesa neighborhoods where they assessed 237 buildings. These neighborhoods are of particular interest due to the availability of seismic records captured by nearby recording stations, and preexisting information from when the neighborhoods were affected by the 1985 Michoac\'{a}n earthquake. This article presents a case study on developing a damage prediction model using machine learning. It details a framework suitable for working with future post-earthquake observation data. Four algorithms able to perform classification tasks were trialed. Random forest, the best performing algorithm, achieves more than 65\% prediction accuracy. The study of the feature importance for the random forest shows that the building location, seismic demand, and building height are the parameters that influence the model output the most.},
keywords = {computational sustainability, data mining, earthquakes, machine learning},
pubstate = {published},
tppubtype = {article}
}
2019
Jonauskaite, Domicele; Wicker, Jörg; Mohr, Christine; Dael, Nele; Havelka, Jelena; Papadatou-Pastou, Marietta; Zhang, Meng; Oberfeld, Daniel
A machine learning approach to quantifying the specificity of color-emotion associations and their cultural differences Journal Article
In: Royal Society Open Science, vol. 6, no. 9, pp. 190741, 2019.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: emotion, machine learning, psychology
@article{Jonauskaite2019,
title = {A machine learning approach to quantifying the specificity of color-emotion associations and their cultural differences},
author = {Domicele Jonauskaite and J\"{o}rg Wicker and Christine Mohr and Nele Dael and Jelena Havelka and Marietta Papadatou-Pastou and Meng Zhang and Daniel Oberfeld},
editor = {Andrew Dunn},
url = {https://royalsocietypublishing.org/doi/10.1098/rsos.190741
https://doi.org/10.1098/rsos.190741},
doi = {10.1098/rsos.190741},
year = {2019},
date = {2019-09-25},
journal = {Royal Society Open Science},
volume = {6},
number = {9},
pages = {190741},
abstract = {The link between colour and emotion and its possible similarity across cultures are questions that have not been fully resolved. Online, 711 participants from China, Germany, Greece and the UK associated 12 colour terms with 20 discrete emotion terms in their native languages. We propose a machine learning approach to quantify (a) the consistency and specificity of colour-emotion associations and (b) the degree to which they are country-specific, on the basis of the accuracy of a statistical classifier in (a) decoding the colour term evaluated on a given trial from the 20 ratings of colour-emotion associations and (b) predicting the country of origin from the 240 individual colour-emotion associations, respectively. The classifier accuracies were significantly above chance level, demonstrating that emotion associations are to some extent colour-specific and that colour-emotion associations are to some extent country-specific. A second measure of country-specificity, the in-group advantage of the colour-decoding accuracy, was detectable but relatively small (6.1\%), indicating that colour-emotion associations are both universal and culture-specific. Our results show that machine learning is a promising tool when analysing complex datasets from emotion research.},
keywords = {emotion, machine learning, psychology},
pubstate = {published},
tppubtype = {article}
}
Williams, Jonathan; Stönner, Christof; Edtbauer, Achim; Derstorff, Bettina; Bourtsoukidis, Efstratios; Klüpfel, Thomas; Krauter, Nicolas; Wicker, Jörg; Kramer, Stefan
What can we learn from the air chemistry of crowds? Proceedings Article
In: Hansel, Armin; Dunkl, Jürgen (Ed.): 8th International Conference on Proton Transfer Reaction Mass Spectrometry and its Applications, pp. 121-123, Innsbruck University Press, Innsbruck, 2019.
Abstract | Links | BibTeX | Tags: atmospheric chemistry, breath analysis, cheminformatics, cinema data mining, data mining, emotional response analysis, machine learning, movie analysis, smell of fear, sof, time series
@inproceedings{williams2019what,
title = {What can we learn from the air chemistry of crowds?},
author = {Jonathan Williams and Christof St\"{o}nner and Achim Edtbauer and Bettina Derstorff and Efstratios Bourtsoukidis and Thomas Kl\"{u}pfel and Nicolas Krauter and J\"{o}rg Wicker and Stefan Kramer},
editor = {Armin Hansel and J\"{u}rgen Dunkl},
url = {https://www.ionicon.com/sites/default/files/uploads/doc/Contributions_8th-PTR-MS-Conference-2019_web.pdf#page=122},
year = {2019},
date = {2019-05-10},
booktitle = {8th International Conference on Proton Transfer Reaction Mass Spectrometry and its Applications},
pages = {121--123},
publisher = {Innsbruck University Press},
address = {Innsbruck},
abstract = {Current PTR-MS technology allows hundreds of volatile trace gases in air to be measured every second at extremely low levels (parts per trillion). These instruments are often used in atmospheric research on planes and ships and even in the Amazon rainforest. Recently, we have used this technology to examine air composition changes caused by large groups of people (10,000-30,000) under real world conditions at a football match and in a movie theater. In both cases the trace gas signatures measured in ambient air are shown to reflect crowd behavior. By applying advanced data mining techniques we have shown that groups of people reproducibly respond to certain emotional stimuli (e.g. suspense and comedy) by exhaling specific trace gases. Furthermore, we explore whether this information can be used to determine the age classification of films.},
keywords = {atmospheric chemistry, breath analysis, cheminformatics, cinema data mining, data mining, emotional response analysis, machine learning, movie analysis, smell of fear, sof, time series},
pubstate = {published},
tppubtype = {inproceedings}
}
2018
Stönner, Christof; Edtbauer, Achim; Derstorff, Bettina; Bourtsoukidis, Efstratios; Klüpfel, Thomas; Wicker, Jörg; Williams, Jonathan
Proof of concept study: Testing human volatile organic compounds as tools for age classification of films Journal Article
In: PLOS One, vol. 13, no. 10, pp. 1-14, 2018.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: atmospheric chemistry, breath analysis, cheminformatics, cinema data mining, data mining, emotional response analysis, machine learning, movie analysis, smell of fear, sof, time series
@article{Stonner2018,
title = {Proof of concept study: Testing human volatile organic compounds as tools for age classification of films},
author = {Christof St\"{o}nner and Achim Edtbauer and Bettina Derstorff and Efstratios Bourtsoukidis and Thomas Kl\"{u}pfel and J\"{o}rg Wicker and Jonathan Williams},
doi = {10.1371/journal.pone.0203044},
year = {2018},
date = {2018-10-11},
journal = {PLOS One},
volume = {13},
number = {10},
pages = {1--14},
publisher = {Public Library of Science},
abstract = {Humans emit numerous volatile organic compounds (VOCs) through breath and skin. The nature and rate of these emissions are affected by various factors including emotional state. Previous measurements of VOCs and CO2 in a cinema have shown that certain chemicals are reproducibly emitted by audiences reacting to events in a particular film. Using data from films with various age classifications, we have studied the relationship between the emission of multiple VOCs and CO2 and the age classifier (0, 6, 12, and 16) with a view to developing a new chemically based and objective film classification method. We apply a random forest model built with time independent features extracted from the time series of every measured compound, and test predictive capability on subsets of all data. It was found that most compounds were not able to predict all age classifiers reliably, likely reflecting the fact that current classification is based on perceived sensibilities to many factors (e.g. incidences of violence, sex, antisocial behaviour, drug use, and bad language) rather than the visceral biological responses expressed in the data. However, promising results were found for isoprene which reliably predicted 0, 6 and 12 age classifiers for a variety of film genres and audience age groups. Therefore, isoprene emission per person might in future be a valuable aid to national classification boards, or even offer an alternative, objective, metric for rating films based on the reactions of large groups of people.},
keywords = {atmospheric chemistry, breath analysis, cheminformatics, cinema data mining, data mining, emotional response analysis, machine learning, movie analysis, smell of fear, sof, time series},
pubstate = {published},
tppubtype = {article}
}
2017
Wicker, Jörg; Kramer, Stefan
The Best Privacy Defense is a Good Privacy Offense: Obfuscating a Search Engine User’s Profile Journal Article
In: Data Mining and Knowledge Discovery, vol. 31, no. 5, pp. 1419-1443, 2017, ISSN: 1573-756X.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: adversarial learning, machine learning, personalized ads, privacy, reinforcement learning, search engines
@article{wicker2017best,
title = {The Best Privacy Defense is a Good Privacy Offense: Obfuscating a Search Engine User's Profile},
author = {J\"{o}rg Wicker and Stefan Kramer},
editor = {Kurt Driessens and Dragi Kocev and Marko Robnik-\v{S}ikonja and Myra Spiliopoulou},
url = {http://rdcu.be/tL0U},
doi = {10.1007/s10618-017-0524-z},
issn = {1573-756X},
year = {2017},
date = {2017-09-01},
journal = {Data Mining and Knowledge Discovery},
volume = {31},
number = {5},
pages = {1419--1443},
abstract = {User privacy on the internet is an important and unsolved problem. So far, no sufficient and comprehensive solution has been proposed that helps a user to protect his or her privacy while using the internet. Data are collected and assembled by numerous service providers. Solutions so far focused on the side of the service providers to store encrypted or transformed data that can be still used for analysis. This has a major flaw, as it relies on the service providers to do this. The user has no chance of actively protecting his or her privacy. In this work, we suggest a new approach, empowering the user to take advantage of the same tool the other side has, namely data mining to produce data which obfuscates the user's profile. We apply this approach to search engine queries and use feedback of the search engines in terms of personalized advertisements in an algorithm similar to reinforcement learning to generate new queries potentially confusing the search engine. We evaluated the approach using a real-world data set. While evaluation is hard, we achieve results that indicate that it is possible to influence the user's profile that the search engine generates. This shows that it is feasible to defend a user's privacy from a new and more practical perspective.},
keywords = {adversarial learning, machine learning, personalized ads, privacy, reinforcement learning, search engines},
pubstate = {published},
tppubtype = {article}
}
2016
Wicker, Jörg; Fenner, Kathrin; Kramer, Stefan
A Hybrid Machine Learning and Knowledge Based Approach to Limit Combinatorial Explosion in Biodegradation Prediction Book Section
In: Lässig, Jörg; Kersting, Kristian; Morik, Katharina (Ed.): Computational Sustainability, pp. 75-97, Springer International Publishing, Cham, 2016, ISBN: 978-3-319-31858-5.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: biodegradation, cheminformatics, computational sustainability, enviPath, machine learning, metabolic pathways, multi-label classification
@incollection{wicker2016ahybrid,
title = {A Hybrid Machine Learning and Knowledge Based Approach to Limit Combinatorial Explosion in Biodegradation Prediction},
author = {J\"{o}rg Wicker and Kathrin Fenner and Stefan Kramer},
editor = {J\"{o}rg L\"{a}ssig and Kristian Kersting and Katharina Morik},
url = {http://dx.doi.org/10.1007/978-3-319-31858-5_5},
doi = {10.1007/978-3-319-31858-5_5},
isbn = {978-3-319-31858-5},
year = {2016},
date = {2016-04-21},
booktitle = {Computational Sustainability},
pages = {75--97},
publisher = {Springer International Publishing},
address = {Cham},
abstract = {One of the main tasks in chemical industry regarding the sustainability of a product is the prediction of its environmental fate, i.e., its degradation products and pathways. Current methods for the prediction of biodegradation products and pathways of organic environmental pollutants either do not take into account domain knowledge or do not provide probability estimates. In this chapter, we propose a hybrid knowledge-based and machine learning-based approach to overcome these limitations in the context of the University of Minnesota Pathway Prediction System (UM-PPS). The proposed solution performs relative reasoning in a machine learning framework, and obtains one probability estimate for each biotransformation rule of the system. Since the application of a rule then depends on a threshold for the probability estimate, the trade-off between recall (sensitivity) and precision (selectivity) can be addressed and leveraged in practice. Results from leave-one-out cross-validation show that a recall and precision of approximately 0.8 can be achieved for a subset of 13 transformation rules. The set of used rules is further extended using multi-label classification, where dependencies among the transformation rules are exploited to improve the predictions. While the results regarding recall and precision vary, the area under the ROC curve can be improved using multi-label classification. Therefore, it is possible to optimize precision without compromising recall. Recently, we integrated the presented approach into enviPath, a complete redesign and re-implementation of UM-PPS.},
keywords = {biodegradation, cheminformatics, computational sustainability, enviPath, machine learning, metabolic pathways, multi-label classification},
pubstate = {published},
tppubtype = {incollection}
}
Wicker, Jörg; Tyukin, Andrey; Kramer, Stefan
A Nonlinear Label Compression and Transformation Method for Multi-Label Classification using Autoencoders Proceedings Article
In: Bailey, James; Khan, Latifur; Washio, Takashi; Dobbie, Gill; Huang, Zhexue Joshua; Wang, Ruili (Ed.): The 20th Pacific Asia Conference on Knowledge Discovery and Data Mining (PAKDD), pp. 328-340, Springer International Publishing, Switzerland, 2016, ISBN: 978-3-319-31753-3.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: autoencoders, label compression, machine learning, multi-label classification
@inproceedings{wicker2016nonlinear,
title = {A Nonlinear Label Compression and Transformation Method for Multi-Label Classification using Autoencoders},
author = {J\"{o}rg Wicker and Andrey Tyukin and Stefan Kramer},
editor = {James Bailey and Latifur Khan and Takashi Washio and Gill Dobbie and Zhexue Joshua Huang and Ruili Wang},
url = {http://dx.doi.org/10.1007/978-3-319-31753-3_27},
doi = {10.1007/978-3-319-31753-3_27},
isbn = {978-3-319-31753-3},
year = {2016},
date = {2016-04-16},
booktitle = {The 20th Pacific Asia Conference on Knowledge Discovery and Data Mining (PAKDD)},
volume = {9651},
pages = {328--340},
publisher = {Springer International Publishing},
address = {Switzerland},
series = {Lecture Notes in Computer Science},
abstract = {Multi-label classification targets the prediction of multiple interdependent and non-exclusive binary target variables. Transformation-based algorithms transform the data set such that regular single-label algorithms can be applied to the problem. A special type of transformation-based classifiers are label compression methods, that compress the labels and then mostly use single label classifiers to predict the compressed labels. So far, there are no compression-based algorithms follow a problem transformation approach and address non-linear dependencies in the labels. In this paper, we propose a new algorithm, called Maniac (Multi-lAbel classificatioN usIng AutoenCoders), which extracts the non-linear dependencies by compressing the labels using autoencoders. We adapt the training process of autoencoders in a way to make them more suitable for a parameter optimization in the context of this algorithm. The method is evaluated on eight standard multi-label data sets. Experiments show that despite not producing a good ranking, Maniac generates a particularly good bipartition of the labels into positives and negatives. This is caused by rather strong predictions with either really high or low probability. Additionally, the algorithm seems to perform better given more labels and a higher label cardinality in the data set.},
keywords = {autoencoders, label compression, machine learning, multi-label classification},
pubstate = {published},
tppubtype = {inproceedings}
}
Wicker, Jörg; Lorsbach, Tim; Gütlein, Martin; Schmid, Emanuel; Latino, Diogo; Kramer, Stefan; Fenner, Kathrin
enviPath – The Environmental Contaminant Biotransformation Pathway Resource Journal Article
In: Nucleic Acids Research, vol. 44, no. D1, pp. D502-D508, 2016.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: biodegradation, cheminformatics, computational sustainability, data mining, enviPath, linked data, machine learning, metabolic pathways, multi-label classification
@article{wicker2016envipath,
title = {{enviPath} -- The Environmental Contaminant Biotransformation Pathway Resource},
author = {J\"{o}rg Wicker and Tim Lorsbach and Martin G\"{u}tlein and Emanuel Schmid and Diogo Latino and Stefan Kramer and Kathrin Fenner},
editor = {Michael Galperin},
url = {http://nar.oxfordjournals.org/content/44/D1/D502.abstract},
doi = {10.1093/nar/gkv1229},
year = {2016},
date = {2016-01-01},
journal = {Nucleic Acids Research},
volume = {44},
number = {D1},
pages = {D502--D508},
abstract = {The University of Minnesota Biocatalysis/Biodegradation Database and Pathway Prediction System (UM-BBD/PPS) has been a unique resource covering microbial biotransformation pathways of primarily xenobiotic chemicals for over 15 years. This paper introduces the successor system, enviPath (The Environmental Contaminant Biotransformation Pathway Resource), which is a complete redesign and reimplementation of UM-BBD/PPS. enviPath uses the database from the UM-BBD/PPS as a basis, extends the use of this database, and allows users to include their own data to support multiple use cases. Relative reasoning is supported for the refinement of predictions and to allow its extensions in terms of previously published, but not implemented machine learning models. User access is simplified by providing a REST API that simplifies the inclusion of enviPath into existing workflows. An RDF database is used to enable simple integration with other databases. enviPath is publicly available at https://envipath.org with free and open access to its core data.},
keywords = {biodegradation, cheminformatics, computational sustainability, data mining, enviPath, linked data, machine learning, metabolic pathways, multi-label classification},
pubstate = {published},
tppubtype = {article}
}
2013
Wicker, Jörg
Large Classifier Systems in Bio- and Cheminformatics PhD Thesis
Technische Universität München, 2013.
Abstract | Links | BibTeX | Tags: biodegradation, bioinformatics, cheminformatics, computational sustainability, data mining, enviPath, machine learning, multi-label classification, multi-relational learning, toxicity
@phdthesis{wicker2013large,
title = {Large Classifier Systems in Bio- and Cheminformatics},
author = {J\"{o}rg Wicker},
url = {http://mediatum.ub.tum.de/node?id=1165858},
year = {2013},
date = {2013-01-01},
school = {Technische Universit\"{a}t M\"{u}nchen},
abstract = {Large classifier systems are machine learning algorithms that use multiple
classifiers to improve the prediction of target values in advanced
classification tasks. Although learning problems in bio- and
cheminformatics commonly provide data in schemes suitable for large
classifier systems, they are rarely used in these domains. This thesis
introduces two new classifiers incorporating systems of classifiers
using Boolean matrix decomposition to handle data in a schema that
often occurs in bio- and cheminformatics.
The first approach, called MLC-BMaD (multi-label classification using
Boolean matrix decomposition), uses Boolean matrix decomposition to
decompose the labels in a multi-label classification task. The
decomposed matrices are a compact representation of the information
in the labels (first matrix) and the dependencies among the labels
(second matrix). The first matrix is used in a further multi-label
classification while the second matrix is used to generate the final
matrix from the predicted values of the first matrix.
MLC-BMaD was evaluated on six standard multi-label data sets, the
experiments showed that MLC-BMaD can perform particularly well on data
sets with a high number of labels and a small number of instances and
can outperform standard multi-label algorithms.
Subsequently, MLC-BMaD is extended to a special case of
multi-relational learning, by considering the labels not as simple
labels, but instances. The algorithm, called ClassFact
(Classification factorization), uses both matrices in a multi-label
classification. Each label represents a mapping between two
instances.
Experiments on three data sets from the domain of bioinformatics show
that ClassFact can outperform the baseline method, which merges the
relations into one, on hard classification tasks.
Furthermore, large classifier systems are used on two cheminformatics
data sets, the first one is used to predict the environmental fate of
chemicals by predicting biodegradation pathways. The second is a data
set from the domain of predictive toxicology. In biodegradation
pathway prediction, I extend a knowledge-based system and incorporate
a machine learning approach to predict a probability for
biotransformation products based on the structure- and knowledge-based
predictions of products, which are based on transformation rules. The
use of multi-label classification improves the performance of the
classifiers and extends the number of transformation rules that can be
covered.
For the prediction of toxic effects of chemicals, I applied large
classifier systems to the ToxCast\texttrademark{} data set, which maps
toxic effects to chemicals. As the given toxic effects are not easy to
predict due to missing information and a skewed class
distribution, I introduce a filtering step in the multi-label
classification, which finds labels that are usable in multi-label
prediction and does not take the others in the
prediction into account. Experiments show
that this approach can improve upon the baseline method using binary
classification, as well as multi-label approaches using no filtering.
The presented results show that large classifier systems can play a
role in future research challenges, especially in bio- and
cheminformatics, where data sets frequently consist of more complex
structures and data can be rather small in terms of the number of
instances compared to other domains.},
keywords = {biodegradation, bioinformatics, cheminformatics, computational sustainability, data mining, enviPath, machine learning, multi-label classification, multi-relational learning, toxicity},
pubstate = {published},
tppubtype = {phdthesis}
}
Large classifier systems are machine learning algorithms that use multiple
classifiers to improve the prediction of target values in advanced
classification tasks. Although learning problems in bio- and
cheminformatics commonly provide data in schemes suitable for large
classifier systems, they are rarely used in these domains. This thesis
introduces two new classifiers incorporating systems of classifiers
using Boolean matrix decomposition to handle data in a schema that
often occurs in bio- and cheminformatics.
The first approach, called MLC-BMaD (multi-label classification using
Boolean matrix decomposition), uses Boolean matrix decomposition to
decompose the labels in a multi-label classification task. The
decomposed matrices are a compact representation of the information
in the labels (first matrix) and the dependencies among the labels
(second matrix). The first matrix is used in a further multi-label
classification while the second matrix is used to generate the final
matrix from the predicted values of the first matrix.
MLC-BMaD was evaluated on six standard multi-label data sets, the
experiments showed that MLC-BMaD can perform particularly well on data
sets with a high number of labels and a small number of instances and
can outperform standard multi-label algorithms.
Subsequently, MLC-BMaD is extended to a special case of
multi-relational learning, by considering the labels not as simple
labels, but instances. The algorithm, called ClassFact
(Classification factorization), uses both matrices in a multi-label
classification. Each label represents a mapping between two
instances.
Experiments on three data sets from the domain of bioinformatics show
that ClassFact can outperform the baseline method, which merges the
relations into one, on hard classification tasks.
Furthermore, large classifier systems are used on two cheminformatics
data sets, the first one is used to predict the environmental fate of
chemicals by predicting biodegradation pathways. The second is a data
set from the domain of predictive toxicology. In biodegradation
pathway prediction, I extend a knowledge-based system and incorporate
a machine learning approach to predict a probability for
biotransformation products based on the structure- and knowledge-based
predictions of products, which are based on transformation rules. The
use of multi-label classification improves the performance of the
classifiers and extends the number of transformation rules that can be
covered.
For the prediction of toxic effects of chemicals, I applied large
classifier systems to the ToxCast™ data set, which maps
toxic effects to chemicals. As the given toxic effects are not easy to
predict due to missing information and a skewed class
distribution, I introduce a filtering step in the multi-label
classification, which finds labels that are usable in multi-label
prediction and does not take the others in the
prediction into account. Experiments show
that this approach can improve upon the baseline method using binary
classification, as well as multi-label approaches using no filtering.
The presented results show that large classifier systems can play a
role in future research challenges, especially in bio- and
cheminformatics, where data sets frequently consist of more complex
structures and data can be rather small in terms of the number of
instances compared to other domains.
2012
Wicker, Jörg; Pfahringer, Bernhard; Kramer, Stefan
Multi-label Classification Using Boolean Matrix Decomposition Proceedings Article
In: Proceedings of the 27th Annual ACM Symposium on Applied Computing, pp. 179–186, ACM, 2012, ISBN: 978-1-4503-0857-1.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: associations, Boolean matrix decomposition, machine learning, multi-label classification
@inproceedings{wicker2012multi,
title = {Multi-label Classification Using {Boolean} Matrix Decomposition},
author = {J\"{o}rg Wicker and Bernhard Pfahringer and Stefan Kramer},
url = {https://wicker.nz/nwp-acm/authorize.php?id=N10032
http://doi.acm.org/10.1145/2245276.2245311},
doi = {10.1145/2245276.2245311},
isbn = {978-1-4503-0857-1},
year = {2012},
date = {2012-01-01},
booktitle = {Proceedings of the 27th Annual ACM Symposium on Applied Computing},
pages = {179--186},
publisher = {ACM},
series = {SAC '12},
abstract = {This paper introduces a new multi-label classifier based on Boolean matrix decomposition. Boolean matrix decomposition is used to extract, from the full label matrix, latent labels representing useful Boolean combinations of the original labels. Base level models predict latent labels, which are subsequently transformed into the actual labels by Boolean matrix multiplication with the second matrix from the decomposition. The new method is tested on six publicly available datasets with varying numbers of labels. The experimental evaluation shows that the new method works particularly well on datasets with a large number of labels and strong dependencies among them.},
keywords = {associations, Boolean matrix decomposition, machine learning, multi-label classification},
pubstate = {published},
tppubtype = {inproceedings}
}
2011
Taskova, Katerina; Korošec, Peter; Šilc, Jurij; Džeroski, Sašo
Parameter estimation with bio-inspired meta-heuristic optimization: modeling the dynamics of endocytosis Journal Article
In: BMC Systems Biology, vol. 5, no. 1, pp. 159, 2011, ISSN: 1752-0509.
Links | BibTeX | Altmetric | PlumX | Tags: machine learning, Parameter estimation
@article{Taskova2011Parameter,
title = {Parameter estimation with bio-inspired meta-heuristic optimization: modeling the dynamics of endocytosis},
author = {Katerina Taskova and Peter Koro\v{s}ec and Jurij \v{S}ilc and Sa\v{s}o D\v{z}eroski},
doi = {10.1186/1752-0509-5-159},
issn = {1752-0509},
year = {2011},
date = {2011-10-11},
journal = {BMC Systems Biology},
volume = {5},
number = {1},
pages = {159},
keywords = {machine learning, Parameter estimation},
pubstate = {published},
tppubtype = {article}
}
2010
Hardy, Barry; Douglas, Nicki; Helma, Christoph; Rautenberg, Micha; Jeliazkova, Nina; Jeliazkov, Vedrin; Nikolova, Ivelina; Benigni, Romualdo; Tcheremenskaia, Olga; Kramer, Stefan; Girschick, Tobias; Buchwald, Fabian; Wicker, Jörg; Karwath, Andreas; Gütlein, Martin; Maunz, Andreas; Sarimveis, Haralambos; Melagraki, Georgia; Afantitis, Antreas; Sopasakis, Pantelis; Gallagher, David; Poroikov, Vladimir; Filimonov, Dmitry; Zakharov, Alexey; Lagunin, Alexey; Gloriozova, Tatyana; Novikov, Sergey; Skvortsova, Natalia; Druzhilovsky, Dmitry; Chawla, Sunil; Ghosh, Indira; Ray, Surajit; Patel, Hitesh; Escher, Sylvia
Collaborative development of predictive toxicology applications Journal Article
In: Journal of Cheminformatics, vol. 2, no. 1, pp. 7, 2010, ISSN: 1758-2946.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: cheminformatics, computational sustainability, data mining, machine learning, REST, toxicity
@article{hardy2010collaborative,
title = {Collaborative development of predictive toxicology applications},
author = {Barry Hardy and Nicki Douglas and Christoph Helma and Micha Rautenberg and Nina Jeliazkova and Vedrin Jeliazkov and Ivelina Nikolova and Romualdo Benigni and Olga Tcheremenskaia and Stefan Kramer and Tobias Girschick and Fabian Buchwald and J\"{o}rg Wicker and Andreas Karwath and Martin G\"{u}tlein and Andreas Maunz and Haralambos Sarimveis and Georgia Melagraki and Antreas Afantitis and Pantelis Sopasakis and David Gallagher and Vladimir Poroikov and Dmitry Filimonov and Alexey Zakharov and Alexey Lagunin and Tatyana Gloriozova and Sergey Novikov and Natalia Skvortsova and Dmitry Druzhilovsky and Sunil Chawla and Indira Ghosh and Surajit Ray and Hitesh Patel and Sylvia Escher},
url = {http://www.jcheminf.com/content/2/1/7},
doi = {10.1186/1758-2946-2-7},
issn = {1758-2946},
year = {2010},
date = {2010-01-01},
journal = {Journal of Cheminformatics},
volume = {2},
number = {1},
pages = {7},
abstract = {OpenTox provides an interoperable, standards-based Framework for the support of predictive toxicology data management, algorithms, modelling, validation and reporting. It is relevant to satisfying the chemical safety assessment requirements of the REACH legislation as it supports access to experimental data, (Quantitative) Structure-Activity Relationship models, and toxicological information through an integrating platform that adheres to regulatory requirements and OECD validation principles. Initial research defined the essential components of the Framework including the approach to data access, schema and management, use of controlled vocabularies and ontologies, architecture, web service and communications protocols, and selection and integration of algorithms for predictive modelling. OpenTox provides end-user oriented tools to non-computational specialists, risk assessors, and toxicological experts in addition to Application Programming Interfaces (APIs) for developers of new applications. OpenTox actively supports public standards for data representation, interfaces, vocabularies and ontologies, Open Source approaches to core platform components, and community-based collaboration approaches, so as to progress system interoperability goals. The OpenTox Framework includes APIs and services for compounds, datasets, features, algorithms, models, ontologies, tasks, validation, and reporting which may be combined into multiple applications satisfying a variety of different user needs. OpenTox applications are based on a set of distributed, interoperable OpenTox API-compliant REST web services.
The OpenTox approach to ontology allows for efficient mapping of complementary data coming from different datasets into a unifying structure having a shared terminology and representation. Two initial OpenTox applications are presented as an illustration of the potential impact of OpenTox for high-quality and consistent structure-activity relationship modelling of REACH-relevant endpoints: ToxPredict which predicts and reports on toxicities for endpoints for an input chemical structure, and ToxCreate which builds and validates a predictive toxicity model based on an input toxicology dataset. Because of the extensible nature of the standardised Framework design, barriers of interoperability between applications and content are removed, as the user may combine data, models and validation from multiple sources in a dependable and time-effective way.},
keywords = {cheminformatics, computational sustainability, data mining, machine learning, REST, toxicity},
pubstate = {published},
tppubtype = {article}
}