Computational sustainability is an interdisciplinary field of sustainability research that encompasses applied research into sustainable solutions and their implementation. Machine Learning and Data Mining are at the center of this research area, linking together diverse application areas such as environmental sciences, atmospheric science, agriculture, and social science.
2024
Hafner, Jasmin; Lorsbach, Tim; Schmidt, Sebastian; Brydon, Liam; Dost, Katharina; Zhang, Kunyang; Fenner, Kathrin; Wicker, Jörg
Advancements in Biotransformation Pathway Prediction: Enhancements, Datasets, and Novel Functionalities in enviPath Journal Article
In: Journal of Cheminformatics, vol. 16, no. 1, pp. 93, 2024, ISSN: 1758-2946.
@article{hafner2023advancements,
title = {Advancements in Biotransformation Pathway Prediction: Enhancements, Datasets, and Novel Functionalities in {enviPath}},
author = {Jasmin Hafner and Tim Lorsbach and Sebastian Schmidt and Liam Brydon and Katharina Dost and Kunyang Zhang and Kathrin Fenner and J{\"o}rg Wicker},
url = {https://jcheminf.biomedcentral.com/articles/10.1186/s13321-024-00881-6
https://envipath.org},
doi = {10.1186/s13321-024-00881-6},
issn = {1758-2946},
year = {2024},
date = {2024-08-06},
urldate = {2024-08-06},
journal = {Journal of Cheminformatics},
volume = {16},
number = {1},
pages = {93},
abstract = {enviPath is a widely used database and prediction system for microbial biotransformation pathways of primarily xenobiotic compounds. Data and prediction system are freely available both via a web interface and a public REST API. Since its initial release in 2016, we extended the data available in enviPath and improved the performance of the prediction system and usability of the overall system. We now provide three diverse data sets, covering microbial biotransformation in different environments and under different experimental conditions. This also enabled developing a pathway prediction model that is applicable to a more diverse set of chemicals. In the prediction engine, we implemented a new evaluation tailored towards pathway prediction, which returns a more honest and holistic view on the performance. We also implemented a novel applicability domain algorithm, which allows the user to estimate how well the model will perform on their data. Finally, we improved the implementation to speed up the overall system and provide new functionality via a plugin system.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Graffeuille, Olivier; Koh, Yun Sing; Wicker, Jörg; Lehmann, Moritz
Remote Sensing for Water Quality: A Multi-Task, Metadata-Driven Hypernetwork Approach Proceedings Article
In: Larson, Kate (Ed.): Proceedings of the Thirty-Third International Joint Conference on Artificial Intelligence (IJCAI-24), pp. 7287-7295, 2024, (AI for Good).
@inproceedings{graffeuille2024remote,
title = {Remote Sensing for Water Quality: A Multi-Task, Metadata-Driven Hypernetwork Approach},
author = {Olivier Graffeuille and Yun Sing Koh and J{\"o}rg Wicker and Moritz Lehmann},
editor = {Kate Larson},
doi = {10.24963/ijcai.2024/806},
year = {2024},
date = {2024-08-05},
urldate = {2024-08-05},
booktitle = {Proceedings of the Thirty-Third International Joint Conference on Artificial Intelligence (IJCAI-24)},
pages = {7287--7295},
abstract = {Inland water quality monitoring is vital for clean water access and aquatic ecosystem management. Remote sensing machine learning models enable large-scale observations, but are difficult to train due to data scarcity and variability across many lakes. Multi-task learning approaches enable learning of lake differences by learning multiple lake functions simultaneously. However, they suffer from a trade-off between parameter efficiency and the ability to model task differences flexibly, and struggle to model many diverse lakes with few samples per task. We propose Multi-Task Hypernetworks, a novel multi-task learning architecture which circumvents this trade-off using a shared hypernetwork to generate different network weights for each task from small task-specific embeddings. Our approach stands out from existing works by providing the added capacity to leverage task-level metadata, such as lake depth and temperature, explicitly. We show empirically that Multi-Task Hypernetworks outperform existing multi-task learning architectures for water quality remote sensing and other tabular data problems, and leverages metadata more effectively than existing methods.},
note = {AI for Good},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Long, Derek; Eade, Liam; Dost, Katharina; Meier-Menches, Samuel M; Goldstone, David C; Sullivan, Matthew P; Hartinger, Christian; Wicker, Jörg; Taskova, Katerina
AdductHunter: Identifying Protein-Metal Complex Adducts in Mass Spectra Journal Article
In: Journal of Cheminformatics, vol. 16, iss. 1, 2024, ISSN: 1758-2946.
@article{Long2023adducthunter,
title = {AdductHunter: Identifying Protein-Metal Complex Adducts in Mass Spectra},
author = {Derek Long and Liam Eade and Katharina Dost and Samuel M Meier-Menches and David C Goldstone and Matthew P Sullivan and Christian Hartinger and J{\"o}rg Wicker and Katerina Taskova},
url = {https://adducthunter.wickerlab.org
https://doi.org/10.21203/rs.3.rs-3322854/v1},
doi = {10.1186/s13321-023-00797-7},
issn = {1758-2946},
year = {2024},
date = {2024-02-06},
urldate = {2024-02-06},
journal = {Journal of Cheminformatics},
volume = {16},
issue = {1},
abstract = {Mass spectrometry (MS) is an analytical technique for molecule identification that can be used for investigating protein-metal complex interactions. Once the MS data is collected, the mass spectra are usually interpreted manually to identify the adducts formed as a result of the interactions between proteins and metal-based species. However, with increasing resolution, dataset size, and species complexity, the time required to identify adducts and the error-prone nature of manual assignment have become limiting factors in MS analysis. AdductHunter is an open-source web-based analysis tool that automates the peak identification process using constraint integer optimization to find feasible combinations of protein and fragments, and dynamic time warping to calculate the dissimilarity between the theoretical isotope pattern of a species and its experimental isotope peak distribution. Empirical evaluation on a collection of 22 unique MS datasets shows fast and accurate identification of protein-metal complex adducts in deconvoluted mass spectra.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
2023
Dost, Katharina; Tam, Jason; Lorsbach, Tim; Schmidt, Sebastian; Wicker, Jörg
Defining Applicability Domain in Biodegradation Pathway Prediction Unpublished Forthcoming
Forthcoming.
@unpublished{dost2023defining,
title = {Defining Applicability Domain in Biodegradation Pathway Prediction},
author = {Katharina Dost and Jason Tam and Tim Lorsbach and Sebastian Schmidt and J{\"o}rg Wicker},
doi = {10.21203/rs.3.rs-3587632/v1},
year = {2023},
date = {2023-11-10},
urldate = {2023-11-10},
abstract = {When developing a new chemical, investigating its long-term influences on the environment is crucial to prevent harm. Unfortunately, these experiments are time-consuming. In silico methods can learn from already obtained data to predict biotransformation pathways, and thereby help focus all development efforts on only the most promising chemicals. As all data-based models, these predictors will output pathway predictions for all input compounds in a suitable format, however, these predictions will be faulty unless the model has seen similar compounds during the training process. A common approach to prevent this for other types of models is to define an Applicability Domain for the model that makes predictions only for in-domain compounds and rejects out-of-domain ones. Nonetheless, although exploration of the compound space is particularly interesting in the development of new chemicals, no Applicability Domain method has been tailored to the specific data structure of pathway predictions yet. In this paper, we are the first to define Applicability Domain specialized in biodegradation pathway prediction. Assessing a model’s reliability from different angles, we suggest a three-stage approach that checks for applicability, reliability, and decidability of the model for a queried compound and only allows it to output a prediction if all three stages are passed. Experiments confirm that our proposed technique reliably rejects unsuitable compounds and therefore improves the safety of the biotransformation pathway predictor.},
note = {Forthcoming},
keywords = {},
pubstate = {forthcoming},
tppubtype = {unpublished}
}
Dost, Katharina; Pullar-Strecker, Zac; Brydon, Liam; Zhang, Kunyang; Hafner, Jasmin; Riddle, Pat; Wicker, Jörg
Combatting over-specialization bias in growing chemical databases Journal Article
In: Journal of Cheminformatics, vol. 15, iss. 1, pp. 53, 2023, ISSN: 1758-2946.
@article{Dost2023Combatting,
title = {Combatting over-specialization bias in growing chemical databases},
author = {Katharina Dost and Zac Pullar-Strecker and Liam Brydon and Kunyang Zhang and Jasmin Hafner and Pat Riddle and J{\"o}rg Wicker},
url = {https://jcheminf.biomedcentral.com/articles/10.1186/s13321-023-00716-w},
doi = {10.1186/s13321-023-00716-w},
issn = {1758-2946},
year = {2023},
date = {2023-05-19},
urldate = {2023-05-19},
journal = {Journal of Cheminformatics},
volume = {15},
issue = {1},
pages = {53},
abstract = {Background
Predicting in advance the behavior of new chemical compounds can support the design process of new products by directing the research toward the most promising candidates and ruling out others. Such predictive models can be data-driven using Machine Learning or based on researchers’ experience and depend on the collection of past results. In either case: models (or researchers) can only make reliable assumptions about compounds that are similar to what they have seen before. Therefore, consequent usage of these predictive models shapes the dataset and causes a continuous specialization shrinking the applicability domain of all trained models on this dataset in the future, and increasingly harming model-based exploration of the space.
Proposed solution
In this paper, we propose cancels (CounterActiNg Compound spEciaLization biaS), a technique that helps to break the dataset specialization spiral. Aiming for a smooth distribution of the compounds in the dataset, we identify areas in the space that fall short and suggest additional experiments that help bridge the gap. Thereby, we generally improve the dataset quality in an entirely unsupervised manner and create awareness of potential flaws in the data. cancels does not aim to cover the entire compound space and hence retains a desirable degree of specialization to a specified research domain.
Results
An extensive set of experiments on the use-case of biodegradation pathway prediction not only reveals that the bias spiral can indeed be observed but also that cancels produces meaningful results. Additionally, we demonstrate that mitigating the observed bias is crucial as it cannot only intervene with the continuous specialization process, but also significantly improves a predictor’s performance while reducing the number of required experiments. Overall, we believe that cancels can support researchers in their experimentation process to not only better understand their data and potential flaws, but also to grow the dataset in a sustainable way. All code is available under github.com/KatDost/Cancels.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Predicting in advance the behavior of new chemical compounds can support the design process of new products by directing the research toward the most promising candidates and ruling out others. Such predictive models can be data-driven using Machine Learning or based on researchers’ experience and depend on the collection of past results. In either case: models (or researchers) can only make reliable assumptions about compounds that are similar to what they have seen before. Therefore, consequent usage of these predictive models shapes the dataset and causes a continuous specialization shrinking the applicability domain of all trained models on this dataset in the future, and increasingly harming model-based exploration of the space.
Proposed solution
In this paper, we propose cancels (CounterActiNg Compound spEciaLization biaS), a technique that helps to break the dataset specialization spiral. Aiming for a smooth distribution of the compounds in the dataset, we identify areas in the space that fall short and suggest additional experiments that help bridge the gap. Thereby, we generally improve the dataset quality in an entirely unsupervised manner and create awareness of potential flaws in the data. cancels does not aim to cover the entire compound space and hence retains a desirable degree of specialization to a specified research domain.
Results
An extensive set of experiments on the use-case of biodegradation pathway prediction not only reveals that the bias spiral can indeed be observed but also that cancels produces meaningful results. Additionally, we demonstrate that mitigating the observed bias is crucial as it cannot only intervene with the continuous specialization process, but also significantly improves a predictor’s performance while reducing the number of required experiments. Overall, we believe that cancels can support researchers in their experimentation process to not only better understand their data and potential flaws, but also to grow the dataset in a sustainable way. All code is available under github.com/KatDost/Cancels.
Roeslin, Samuel; Ma, Quincy; Chigullapally, Pavan; Wicker, Jörg; Wotherspoon, Liam
Development of a Seismic Loss Prediction Model for Residential Buildings using Machine Learning – Christchurch, New Zealand Journal Article
In: Natural Hazards and Earth System Sciences, vol. 23, no. 3, pp. 1207-1226, 2023.
@article{Roeslin2023development,
title = {Development of a Seismic Loss Prediction Model for Residential Buildings using Machine Learning \textendash Christchurch, New Zealand},
author = {Samuel Roeslin and Quincy Ma and Pavan Chigullapally and J{\"o}rg Wicker and Liam Wotherspoon},
url = {https://nhess.copernicus.org/articles/23/1207/2023/},
doi = {10.5194/nhess-23-1207-2023},
year = {2023},
date = {2023-03-22},
urldate = {2023-03-22},
journal = {Natural Hazards and Earth System Sciences},
volume = {23},
number = {3},
pages = {1207--1226},
abstract = {This paper presents a new framework for the seismic loss prediction of residential buildings in Christchurch, New Zealand. It employs data science techniques, geospatial tools, and machine learning (ML) trained on insurance claims data from the Earthquake Commission (EQC) collected following the 2010\textendash2011 Canterbury Earthquake Sequence (CES). The seismic loss prediction obtained from the ML model is shown to outperform the output from existing risk analysis tools for New Zealand for each of the main earthquakes of the CES. In addition to the prediction capabilities, the ML model delivered useful insights into the most important features contributing to losses during the CES. ML correctly highlighted that liquefaction significantly influenced buildings losses for the 22 February 2011 earthquake. The results are consistent with observations, engineering knowledge, and previous studies, confirming the potential of data science and ML in the analysis of insurance claims data and the development of seismic loss prediction models using empirical loss data.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
2022
Poonawala-Lohani, Nooriyan; Riddle, Pat; Adnan, Mehnaz; Wicker, Jörg
Geographic Ensembles of Observations using Randomised Ensembles of Autoregression Chains: Ensemble methods for spatio-temporal Time Series Forecasting of Influenza-like Illness Proceedings Article
In: pp. 1-7, Association for Computing Machinery, New York, NY, USA, 2022, ISBN: 9781450393867.
@inproceedings{Poonawala-Lohani2022geographic,
title = {Geographic Ensembles of Observations using Randomised Ensembles of Autoregression Chains: Ensemble methods for spatio-temporal Time Series Forecasting of Influenza-like Illness},
author = {Nooriyan Poonawala-Lohani and Pat Riddle and Mehnaz Adnan and J{\"o}rg Wicker},
doi = {10.1145/3535508.3545562},
isbn = {9781450393867},
year = {2022},
date = {2022-08-07},
booktitle = {Proceedings of the 13th ACM International Conference on Bioinformatics, Computational Biology and Health Informatics},
pages = {1--7},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
abstract = {Influenza is a communicable respiratory illness that can cause serious public health hazards. Flu surveillance in New Zealand tracks case counts from various District health boards (DHBs) in the country to monitor the spread of influenza in different geographic locations. Many factors contribute to the spread of the influenza across a geographic region, and it can be challenging to forecast cases in one region without taking into account case numbers in another region. This paper proposes a novel ensemble method called Geographic Ensembles of Observations using Randomised Ensembles of Autoregression Chains (GEO-Reach). GEO-Reach is an ensemble technique that uses a two layer approach to utilise interdependence of historical case counts between geographic regions in New Zealand. This work extends a previously published method by the authors called Randomized Ensembles of Auto-regression chains (Reach). State-of-the-art forecasting models look at studying the spread of the virus. They focus on accurate forecasting of cases for a location using historical case counts for the same location and other data sources based on human behaviour such as movement of people across cities/geographic regions. This new approach is evaluated using Influenza like illness (ILI) case counts in 7 major regions in New Zealand from the years 2015-2019 and compares its performance with other standard methods such as Dante, ARIMA, Autoregression and Random Forests. The results demonstrate that the proposed method performed better than baseline methods when applied to this multi-variate time series forecasting problem.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Graffeuille, Olivier; Koh, Yun Sing; Wicker, Jörg; Lehmann, Moritz
Semi-Supervised Conditional Density Estimation with Wasserstein Laplacian Regularisation Proceedings Article
In: Proceedings of the Thirty-Sixth AAAI Conference on Artificial Intelligence, pp. 6746-6754, 2022.
@inproceedings{graffeuille2022semi,
title = {Semi-Supervised Conditional Density Estimation with Wasserstein Laplacian Regularisation},
author = {Olivier Graffeuille and Yun Sing Koh and J{\"o}rg Wicker and Moritz Lehmann},
url = {https://ojs.aaai.org/index.php/AAAI/article/view/20630},
doi = {10.1609/aaai.v36i6.20630},
year = {2022},
date = {2022-06-28},
urldate = {2022-06-28},
booktitle = {Proceedings of the Thirty-Sixth AAAI Conference on Artificial Intelligence},
volume = {36},
number = {6},
pages = {6746--6754},
abstract = {Conditional Density Estimation (CDE) has wide-reaching applicability to various real-world problems, such as spatial density estimation and environmental modelling. CDE estimates the probability density of a random variable rather than a single value and can thus model uncertainty and inverse problems. This task is inherently more complex than regression, and many algorithms suffer from overfitting, particularly when modelled with few labelled data points. For applications where unlabelled data is abundant but labelled data is scarce, we propose Wasserstein Laplacian Regularisation, a semi-supervised learning framework that allows CDE algorithms to leverage these unlabelled data. The framework minimises an objective function which ensures that the learned model is smooth along the manifold of the underlying data, as measured by Wasserstein distance. When applying our framework to Mixture Density Networks, the resulting semi-supervised algorithm can achieve similar performance to a supervised model with up to three times as many labelled data points on baseline datasets. We additionally apply our technique to the problem of remote sensing for chlorophyll-a estimation in inland waters.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Poonawala-Lohani, Nooriyan; Riddle, Pat; Adnan, Mehnaz; Wicker, Jörg
A Novel Approach for Time Series Forecasting of Influenza-like Illness Using a Regression Chain Method Proceedings Article
In: Altman, Russ; Dunker, Keith; Hunter, Lawrence; Ritchie, Marylyn; Murray, Tiffany; Klein, Teri (Ed.): Pacific Symposium on Biocomputing, pp. 301-312, 2022.
@inproceedings{poonawala-lohani2022novel,
title = {A Novel Approach for Time Series Forecasting of Influenza-like Illness Using a Regression Chain Method},
author = {Nooriyan Poonawala-Lohani and Pat Riddle and Mehnaz Adnan and J{\"o}rg Wicker},
editor = {Russ Altman and Keith Dunker and Lawrence Hunter and Marylyn Ritchie and Tiffany Murray and Teri Klein},
url = {https://www.worldscientific.com/doi/abs/10.1142/9789811250477_0028
http://psb.stanford.edu/psb-online/proceedings/psb22/poorawala-lohani.pdf},
doi = {10.1142/9789811250477_0028},
year = {2022},
date = {2022-01-03},
urldate = {2022-01-03},
booktitle = {Pacific Symposium on Biocomputing},
volume = {27},
pages = {301--312},
abstract = {Influenza is a communicable respiratory illness that can cause serious public health hazards. Due to its huge threat to the community, accurate forecasting of Influenza-like-illness (ILI) can diminish the impact of an influenza season by enabling early public health interventions. Current forecasting models are limited in their performance, particularly when using a longer forecasting window. To support better forecasts over a longer forecasting window, we propose to use additional features such as weather data. Commonly used methods to fore-cast ILI, including statistical methods such as ARIMA, limit prediction performance when using additional data sources that might have complex non-linear associations with ILI incidence. This paper proposes a novel time series forecasting method, Randomized Ensembles of Auto-regression chains (Reach). Reach implements an ensemble of random chains for multi-step time series forecasting. This new approach is evaluated on ILI case counts in Auckland, New Zealand from the years 2015-2018 and compared to other standard methods. The results demonstrate that the proposed method performed better than baseline methods when applied to this multi-variate time series forecasting problem.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
2021
Tam, Jason; Lorsbach, Tim; Schmidt, Sebastian; Wicker, Jörg
Holistic Evaluation of Biodegradation Pathway Prediction: Assessing Multi-Step Reactions and Intermediate Products Journal Article
In: Journal of Cheminformatics, vol. 13, no. 1, pp. 63, 2021.
@article{tam2021holisticb,
  title     = {Holistic Evaluation of Biodegradation Pathway Prediction: Assessing Multi-Step Reactions and Intermediate Products},
  author    = {Jason Tam and Tim Lorsbach and Sebastian Schmidt and J\"{o}rg Wicker},
  url       = {https://jcheminf.biomedcentral.com/articles/10.1186/s13321-021-00543-x
https://chemrxiv.org/articles/preprint/Holistic_Evaluation_of_Biodegradation_Pathway_Prediction_Assessing_Multi-Step_Reactions_and_Intermediate_Products/14315963
https://dx.doi.org/10.26434/chemrxiv.14315963},
  doi       = {10.1186/s13321-021-00543-x},
  year      = {2021},
  date      = {2021-09-03},
  urldate   = {2021-09-03},
  journal   = {Journal of Cheminformatics},
  volume    = {13},
  number    = {1},
  pages     = {63},
  abstract  = {The prediction of metabolism and biotransformation pathways of xenobiotics is a highly desired tool in environmental sciences, drug discovery, and (eco)toxicology. Several systems predict single transformation steps or complete pathways as series of parallel and subsequent steps. Their performance is commonly evaluated on the level of a single transformation step. Such an approach cannot account for some specific challenges that are caused by specific properties of biotransformation experiments. That is, missing transformation products in the reference data that occur only in low concentrations, e.g. transient intermediates or higher-generation metabolites. Furthermore, some rule-based prediction systems evaluate the performance only based on the defined set of transformation rules. Therefore, the performance of these models cannot be directly compared. In this paper, we introduce a new evaluation framework that extends the evaluation of biotransformation prediction from single transformations to whole pathways, taking into account multiple generations of metabolites. We introduce a procedure to address transient intermediates and propose a weighted scoring system that acknowledges the uncertainty of higher-generation metabolites. We implemented this framework in enviPath and demonstrate its strict performance metrics on predictions of in vitro biotransformation and degradation of xenobiotics in soil. Our approach is model-agnostic and can be transferred to other prediction systems. It is also capable of revealing knowledge gaps in terms of incompletely defined sets of transformation rules.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}
Stepišnik, Tomaž; Škrlj, Blaž; Wicker, Jörg; Kocev, Dragi
A comprehensive comparison of molecular feature representations for use in predictive modeling Journal Article
In: Computers in Biology and Medicine, vol. 130, pp. 104197, 2021, ISSN: 0010-4825.
@article{stepisnik2021comprehensive,
  title     = {A comprehensive comparison of molecular feature representations for use in predictive modeling},
  author    = {Toma\v{z} Stepi\v{s}nik and Bla\v{z} \v{S}krlj and J\"{o}rg Wicker and Dragi Kocev},
  url       = {http://www.sciencedirect.com/science/article/pii/S001048252030528X},
  doi       = {10.1016/j.compbiomed.2020.104197},
  issn      = {0010-4825},
  year      = {2021},
  date      = {2021-03-01},
  journal   = {Computers in Biology and Medicine},
  volume    = {130},
  pages     = {104197},
  abstract  = {Machine learning methods are commonly used for predicting molecular properties to accelerate material and drug design. An important part of this process is deciding how to represent the molecules. Typically, machine learning methods expect examples represented by vectors of values, and many methods for calculating molecular feature representations have been proposed. In this paper, we perform a comprehensive comparison of different molecular features, including traditional methods such as fingerprints and molecular descriptors, and recently proposed learnable representations based on neural networks. Feature representations are evaluated on 11 benchmark datasets, used for predicting properties and measures such as mutagenicity, melting points, activity, solubility, and IC50. Our experiments show that several molecular features work similarly well over all benchmark datasets. The ones that stand out most are Spectrophores, which give significantly worse performance than other features on most datasets. Molecular descriptors from the PaDEL library seem very well suited for predicting physical properties of molecules. Despite their simplicity, MACCS fingerprints performed very well overall. The results show that learnable representations achieve competitive performance compared to expert based representations. However, task-specific representations (graph convolutions and Weave methods) rarely offer any benefits, even though they are computationally more demanding. Lastly, combining different molecular feature representations typically does not give a noticeable improvement in performance compared to individual feature representations.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}
2020
Chester, Andrew; Koh, Yun Sing; Wicker, Jörg; Sun, Quan; Lee, Junjae
Balancing Utility and Fairness against Privacy in Medical Data Proceedings Article
In: IEEE Symposium Series on Computational Intelligence (SSCI), pp. 1226-1233, IEEE, 2020.
@inproceedings{chester2020balancing,
title = {Balancing Utility and Fairness against Privacy in Medical Data},
author = {Andrew Chester and Yun Sing Koh and J{\"o}rg Wicker and Quan Sun and Junjae Lee},
url = {https://ieeexplore.ieee.org/abstract/document/9308226},
doi = {10.1109/SSCI47803.2020.9308226},
year = {2020},
date = {2020-12-01},
booktitle = {IEEE Symposium Series on Computational Intelligence (SSCI)},
pages = {1226--1233},
publisher = {IEEE},
abstract = {There are numerous challenges when designing algorithms that interact with sensitive data, such as, medical or financial records. One of these challenges is privacy. However, there is a tension between privacy, utility (model accuracy), and fairness. While de-identification techniques, such as generalisation and suppression, have been proposed to enable privacy protection, it comes with a cost, specifically to fairness and utility. Recent work on fairness in algorithm design defines fairness as a guarantee of similar outputs for "similar" input data. This notion is discussed in connection to de-identification. This research investigates the trade-off between privacy, fairness, and utility. In contrast, other work investigates the trade-off between privacy and utility of the data or accuracy of the model overall. In this research, we investigate the effects of two standard de-identification techniques, k-anonymity and differential privacy, on both utility and fairness. We propose two measures to calculate the trade-off between privacy-utility and privacy-fairness. Although other research has provided guarantees for privacy regarding utility, this research focuses on the trade-offs given set de-identification levels and relies on guarantees provided by the privacy preservation methods. We discuss the effects of de-identification on data of different characteristics, class imbalance and outcome imbalance. We evaluated this is on synthetic datasets and standard real-world datasets. As a case study, we analysed the Medical Expenditure Panel Survey dataset.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Roeslin, Samuel; Ma, Quincy; Chigullapally, Pavan; Wicker, Jörg; Wotherspoon, Liam
Feature Engineering for a Seismic Loss Prediction Model using Machine Learning, Christchurch Experience Proceedings Article
In: 17th World Conference on Earthquake Engineering, 2020.
@inproceedings{roeslin2020feature,
title = {Feature Engineering for a Seismic Loss Prediction Model using Machine Learning, Christchurch Experience},
author = {Samuel Roeslin and Quincy Ma and Pavan Chigullapally and J{\"o}rg Wicker and Liam Wotherspoon},
url = {https://www.researchgate.net/profile/Samuel_Roeslin/publication/344503593_Feature_Engineering_for_a_Seismic_Loss_Prediction_Model_using_Machine_Learning_Christchurch_Experience/links/5f7d015a92851c14bcb36ed7/Feature-Engineering-for-a-Seismic-Loss-Prediction-Model-using-Machine-Learning-Christchurch-Experience.pdf},
year = {2020},
date = {2020-09-17},
booktitle = {17th World Conference on Earthquake Engineering},
abstract = {The city of Christchurch, New Zealand experienced four major earthquakes (MW $>$ 5.9) and multiple aftershocks between 4 September 2010 and 23 December 2011. This series of earthquakes, commonly known as the Canterbury Earthquake Sequence (CES), induced over NZ\$40 billion in total economic losses. Liquefaction alone led to building damage in 51,000 of the 140,000 residential buildings, with around 15,000 houses left unpractical to repair. Widespread damage to residential buildings highlighted the need for improved seismic prediction tools and to better understand factors influencing damage. Fortunately, due to New Zealand unique insurance setting, up to 80\% of the losses were insured. Over the entire CES, insurers received more than 650,000 claims. This research project employs multi-disciplinary empirical data gathered during and prior to the CES to develop a seismic loss prediction model for residential buildings in Christchurch using machine learning. The intent is to develop a procedure for developing insights from post-earthquake data that is subjected to continuous updating, to enable identification of critical parameters affecting losses, and to apply such a model to establish priority building stock for risk mitigation measures. The following paper describes the complex data preparation process required for the application of machine learning techniques. The paper covers the production of a merged dataset with information from the Earthquake Commission (EQC) claim database, building characteristics from RiskScape, seismic demand interpolated from GeoNet strong motion records, liquefaction occurrence from the New Zealand Geotechnical Database (NZGD) and soil conditions from Land Resource Information Systems (LRIS).},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Roeslin, Samuel; Ma, Quincy; Juárez-Garcia, Hugon; Gómez-Bernal, Alonso; Wicker, Jörg; Wotherspoon, Liam
A machine learning damage prediction model for the 2017 Puebla-Morelos, Mexico, earthquake Journal Article
In: Earthquake Spectra, vol. 36, no. 2, pp. 314-339, 2020.
@article{roeslin2020machine,
title = {A machine learning damage prediction model for the 2017 {Puebla-Morelos}, {Mexico}, earthquake},
author = {Samuel Roeslin and Quincy Ma and Hugon Ju\'{a}rez-Garcia and Alonso G\'{o}mez-Bernal and J\"{o}rg Wicker and Liam Wotherspoon},
doi = {10.1177/8755293020936714},
year = {2020},
date = {2020-07-30},
journal = {Earthquake Spectra},
volume = {36},
number = {2},
pages = {314--339},
abstract = {The 2017 Puebla, Mexico, earthquake event led to significant damage in many buildings in Mexico City. In the months following the earthquake, civil engineering students conducted detailed building assessments throughout the city. They collected building damage information and structural characteristics for 340 buildings in the Mexico City urban area, with an emphasis on the Roma and Condesa neighborhoods where they assessed 237 buildings. These neighborhoods are of particular interest due to the availability of seismic records captured by nearby recording stations, and preexisting information from when the neighborhoods were affected by the 1985 Michoac\'{a}n earthquake. This article presents a case study on developing a damage prediction model using machine learning. It details a framework suitable for working with future post-earthquake observation data. Four algorithms able to perform classification tasks were trialed. Random forest, the best performing algorithm, achieves more than 65% prediction accuracy. The study of the feature importance for the random forest shows that the building location, seismic demand, and building height are the parameters that influence the model output the most.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
2019
Roeslin, Samuel; Ma, Quincy; Wicker, Jörg; Wotherspoon, Liam
Data integration for the development of a seismic loss prediction model for residential buildings in New Zealand Proceedings Article
In: Cellier, Peggy; Driessens, Kurt (Ed.): Machine Learning and Knowledge Discovery in Databases, pp. 88-100, Springer International Publishing, Cham, 2019, ISBN: 978-3-030-43887-6.
@inproceedings{roeslin2019data,
title = {Data integration for the development of a seismic loss prediction model for residential buildings in {New Zealand}},
author = {Samuel Roeslin and Quincy Ma and J\"{o}rg Wicker and Liam Wotherspoon},
editor = {Peggy Cellier and Kurt Driessens},
url = {https://link.springer.com/chapter/10.1007/978-3-030-43887-6_8},
doi = {10.1007/978-3-030-43887-6_8},
isbn = {978-3-030-43887-6},
year = {2019},
date = {2019-09-19},
booktitle = {Machine Learning and Knowledge Discovery in Databases},
pages = {88--100},
publisher = {Springer International Publishing},
address = {Cham},
abstract = {In 2010--2011, New Zealand experienced the most damaging earthquakes in its history. It led to extensive damage to Christchurch buildings, infrastructure and its surroundings; affecting commercial and residential buildings. The direct economic losses represented 20% of New Zealand's GDP in 2011. Owing to New Zealand's particular insurance structure, the insurance sector contributed to over 80% of losses for a total of more than NZ$31 billion. Amongst this, over NZ$11 billion of the losses arose from residential building claims and were covered either partially or entirely from the NZ government backed Earthquake Commission (EQC) cover insurance scheme. In the process of resolving the claims, EQC collected detailed financial loss data, post-event observations and building characteristics for each of the approximately 434,000 claims lodged following the Canterbury Earthquake sequence (CES). Added to this, the active NZ earthquake engineering community treated the event as a large scale outdoor experiment and collected extensive data on the ground shaking levels, soil conditions, and liquefaction occurrence throughout wider Christchurch. This paper discusses the necessary data preparation process preceding the development of a machine learning seismic loss model. The process draws heavily upon using Geographic Information System (GIS) techniques to aggregate relevant information from multiple databases interpolating data between categories and converting data between continuous and categorical forms. Subsequently, the database is processed, and a residential seismic loss prediction model is developed using machine learning. The aim is to develop a `grey-box' model enabling human interpretability of the decision steps.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
2017
Latino, Diogo; Wicker, Jörg; Gütlein, Martin; Schmid, Emanuel; Kramer, Stefan; Fenner, Kathrin
Eawag-Soil in enviPath: a new resource for exploring regulatory pesticide soil biodegradation pathways and half-life data Journal Article
In: Environmental Science: Processes & Impacts, 2017.
@article{latino2017eawag,
title = {{Eawag-Soil} in {enviPath}: a new resource for exploring regulatory pesticide soil biodegradation pathways and half-life data},
author = {Diogo Latino and J\"{o}rg Wicker and Martin G\"{u}tlein and Emanuel Schmid and Stefan Kramer and Kathrin Fenner},
doi = {10.1039/C6EM00697C},
year = {2017},
date = {2017-01-01},
journal = {Environmental Science: Processes \& Impacts},
publisher = {The Royal Society of Chemistry},
abstract = {Developing models for the prediction of microbial biotransformation pathways and half-lives of trace organic contaminants in different environments requires as training data easily accessible and sufficiently large collections of respective biotransformation data that are annotated with metadata on study conditions. Here, we present the Eawag-Soil package, a public database that has been developed to contain all freely accessible regulatory data on pesticide degradation in laboratory soil simulation studies
for pesticides registered in the EU (282 degradation pathways, 1535 reactions, 1619 compounds and 4716 biotransformation half-life values with corresponding metadata on study conditions). We provide a thorough description of this novel data resource, and discuss important features of the pesticide soil degradation data that are relevant for model development. Most notably, the variability of half-life values for individual compounds is large and only about one order of magnitude lower than the entire range of median half-life values spanned by all compounds, demonstrating the need to consider study conditions in the development of more accurate models for biotransformation prediction. We further show how the data can be used to find missing rules relevant for predicting soil biotransformation pathways. From this analysis, eight examples of reaction types were presented that should trigger the formulation of new biotransformation rules, e.g., Ar-OH methylation, or the extension of existing rules e.g., hydroxylation in aliphatic rings. The data were also used to exemplarily explore the dependence of half-lives of different amide pesticides on chemical class and experimental parameters. This analysis highlighted the value of considering initial transformation reactions for the development of meaningful quantitative-structure biotransformation relationships (QSBR), which is a novel opportunity offered by the simultaneous encoding of transformation reactions and corresponding half-lives in Eawag-Soil. Overall, Eawag-Soil provides an unprecedentedly rich collection of manually extracted and curated biotransformation data, which should be useful in a great variety of applications.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
for pesticides registered in the EU (282 degradation pathways, 1535 reactions, 1619 compounds and 4716 biotransformation half-life values with corresponding metadata on study conditions). We provide a thorough description of this novel data resource, and discuss important features of the pesticide soil degradation data that are relevant for model development. Most notably, the variability of half-life values for individual compounds is large and only about one order of magnitude lower than the entire range of median half-life values spanned by all compounds, demonstrating the need to consider study conditions in the development of more accurate models for biotransformation prediction. We further show how the data can be used to find missing rules relevant for predicting soil biotransformation pathways. From this analysis, eight examples of reaction types were presented that should trigger the formulation of new biotransformation rules, e.g., Ar-OH methylation, or the extension of existing rules e.g., hydroxylation in aliphatic rings. The data were also used to exemplarily explore the dependence of half-lives of different amide pesticides on chemical class and experimental parameters. This analysis highlighted the value of considering initial transformation reactions for the development of meaningful quantitative-structure biotransformation relationships (QSBR), which is a novel opportunity offered by the simultaneous encoding of transformation reactions and corresponding half-lives in Eawag-Soil. Overall, Eawag-Soil provides an unprecedentedly rich collection of manually extracted and curated biotransformation data, which should be useful in a great variety of applications.
2016
Wicker, Jörg; Fenner, Kathrin; Kramer, Stefan
A Hybrid Machine Learning and Knowledge Based Approach to Limit Combinatorial Explosion in Biodegradation Prediction Book Section
In: Lässig, Jörg; Kersting, Kristian; Morik, Katharina (Ed.): Computational Sustainability, pp. 75-97, Springer International Publishing, Cham, 2016, ISBN: 978-3-319-31858-5.
@incollection{wicker2016ahybrid,
title = {A Hybrid Machine Learning and Knowledge Based Approach to Limit Combinatorial Explosion in Biodegradation Prediction},
author = {J\"{o}rg Wicker and Kathrin Fenner and Stefan Kramer},
editor = {J\"{o}rg L\"{a}ssig and Kristian Kersting and Katharina Morik},
doi = {10.1007/978-3-319-31858-5_5},
isbn = {978-3-319-31858-5},
year = {2016},
date = {2016-04-21},
booktitle = {Computational Sustainability},
pages = {75--97},
publisher = {Springer International Publishing},
address = {Cham},
abstract = {One of the main tasks in chemical industry regarding the sustainability of a product is the prediction of its environmental fate, i.e., its degradation products and pathways. Current methods for the prediction of biodegradation products and pathways of organic environmental pollutants either do not take into account domain knowledge or do not provide probability estimates. In this chapter, we propose a hybrid knowledge-based and machine learning-based approach to overcome these limitations in the context of the University of Minnesota Pathway Prediction System (UM-PPS). The proposed solution performs relative reasoning in a machine learning framework, and obtains one probability estimate for each biotransformation rule of the system. Since the application of a rule then depends on a threshold for the probability estimate, the trade-off between recall (sensitivity) and precision (selectivity) can be addressed and leveraged in practice. Results from leave-one-out cross-validation show that a recall and precision of approximately 0.8 can be achieved for a subset of 13 transformation rules. The set of used rules is further extended using multi-label classification, where dependencies among the transformation rules are exploited to improve the predictions. While the results regarding recall and precision vary, the area under the ROC curve can be improved using multi-label classification. Therefore, it is possible to optimize precision without compromising recall. Recently, we integrated the presented approach into enviPath, a complete redesign and re-implementation of UM-PPS.},
keywords = {},
pubstate = {published},
tppubtype = {incollection}
}
Wicker, Jörg; Lorsbach, Tim; Gütlein, Martin; Schmid, Emanuel; Latino, Diogo; Kramer, Stefan; Fenner, Kathrin
enviPath – The Environmental Contaminant Biotransformation Pathway Resource Journal Article
In: Nucleic Acids Research, vol. 44, no. D1, pp. D502-D508, 2016.
@article{wicker2016envipath,
title = {{enviPath} -- The Environmental Contaminant Biotransformation Pathway Resource},
author = {J\"{o}rg Wicker and Tim Lorsbach and Martin G\"{u}tlein and Emanuel Schmid and Diogo Latino and Stefan Kramer and Kathrin Fenner},
editor = {Michael Galperin},
url = {http://nar.oxfordjournals.org/content/44/D1/D502.abstract},
doi = {10.1093/nar/gkv1229},
year = {2016},
date = {2016-01-01},
journal = {Nucleic Acids Research},
volume = {44},
number = {D1},
pages = {D502--D508},
abstract = {The University of Minnesota Biocatalysis/Biodegradation Database and Pathway Prediction System (UM-BBD/PPS) has been a unique resource covering microbial biotransformation pathways of primarily xenobiotic chemicals for over 15 years. This paper introduces the successor system, enviPath (The Environmental Contaminant Biotransformation Pathway Resource), which is a complete redesign and reimplementation of UM-BBD/PPS. enviPath uses the database from the UM-BBD/PPS as a basis, extends the use of this database, and allows users to include their own data to support multiple use cases. Relative reasoning is supported for the refinement of predictions and to allow its extensions in terms of previously published, but not implemented machine learning models. User access is simplified by providing a REST API that simplifies the inclusion of enviPath into existing workflows. An RDF database is used to enable simple integration with other databases. enviPath is publicly available at https://envipath.org with free and open access to its core data.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
2013
Wicker, Jörg
Large Classifier Systems in Bio- and Cheminformatics PhD Thesis
Technische Universität München, 2013.
@phdthesis{wicker2013large,
title = {Large Classifier Systems in Bio- and Cheminformatics},
author = {J\"{o}rg Wicker},
url = {http://mediatum.ub.tum.de/node?id=1165858},
year = {2013},
date = {2013-01-01},
school = {Technische Universit\"{a}t M\"{u}nchen},
abstract = {Large classifier systems are machine learning algorithms that use multiple
classifiers to improve the prediction of target values in advanced
classification tasks. Although learning problems in bio- and
cheminformatics commonly provide data in schemes suitable for large
classifier systems, they are rarely used in these domains. This thesis
introduces two new classifiers incorporating systems of classifiers
using Boolean matrix decomposition to handle data in a schema that
often occurs in bio- and cheminformatics.
The first approach, called MLC-BMaD (multi-label classification using
Boolean matrix decomposition), uses Boolean matrix decomposition to
decompose the labels in a multi-label classification task. The
decomposed matrices are a compact representation of the information
in the labels (first matrix) and the dependencies among the labels
(second matrix). The first matrix is used in a further multi-label
classification while the second matrix is used to generate the final
matrix from the predicted values of the first matrix.
MLC-BMaD was evaluated on six standard multi-label data sets, the
experiments showed that MLC-BMaD can perform particularly well on data
sets with a high number of labels and a small number of instances and
can outperform standard multi-label algorithms.
Subsequently, MLC-BMaD is extended to a special case of
multi-relational learning, by considering the labels not as simple
labels, but instances. The algorithm, called ClassFact
(Classification factorization), uses both matrices in a multi-label
classification. Each label represents a mapping between two
instances.
Experiments on three data sets from the domain of bioinformatics show
that ClassFact can outperform the baseline method, which merges the
relations into one, on hard classification tasks.
Furthermore, large classifier systems are used on two cheminformatics
data sets, the first one is used to predict the environmental fate of
chemicals by predicting biodegradation pathways. The second is a data
set from the domain of predictive toxicology. In biodegradation
pathway prediction, I extend a knowledge-based system and incorporate
a machine learning approach to predict a probability for
biotransformation products based on the structure- and knowledge-based
predictions of products, which are based on transformation rules. The
use of multi-label classification improves the performance of the
classifiers and extends the number of transformation rules that can be
covered.
For the prediction of toxic effects of chemicals, I applied large
classifier systems to the ToxCast\texttrademark{} data set, which maps
toxic effects to chemicals. As the given toxic effects are not easy to
predict due to missing information and a skewed class
distribution, I introduce a filtering step in the multi-label
classification, which finds labels that are usable in multi-label
prediction and does not take the others in the
prediction into account. Experiments show
that this approach can improve upon the baseline method using binary
classification, as well as multi-label approaches using no filtering.
The presented results show that large classifier systems can play a
role in future research challenges, especially in bio- and
cheminformatics, where data sets frequently consist of more complex
structures and data can be rather small in terms of the number of
instances compared to other domains.},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
classifiers to improve the prediction of target values in advanced
classification tasks. Although learning problems in bio- and
cheminformatics commonly provide data in schemes suitable for large
classifier systems, they are rarely used in these domains. This thesis
introduces two new classifiers incorporating systems of classifiers
using Boolean matrix decomposition to handle data in a schema that
often occurs in bio- and cheminformatics.
The first approach, called MLC-BMaD (multi-label classification using
Boolean matrix decomposition), uses Boolean matrix decomposition to
decompose the labels in a multi-label classification task. The
decomposed matrices are a compact representation of the information
in the labels (first matrix) and the dependencies among the labels
(second matrix). The first matrix is used in a further multi-label
classification while the second matrix is used to generate the final
matrix from the predicted values of the first matrix.
MLC-BMaD was evaluated on six standard multi-label data sets, the
experiments showed that MLC-BMaD can perform particularly well on data
sets with a high number of labels and a small number of instances and
can outperform standard multi-label algorithms.
Subsequently, MLC-BMaD is extended to a special case of
multi-relational learning, by considering the labels not as simple
labels, but instances. The algorithm, called ClassFact
(Classification factorization), uses both matrices in a multi-label
classification. Each label represents a mapping between two
instances.
Experiments on three data sets from the domain of bioinformatics show
that ClassFact can outperform the baseline method, which merges the
relations into one, on hard classification tasks.
Furthermore, large classifier systems are used on two cheminformatics
data sets, the first one is used to predict the environmental fate of
chemicals by predicting biodegradation pathways. The second is a data
set from the domain of predictive toxicology. In biodegradation
pathway prediction, I extend a knowledge-based system and incorporate
a machine learning approach to predict a probability for
biotransformation products based on the structure- and knowledge-based
predictions of products, which are based on transformation rules. The
use of multi-label classification improves the performance of the
classifiers and extends the number of transformation rules that can be
covered.
For the prediction of toxic effects of chemicals, I applied large
classifier systems to the ToxCast™ data set, which maps
toxic effects to chemicals. As the given toxic effects are not easy to
predict due to missing information and a skewed class
distribution, I introduce a filtering step in the multi-label
classification, which finds labels that are usable in multi-label
prediction and does not take the others in the
prediction into account. Experiments show
that this approach can improve upon the baseline method using binary
classification, as well as multi-label approaches using no filtering.
The presented results show that large classifier systems can play a
role in future research challenges, especially in bio- and
cheminformatics, where data sets frequently consist of more complex
structures and data can be rather small in terms of the number of
instances compared to other domains.
2010
Hardy, Barry; Douglas, Nicki; Helma, Christoph; Rautenberg, Micha; Jeliazkova, Nina; Jeliazkov, Vedrin; Nikolova, Ivelina; Benigni, Romualdo; Tcheremenskaia, Olga; Kramer, Stefan; Girschick, Tobias; Buchwald, Fabian; Wicker, Jörg; Karwath, Andreas; Gütlein, Martin; Maunz, Andreas; Sarimveis, Haralambos; Melagraki, Georgia; Afantitis, Antreas; Sopasakis, Pantelis; Gallagher, David; Poroikov, Vladimir; Filimonov, Dmitry; Zakharov, Alexey; Lagunin, Alexey; Gloriozova, Tatyana; Novikov, Sergey; Skvortsova, Natalia; Druzhilovsky, Dmitry; Chawla, Sunil; Ghosh, Indira; Ray, Surajit; Patel, Hitesh; Escher, Sylvia
Collaborative development of predictive toxicology applications Journal Article
In: Journal of Cheminformatics, vol. 2, no. 1, pp. 7, 2010, ISSN: 1758-2946.
@article{hardy2010collaborative,
  title     = {Collaborative development of predictive toxicology applications},
  author    = {Barry Hardy and Nicki Douglas and Christoph Helma and Micha Rautenberg and Nina Jeliazkova and Vedrin Jeliazkov and Ivelina Nikolova and Romualdo Benigni and Olga Tcheremenskaia and Stefan Kramer and Tobias Girschick and Fabian Buchwald and J\"{o}rg Wicker and Andreas Karwath and Martin G\"{u}tlein and Andreas Maunz and Haralambos Sarimveis and Georgia Melagraki and Antreas Afantitis and Pantelis Sopasakis and David Gallagher and Vladimir Poroikov and Dmitry Filimonov and Alexey Zakharov and Alexey Lagunin and Tatyana Gloriozova and Sergey Novikov and Natalia Skvortsova and Dmitry Druzhilovsky and Sunil Chawla and Indira Ghosh and Surajit Ray and Hitesh Patel and Sylvia Escher},
  url       = {http://www.jcheminf.com/content/2/1/7},
  doi       = {10.1186/1758-2946-2-7},
  issn      = {1758-2946},
  year      = {2010},
  date      = {2010-01-01},
  journal   = {Journal of Cheminformatics},
  volume    = {2},
  number    = {1},
  pages     = {7},
  abstract  = {OpenTox provides an interoperable, standards-based Framework for the support of predictive toxicology data management, algorithms, modelling, validation and reporting. It is relevant to satisfying the chemical safety assessment requirements of the REACH legislation as it supports access to experimental data, (Quantitative) Structure-Activity Relationship models, and toxicological information through an integrating platform that adheres to regulatory requirements and OECD validation principles. Initial research defined the essential components of the Framework including the approach to data access, schema and management, use of controlled vocabularies and ontologies, architecture, web service and communications protocols, and selection and integration of algorithms for predictive modelling. OpenTox provides end-user oriented tools to non-computational specialists, risk assessors, and toxicological experts in addition to Application Programming Interfaces (APIs) for developers of new applications. OpenTox actively supports public standards for data representation, interfaces, vocabularies and ontologies, Open Source approaches to core platform components, and community-based collaboration approaches, so as to progress system interoperability goals.The OpenTox Framework includes APIs and services for compounds, datasets, features, algorithms, models, ontologies, tasks, validation, and reporting which may be combined into multiple applications satisfying a variety of different user needs. OpenTox applications are based on a set of distributed, interoperable OpenTox API-compliant REST web services. 
The OpenTox approach to ontology allows for efficient mapping of complementary data coming from different datasets into a unifying structure having a shared terminology and representation.Two initial OpenTox applications are presented as an illustration of the potential impact of OpenTox for high-quality and consistent structure-activity relationship modelling of REACH-relevant endpoints: ToxPredict which predicts and reports on toxicities for endpoints for an input chemical structure, and ToxCreate which builds and validates a predictive toxicity model based on an input toxicology dataset. Because of the extensible nature of the standardised Framework design, barriers of interoperability between applications and content are removed, as the user may combine data, models and validation from multiple sources in a dependable and time-effective way.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}
Wicker, Jörg; Fenner, Kathrin; Ellis, Lynda; Wackett, Larry; Kramer, Stefan
Predicting biodegradation products and pathways: a hybrid knowledge- and machine learning-based approach Journal Article
In: Bioinformatics, vol. 26, no. 6, pp. 814-821, 2010.
@article{wicker2010predicting,
title = {Predicting biodegradation products and pathways: a hybrid knowledge- and machine learning-based approach},
author = {J\"{o}rg Wicker and Kathrin Fenner and Lynda Ellis and Larry Wackett and Stefan Kramer},
url = {http://bioinformatics.oxfordjournals.org/content/26/6/814.full},
doi = {10.1093/bioinformatics/btq024},
year = {2010},
date = {2010-01-01},
journal = {Bioinformatics},
volume = {26},
number = {6},
pages = {814--821},
publisher = {Oxford University Press},
abstract = {Motivation: Current methods for the prediction of biodegradation products and pathways of organic environmental pollutants either do not take into account domain knowledge or do not provide probability estimates. In this article, we propose a hybrid knowledge- and machine learning-based approach to overcome these limitations in the context of the University of Minnesota Pathway Prediction System (UM-PPS). The proposed solution performs relative reasoning in a machine learning framework, and obtains one probability estimate for each biotransformation rule of the system. As the application of a rule then depends on a threshold for the probability estimate, the trade-off between recall (sensitivity) and precision (selectivity) can be addressed and leveraged in practice. Results: Results from leave-one-out cross-validation show that a recall and precision of $\sim$0.8 can be achieved for a subset of 13 transformation rules. Therefore, it is possible to optimize precision without compromising recall. We are currently integrating the results into an experimental version of the UM-PPS server. Availability: The program is freely available on the web at http://wwwkramer.in.tum.de/research/applications/biodegradation/data. Contact: kramer@in.tum.de},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
2008
Wicker, Jörg; Fenner, Kathrin; Ellis, Lynda; Wackett, Larry; Kramer, Stefan
Machine Learning and Data Mining Approaches to Biodegradation Pathway Prediction Proceedings Article
In: Bridewell, Will; Calders, Toon; Medeiros, Ana Karla; Kramer, Stefan; Pechenizkiy, Mykola; Todorovski, Ljupco (Ed.): Proceedings of the Second International Workshop on the Induction of Process Models at ECML PKDD 2008, 2008.
@inproceedings{wicker2008machine,
  title     = {Machine Learning and Data Mining Approaches to Biodegradation Pathway Prediction},
  author    = {J\"{o}rg Wicker and Kathrin Fenner and Lynda Ellis and Larry Wackett and Stefan Kramer},
  editor    = {Will Bridewell and Toon Calders and Ana Karla Medeiros and Stefan Kramer and Mykola Pechenizkiy and Ljupco Todorovski},
  url       = {http://www.ecmlpkdd2008.org/files/pdf/workshops/ipm/9.pdf},
  year      = {2008},
  date      = {2008-01-01},
  booktitle = {Proceedings of the Second International Workshop on the Induction of Process Models at ECML PKDD 2008},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {inproceedings}
}