2024
Long, Derek; Eade, Liam; Dost, Katharina; Meier-Menches, Samuel M; Goldstone, David C; Sullivan, Matthew P; Hartinger, Christian; Wicker, Jörg; Taskova, Katerina
AdductHunter: Identifying Protein-Metal Complex Adducts in Mass Spectra Journal Article
In: Journal of Cheminformatics, vol. 16, iss. 1, 2024, ISSN: 1758-2946.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: cheminformatics, computational sustainability, data mining, dynamic time warping, machine learning, mass spectrometry
@article{Long2023adducthunter,
title = {AdductHunter: Identifying Protein-Metal Complex Adducts in Mass Spectra},
author = {Derek Long and Liam Eade and Katharina Dost and Samuel M Meier-Menches and David C Goldstone and Matthew P Sullivan and Christian Hartinger and J\"{o}rg Wicker and Katerina Taskova},
url = {https://adducthunter.wickerlab.org
https://doi.org/10.21203/rs.3.rs-3322854/v1},
doi = {10.1186/s13321-023-00797-7},
issn = {1758-2946},
year = {2024},
date = {2024-02-06},
urldate = {2024-02-06},
journal = {Journal of Cheminformatics},
volume = {16},
number = {1},
abstract = {Mass spectrometry (MS) is an analytical technique for molecule identification that can be used for investigating protein-metal complex interactions. Once the MS data is collected, the mass spectra are usually interpreted manually to identify the adducts formed as a result of the interactions between proteins and metal-based species. However, with increasing resolution, dataset size, and species complexity, the time required to identify adducts and the error-prone nature of manual assignment have become limiting factors in MS analysis. AdductHunter is an open-source web-based analysis tool that automates the peak identification process using constraint integer optimization to find feasible combinations of protein and fragments, and dynamic time warping to calculate the dissimilarity between the theoretical isotope pattern of a species and its experimental isotope peak distribution. Empirical evaluation on a collection of 22 unique MS datasets shows fast and accurate identification of protein-metal complex adducts in deconvoluted mass spectra.},
keywords = {cheminformatics, computational sustainability, data mining, dynamic time warping, machine learning, mass spectrometry},
pubstate = {published},
tppubtype = {article}
}
2023
Wicker, Jörg; Krauter, Nicolas; Derstorff, Bettina; Stönner, Christof; Bourtsoukidis, Efstratios; Klüpfel, Thomas; Williams, Jonathan; Kramer, Stefan
Cinema Experiments 2013 Miscellaneous
2023.
Links | BibTeX | Altmetric | PlumX | Tags: atmospheric chemistry, cinema data mining, data mining, machine learning, smell of fear, sof
@misc{Wicker2023cinema,
  title     = {Cinema Experiments 2013},
  author    = {J\"{o}rg Wicker and Nicolas Krauter and Bettina Derstorff and Christof St\"{o}nner and Efstratios Bourtsoukidis and Thomas Kl\"{u}pfel and Jonathan Williams and Stefan Kramer},
  url       = {https://auckland.figshare.com/articles/dataset/Cinema_Experiments_2013/22777364},
  doi       = {10.17608/k6.auckland.22777364.v3},
  year      = {2023},
  date      = {2023-05-23},
  keywords  = {atmospheric chemistry, cinema data mining, data mining, machine learning, smell of fear, sof},
  pubstate  = {published},
  tppubtype = {misc}
}
Stönner, Christof; Edtbauer, Achim; Derstorff, Bettina; Bourtsoukidis, Efstratios; Klüpfel, Thomas; Wicker, Jörg; Williams, Jonathan
Cinema Experiments 2015 Miscellaneous
2023.
Links | BibTeX | Altmetric | PlumX | Tags: cinema data mining, data mining, machine learning, smell of fear, sof
@misc{Stoenner2023cinema,
title = {Cinema Experiments 2015},
author = {Christof St\"{o}nner and Achim Edtbauer and Bettina Derstorff and Efstratios Bourtsoukidis and Thomas Kl\"{u}pfel and J\"{o}rg Wicker and Jonathan Williams},
url = {https://auckland.figshare.com/articles/dataset/Cinema_Experiments_2015/22777352},
doi = {10.17608/k6.auckland.22777352.v2},
year = {2023},
date = {2023-05-23},
keywords = {cinema data mining, data mining, machine learning, smell of fear, sof},
pubstate = {published},
tppubtype = {misc}
}
Dost, Katharina; Pullar-Strecker, Zac; Brydon, Liam; Zhang, Kunyang; Hafner, Jasmin; Riddle, Pat; Wicker, Jörg
Combatting over-specialization bias in growing chemical databases Journal Article
In: Journal of Cheminformatics, vol. 15, iss. 1, pp. 53, 2023, ISSN: 1758-2946.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: bias, biodegradation, cheminformatics, computational sustainability, data mining, enviPath, machine learning, metabolic pathways, multi-label classification, reliable machine learning
@article{Dost2023Combatting,
title = {Combatting over-specialization bias in growing chemical databases},
author = {Katharina Dost and Zac Pullar-Strecker and Liam Brydon and Kunyang Zhang and Jasmin Hafner and Pat Riddle and J\"{o}rg Wicker},
url = {https://jcheminf.biomedcentral.com/articles/10.1186/s13321-023-00716-w},
doi = {10.1186/s13321-023-00716-w},
issn = {1758-2946},
year = {2023},
date = {2023-05-19},
urldate = {2023-05-19},
journal = {Journal of Cheminformatics},
volume = {15},
number = {1},
pages = {53},
abstract = {Background
Predicting in advance the behavior of new chemical compounds can support the design process of new products by directing the research toward the most promising candidates and ruling out others. Such predictive models can be data-driven using Machine Learning or based on researchers’ experience and depend on the collection of past results. In either case: models (or researchers) can only make reliable assumptions about compounds that are similar to what they have seen before. Therefore, consequent usage of these predictive models shapes the dataset and causes a continuous specialization shrinking the applicability domain of all trained models on this dataset in the future, and increasingly harming model-based exploration of the space.
Proposed solution
In this paper, we propose cancels (CounterActiNg Compound spEciaLization biaS), a technique that helps to break the dataset specialization spiral. Aiming for a smooth distribution of the compounds in the dataset, we identify areas in the space that fall short and suggest additional experiments that help bridge the gap. Thereby, we generally improve the dataset quality in an entirely unsupervised manner and create awareness of potential flaws in the data. cancels does not aim to cover the entire compound space and hence retains a desirable degree of specialization to a specified research domain.
Results
An extensive set of experiments on the use-case of biodegradation pathway prediction not only reveals that the bias spiral can indeed be observed but also that cancels produces meaningful results. Additionally, we demonstrate that mitigating the observed bias is crucial as it cannot only intervene with the continuous specialization process, but also significantly improves a predictor’s performance while reducing the number of required experiments. Overall, we believe that cancels can support researchers in their experimentation process to not only better understand their data and potential flaws, but also to grow the dataset in a sustainable way. All code is available under github.com/KatDost/Cancels.},
keywords = {bias, biodegradation, cheminformatics, computational sustainability, data mining, enviPath, machine learning, metabolic pathways, multi-label classification, reliable machine learning},
pubstate = {published},
tppubtype = {article}
}
Predicting in advance the behavior of new chemical compounds can support the design process of new products by directing the research toward the most promising candidates and ruling out others. Such predictive models can be data-driven using Machine Learning or based on researchers’ experience and depend on the collection of past results. In either case: models (or researchers) can only make reliable assumptions about compounds that are similar to what they have seen before. Therefore, consequent usage of these predictive models shapes the dataset and causes a continuous specialization shrinking the applicability domain of all trained models on this dataset in the future, and increasingly harming model-based exploration of the space.
Proposed solution
In this paper, we propose cancels (CounterActiNg Compound spEciaLization biaS), a technique that helps to break the dataset specialization spiral. Aiming for a smooth distribution of the compounds in the dataset, we identify areas in the space that fall short and suggest additional experiments that help bridge the gap. Thereby, we generally improve the dataset quality in an entirely unsupervised manner and create awareness of potential flaws in the data. cancels does not aim to cover the entire compound space and hence retains a desirable degree of specialization to a specified research domain.
Results
An extensive set of experiments on the use-case of biodegradation pathway prediction not only reveals that the bias spiral can indeed be observed but also that cancels produces meaningful results. Additionally, we demonstrate that mitigating the observed bias is crucial as it cannot only intervene with the continuous specialization process, but also significantly improves a predictor’s performance while reducing the number of required experiments. Overall, we believe that cancels can support researchers in their experimentation process to not only better understand their data and potential flaws, but also to grow the dataset in a sustainable way. All code is available under github.com/KatDost/Cancels.
2021
Tam, Jason; Lorsbach, Tim; Schmidt, Sebastian; Wicker, Jörg
Holistic Evaluation of Biodegradation Pathway Prediction: Assessing Multi-Step Reactions and Intermediate Products Journal Article
In: Journal of Cheminformatics, vol. 13, no. 1, pp. 63, 2021.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: biodegradation, cheminformatics, computational sustainability, data mining, enviPath, machine learning, metabolic pathways
@article{tam2021holisticb,
  title    = {Holistic Evaluation of Biodegradation Pathway Prediction: Assessing Multi-Step Reactions and Intermediate Products},
  author   = {Jason Tam and Tim Lorsbach and Sebastian Schmidt and J\"{o}rg Wicker},
  url      = {https://jcheminf.biomedcentral.com/articles/10.1186/s13321-021-00543-x
https://chemrxiv.org/articles/preprint/Holistic_Evaluation_of_Biodegradation_Pathway_Prediction_Assessing_Multi-Step_Reactions_and_Intermediate_Products/14315963
https://dx.doi.org/10.26434/chemrxiv.14315963},
  doi      = {10.1186/s13321-021-00543-x},
  year     = {2021},
  date     = {2021-09-03},
  urldate  = {2021-09-03},
  journal  = {Journal of Cheminformatics},
  volume   = {13},
  number   = {1},
  pages    = {63},
  abstract = {The prediction of metabolism and biotransformation pathways of xenobiotics is a highly desired tool in environmental sciences, drug discovery, and (eco)toxicology. Several systems predict single transformation steps or complete pathways as series of parallel and subsequent steps. Their performance is commonly evaluated on the level of a single transformation step. Such an approach cannot account for some specific challenges that are caused by specific properties of biotransformation experiments. That is, missing transformation products in the reference data that occur only in low concentrations, e.g. transient intermediates or higher-generation metabolites. Furthermore, some rule-based prediction systems evaluate the performance only based on the defined set of transformation rules. Therefore, the performance of these models cannot be directly compared. In this paper, we introduce a new evaluation framework that extends the evaluation of biotransformation prediction from single transformations to whole pathways, taking into account multiple generations of metabolites. We introduce a procedure to address transient intermediates and propose a weighted scoring system that acknowledges the uncertainty of higher-generation metabolites. We implemented this framework in enviPath and demonstrate its strict performance metrics on predictions of in vitro biotransformation and degradation of xenobiotics in soil. Our approach is model-agnostic and can be transferred to other prediction systems. It is also capable of revealing knowledge gaps in terms of incompletely defined sets of transformation rules.},
  keywords = {biodegradation, cheminformatics, computational sustainability, data mining, enviPath, machine learning, metabolic pathways},
  pubstate = {published},
  tppubtype = {article}
}
Stepišnik, Tomaž; Škrlj, Blaž; Wicker, Jörg; Kocev, Dragi
A comprehensive comparison of molecular feature representations for use in predictive modeling Journal Article
In: Computers in Biology and Medicine, vol. 130, pp. 104197, 2021, ISSN: 0010-4825.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: biodegradation, cheminformatics, computational sustainability, data mining, enviPath, machine learning, metabolic pathways, molecular feature representation, toxicity
@article{stepisnik2021comprehensive,
  title    = {A comprehensive comparison of molecular feature representations for use in predictive modeling},
  author   = {Toma\v{z} Stepi\v{s}nik and Bla\v{z} \v{S}krlj and J\"{o}rg Wicker and Dragi Kocev},
  url      = {http://www.sciencedirect.com/science/article/pii/S001048252030528X},
  doi      = {10.1016/j.compbiomed.2020.104197},
  issn     = {0010-4825},
  year     = {2021},
  date     = {2021-03-01},
  journal  = {Computers in Biology and Medicine},
  volume   = {130},
  pages    = {104197},
  abstract = {Machine learning methods are commonly used for predicting molecular properties to accelerate material and drug design. An important part of this process is deciding how to represent the molecules. Typically, machine learning methods expect examples represented by vectors of values, and many methods for calculating molecular feature representations have been proposed. In this paper, we perform a comprehensive comparison of different molecular features, including traditional methods such as fingerprints and molecular descriptors, and recently proposed learnable representations based on neural networks. Feature representations are evaluated on 11 benchmark datasets, used for predicting properties and measures such as mutagenicity, melting points, activity, solubility, and IC50. Our experiments show that several molecular features work similarly well over all benchmark datasets. The ones that stand out most are Spectrophores, which give significantly worse performance than other features on most datasets. Molecular descriptors from the PaDEL library seem very well suited for predicting physical properties of molecules. Despite their simplicity, MACCS fingerprints performed very well overall. The results show that learnable representations achieve competitive performance compared to expert based representations. However, task-specific representations (graph convolutions and Weave methods) rarely offer any benefits, even though they are computationally more demanding. Lastly, combining different molecular feature representations typically does not give a noticeable improvement in performance compared to individual feature representations.},
  keywords = {biodegradation, cheminformatics, computational sustainability, data mining, enviPath, machine learning, metabolic pathways, molecular feature representation, toxicity},
  pubstate = {published},
  tppubtype = {article}
}
2020
Chester, Andrew; Koh, Yun Sing; Wicker, Jörg; Sun, Quan; Lee, Junjae
Balancing Utility and Fairness against Privacy in Medical Data Proceedings Article
In: IEEE Symposium Series on Computational Intelligence (SSCI), pp. 1226-1233, IEEE, 2020.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: accuracy, computational sustainability, data mining, fairness, imbalance, machine learning, medicine, privacy
@inproceedings{chester2020balancing,
title = {Balancing Utility and Fairness against Privacy in Medical Data},
author = {Andrew Chester and Yun Sing Koh and J\"{o}rg Wicker and Quan Sun and Junjae Lee},
url = {https://ieeexplore.ieee.org/abstract/document/9308226},
doi = {10.1109/SSCI47803.2020.9308226},
year = {2020},
date = {2020-12-01},
booktitle = {IEEE Symposium Series on Computational Intelligence (SSCI)},
pages = {1226--1233},
publisher = {IEEE},
abstract = {There are numerous challenges when designing algorithms that interact with sensitive data, such as, medical or financial records. One of these challenges is privacy. However, there is a tension between privacy, utility (model accuracy), and fairness. While de-identification techniques, such as generalisation and suppression, have been proposed to enable privacy protection, it comes with a cost, specifically to fairness and utility. Recent work on fairness in algorithm design defines fairness as a guarantee of similar outputs for "similar" input data. This notion is discussed in connection to de-identification. This research investigates the trade-off between privacy, fairness, and utility. In contrast, other work investigates the trade-off between privacy and utility of the data or accuracy of the model overall. In this research, we investigate the effects of two standard de-identification techniques, k-anonymity and differential privacy, on both utility and fairness. We propose two measures to calculate the trade-off between privacy-utility and privacy-fairness. Although other research has provided guarantees for privacy regarding utility, this research focuses on the trade-offs given set de-identification levels and relies on guarantees provided by the privacy preservation methods. We discuss the effects of de-identification on data of different characteristics, class imbalance and outcome imbalance. We evaluated this is on synthetic datasets and standard real-world datasets. As a case study, we analysed the Medical Expenditure Panel Survey dataset.},
keywords = {accuracy, computational sustainability, data mining, fairness, imbalance, machine learning, medicine, privacy},
pubstate = {published},
tppubtype = {inproceedings}
}
Dost, Katharina; Taskova, Katerina; Riddle, Pat; Wicker, Jörg
Your Best Guess When You Know Nothing: Identification and Mitigation of Selection Bias Proceedings Article
In: 2020 IEEE International Conference on Data Mining (ICDM), pp. 996-1001, IEEE, 2020, ISSN: 2374-8486.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: bias, data mining, fairness, machine learning
@inproceedings{dost2020your,
title = {Your Best Guess When You Know Nothing: Identification and Mitigation of Selection Bias},
author = {Katharina Dost and Katerina Taskova and Pat Riddle and J\"{o}rg Wicker},
url = {https://ieeexplore.ieee.org/document/9338355
https://github.com/KatDost/Imitate
https://pypi.org/project/imitatebias/},
doi = {10.1109/ICDM50108.2020.00115},
issn = {2374-8486},
year = {2020},
date = {2020-11-17},
urldate = {2020-11-17},
booktitle = {2020 IEEE International Conference on Data Mining (ICDM)},
pages = {996--1001},
publisher = {IEEE},
abstract = {Machine Learning typically assumes that training and test set are independently drawn from the same distribution, but this assumption is often violated in practice which creates a bias. Many attempts to identify and mitigate this bias have been proposed, but they usually rely on ground-truth information. But what if the researcher is not even aware of the bias?
In contrast to prior work, this paper introduces a new method, Imitate, to identify and mitigate Selection Bias in the case that we may not know if (and where) a bias is present, and hence no ground-truth information is available.
Imitate investigates the dataset's probability density, then adds generated points in order to smooth out the density and have it resemble a Gaussian, the most common density occurring in real-world applications. If the artificial points focus on certain areas and are not widespread, this could indicate a Selection Bias where these areas are underrepresented in the sample.
We demonstrate the effectiveness of the proposed method in both, synthetic and real-world datasets. We also point out limitations and future research directions.},
keywords = {bias, data mining, fairness, machine learning},
pubstate = {published},
tppubtype = {inproceedings}
}
In contrast to prior work, this paper introduces a new method, Imitate, to identify and mitigate Selection Bias in the case that we may not know if (and where) a bias is present, and hence no ground-truth information is available.
Imitate investigates the dataset's probability density, then adds generated points in order to smooth out the density and have it resemble a Gaussian, the most common density occurring in real-world applications. If the artificial points focus on certain areas and are not widespread, this could indicate a Selection Bias where these areas are underrepresented in the sample.
We demonstrate the effectiveness of the proposed method in both, synthetic and real-world datasets. We also point out limitations and future research directions.
Roeslin, Samuel; Ma, Quincy; Chigullapally, Pavan; Wicker, Jörg; Wotherspoon, Liam
Feature Engineering for a Seismic Loss Prediction Model using Machine Learning, Christchurch Experience Proceedings Article
In: 17th World Conference on Earthquake Engineering, 2020.
Abstract | Links | BibTeX | Tags: computational sustainability, data mining, earthquakes, machine learning
@inproceedings{roeslin2020feature,
title = {Feature Engineering for a Seismic Loss Prediction Model using Machine Learning, Christchurch Experience},
author = {Samuel Roeslin and Quincy Ma and Pavan Chigullapally and J\"{o}rg Wicker and Liam Wotherspoon},
url = {https://www.researchgate.net/profile/Samuel_Roeslin/publication/344503593_Feature_Engineering_for_a_Seismic_Loss_Prediction_Model_using_Machine_Learning_Christchurch_Experience/links/5f7d015a92851c14bcb36ed7/Feature-Engineering-for-a-Seismic-Loss-Prediction-Model-using-Machine-Learning-Christchurch-Experience.pdf},
year = {2020},
date = {2020-09-17},
booktitle = {17th World Conference on Earthquake Engineering},
abstract = {The city of Christchurch, New Zealand experienced four major earthquakes (MW $>$ 5.9) and multiple aftershocks between 4 September 2010 and 23 December 2011. This series of earthquakes, commonly known as the Canterbury Earthquake Sequence (CES), induced over NZ\$40 billion in total economic losses. Liquefaction alone led to building damage in 51,000 of the 140,000 residential buildings, with around 15,000 houses left unpractical to repair. Widespread damage to residential buildings highlighted the need for improved seismic prediction tools and to better understand factors influencing damage. Fortunately, due to New Zealand unique insurance setting, up to 80\% of the losses were insured. Over the entire CES, insurers received more than 650,000 claims. This research project employs multi-disciplinary empirical data gathered during and prior to the CES to develop a seismic loss prediction model for residential buildings in Christchurch using machine learning. The intent is to develop a procedure for developing insights from post-earthquake data that is subjected to continuous updating, to enable identification of critical parameters affecting losses, and to apply such a model to establish priority building stock for risk mitigation measures. The following paper describes the complex data preparation process required for the application of machine learning techniques. The paper covers the production of a merged dataset with information from the Earthquake Commission (EQC) claim database, building characteristics from RiskScape, seismic demand interpolated from GeoNet strong motion records, liquefaction occurrence from the New Zealand Geotechnical Database (NZGD) and soil conditions from Land Resource Information Systems (LRIS).},
keywords = {computational sustainability, data mining, earthquakes, machine learning},
pubstate = {published},
tppubtype = {inproceedings}
}
Roeslin, Samuel; Ma, Quincy; Juárez-Garcia, Hugon; Gómez-Bernal, Alonso; Wicker, Jörg; Wotherspoon, Liam
A machine learning damage prediction model for the 2017 Puebla-Morelos, Mexico, earthquake Journal Article
In: Earthquake Spectra, vol. 36, no. 2, pp. 314-339, 2020.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: computational sustainability, data mining, earthquakes, machine learning
@article{roeslin2020machine,
title = {A machine learning damage prediction model for the 2017 Puebla-Morelos, Mexico, earthquake},
author = {Samuel Roeslin and Quincy Ma and Hugon Ju\'{a}rez-Garcia and Alonso G\'{o}mez-Bernal and J\"{o}rg Wicker and Liam Wotherspoon},
doi = {10.1177/8755293020936714},
year = {2020},
date = {2020-07-30},
journal = {Earthquake Spectra},
volume = {36},
number = {2},
pages = {314--339},
abstract = {The 2017 Puebla, Mexico, earthquake event led to significant damage in many buildings in Mexico City. In the months following the earthquake, civil engineering students conducted detailed building assessments throughout the city. They collected building damage information and structural characteristics for 340 buildings in the Mexico City urban area, with an emphasis on the Roma and Condesa neighborhoods where they assessed 237 buildings. These neighborhoods are of particular interest due to the availability of seismic records captured by nearby recording stations, and preexisting information from when the neighborhoods were affected by the 1985 Michoac\'{a}n earthquake. This article presents a case study on developing a damage prediction model using machine learning. It details a framework suitable for working with future post-earthquake observation data. Four algorithms able to perform classification tasks were trialed. Random forest, the best performing algorithm, achieves more than 65\% prediction accuracy. The study of the feature importance for the random forest shows that the building location, seismic demand, and building height are the parameters that influence the model output the most.},
keywords = {computational sustainability, data mining, earthquakes, machine learning},
pubstate = {published},
tppubtype = {article}
}
2019
Wicker, Jörg; Hua, Yan Cathy; Rebello, Rayner; Pfahringer, Bernhard
XOR-based Boolean Matrix Decomposition Proceedings Article
In: Wang, Jianyong; Shim, Kyuseok; Wu, Xindong (Ed.): 2019 IEEE International Conference on Data Mining (ICDM), pp. 638-647, IEEE, 2019, ISBN: 978-1-7281-4604-1.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: Boolean matrix decomposition, data mining
@inproceedings{wicker2019xor,
title = {XOR-based Boolean Matrix Decomposition},
author = {J\"{o}rg Wicker and Yan Cathy Hua and Rayner Rebello and Bernhard Pfahringer},
editor = {Jianyong Wang and Kyuseok Shim and Xindong Wu},
url = {https://ieeexplore.ieee.org/document/8970951},
doi = {10.1109/ICDM.2019.00074},
isbn = {978-1-7281-4604-1},
year = {2019},
date = {2019-11-08},
urldate = {2019-11-08},
booktitle = {2019 IEEE International Conference on Data Mining (ICDM)},
pages = {638--647},
publisher = {IEEE},
abstract = {Boolean matrix factorization (BMF) is a data summarizing and dimension-reduction technique. Existing BMF methods build on matrix properties defined by Boolean algebra, where the addition operator is the logical inclusive OR and the multiplication operator the logical AND. As a consequence, this leads to the lack of an additive inverse in all Boolean matrix operations, which produces an indelible type of approximation error. Previous research adopted various methods to address such an issue and produced reasonably accurate approximation. However, an exact factorization is rarely found in the literature. In this paper, we introduce a new algorithm named XBMaD (Xor-based Boolean Matrix Decomposition) where the addition operator is defined as the exclusive OR (XOR). This change completely removes the error-mitigation issue of OR-based BMF methods, and allows for an exact error-free factorization. An evaluation comparing XBMaD and classic OR-based methods suggested that XBMAD performed equal or in most cases more accurately and faster.},
keywords = {Boolean matrix decomposition, data mining},
pubstate = {published},
tppubtype = {inproceedings}
}
an indelible type of approximation error. Previous research adopted various methods to address such an issue and produced reasonably accurate approximation. However, an exact factorization is rarely found in the literature. In this paper, we introduce a new algorithm named XBMaD (Xor-based Boolean Matrix Decomposition) where the addition operator is defined as the exclusive OR (XOR). This change completely removes the error-mitigation issue of OR-based BMF methods, and allows for an exact error-free factorization. An evaluation comparing XBMaD and classic OR-based methods suggested that XBMAD performed equal or in most cases more accurately and faster.
Williams, Jonathan; Stönner, Christof; Edtbauer, Achim; Derstorff, Bettina; Bourtsoukidis, Efstratios; Klüpfel, Thomas; Krauter, Nicolas; Wicker, Jörg; Kramer, Stefan
What can we learn from the air chemistry of crowds? Proceedings Article
In: Hansel, Armin; Dunkl, Jürgen (Ed.): 8th International Conference on Proton Transfer Reaction Mass Spectrometry and its Applications, pp. 121-123, Innsbruck University Press, Innsbruck, 2019.
Abstract | Links | BibTeX | Tags: atmospheric chemistry, breath analysis, cheminformatics, cinema data mining, data mining, emotional response analysis, machine learning, movie analysis, smell of fear, sof, time series
@inproceedings{williams2019what,
title = {What can we learn from the air chemistry of crowds?},
author = {Jonathan Williams and Christof St\"{o}nner and Achim Edtbauer and Bettina Derstorff and Efstratios Bourtsoukidis and Thomas Kl\"{u}pfel and Nicolas Krauter and J\"{o}rg Wicker and Stefan Kramer},
editor = {Armin Hansel and J\"{u}rgen Dunkl},
url = {https://www.ionicon.com/sites/default/files/uploads/doc/Contributions_8th-PTR-MS-Conference-2019_web.pdf#page=122},
year = {2019},
date = {2019-05-10},
booktitle = {8th International Conference on Proton Transfer Reaction Mass Spectrometry and its Applications},
pages = {121--123},
publisher = {Innsbruck University Press},
address = {Innsbruck},
abstract = {Current PTR-MS technology allows hundreds of volatile trace gases in air to be measured every second at extremely low levels (parts per trillion). These instruments are often used in atmospheric research on planes and ships and even in the Amazon rainforest. Recently, we have used this technology to examine air composition changes caused by large groups of people (10,000-30,000) under real world conditions at a football match and in a movie theater. In both cases the trace gas signatures measured in ambient air are shown to reflect crowd behavior. By applying advanced data mining techniques we have shown that groups of people reproducibly respond to certain emotional stimuli (e.g. suspense and comedy) by exhaling specific trace gases. Furthermore, we explore whether this information can be used to determine the age classification of films.},
keywords = {atmospheric chemistry, breath analysis, cheminformatics, cinema data mining, data mining, emotional response analysis, machine learning, movie analysis, smell of fear, sof, time series},
pubstate = {published},
tppubtype = {inproceedings}
}
2018
Stönner, Christof; Edtbauer, Achim; Derstorff, Bettina; Bourtsoukidis, Efstratios; Klüpfel, Thomas; Wicker, Jörg; Williams, Jonathan
Proof of concept study: Testing human volatile organic compounds as tools for age classification of films Journal Article
In: PLOS One, vol. 13, no. 10, pp. 1-14, 2018.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: atmospheric chemistry, breath analysis, cheminformatics, cinema data mining, data mining, emotional response analysis, machine learning, movie analysis, smell of fear, sof, time series
@article{Stonner2018,
  title     = {Proof of concept study: Testing human volatile organic compounds as tools for age classification of films},
  author    = {Christof St{\"o}nner and Achim Edtbauer and Bettina Derstorff and Efstratios Bourtsoukidis and Thomas Kl{\"u}pfel and J{\"o}rg Wicker and Jonathan Williams},
  doi       = {10.1371/journal.pone.0203044},
  year      = {2018},
  date      = {2018-10-11},
  journal   = {PLOS One},
  volume    = {13},
  number    = {10},
  pages     = {1--14},
  publisher = {Public Library of Science},
  abstract  = {Humans emit numerous volatile organic compounds (VOCs) through breath and skin. The nature and rate of these emissions are affected by various factors including emotional state. Previous measurements of VOCs and CO2 in a cinema have shown that certain chemicals are reproducibly emitted by audiences reacting to events in a particular film. Using data from films with various age classifications, we have studied the relationship between the emission of multiple VOCs and CO2 and the age classifier (0, 6, 12, and 16) with a view to developing a new chemically based and objective film classification method. We apply a random forest model built with time independent features extracted from the time series of every measured compound, and test predictive capability on subsets of all data. It was found that most compounds were not able to predict all age classifiers reliably, likely reflecting the fact that current classification is based on perceived sensibilities to many factors (e.g. incidences of violence, sex, antisocial behaviour, drug use, and bad language) rather than the visceral biological responses expressed in the data. However, promising results were found for isoprene which reliably predicted 0, 6 and 12 age classifiers for a variety of film genres and audience age groups. Therefore, isoprene emission per person might in future be a valuable aid to national classification boards, or even offer an alternative, objective, metric for rating films based on the reactions of large groups of people.},
  keywords  = {atmospheric chemistry, breath analysis, cheminformatics, cinema data mining, data mining, emotional response analysis, machine learning, movie analysis, smell of fear, sof, time series},
  pubstate  = {published},
  tppubtype = {article}
}
2017
Latino, Diogo; Wicker, Jörg; Gütlein, Martin; Schmid, Emanuel; Kramer, Stefan; Fenner, Kathrin
Eawag-Soil in enviPath: a new resource for exploring regulatory pesticide soil biodegradation pathways and half-life data Journal Article
In: Environmental Science: Processes & Impacts, 2017.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: biodegradation, cheminformatics, computational sustainability, data mining, enviPath, multi-label classification, REST, web services
@article{latino2017eawag,
  title     = {Eawag-Soil in enviPath: a new resource for exploring regulatory pesticide soil biodegradation pathways and half-life data},
  author    = {Diogo Latino and J{\"o}rg Wicker and Martin G{\"u}tlein and Emanuel Schmid and Stefan Kramer and Kathrin Fenner},
  doi       = {10.1039/C6EM00697C},
  year      = {2017},
  date      = {2017-01-01},
  journal   = {Environmental Science: Processes \& Impacts},
  publisher = {The Royal Society of Chemistry},
  abstract  = {Developing models for the prediction of microbial biotransformation pathways and half-lives of trace organic contaminants in different environments requires as training data easily accessible and sufficiently large collections of respective biotransformation data that are annotated with metadata on study conditions. Here, we present the Eawag-Soil package, a public database that has been developed to contain all freely accessible regulatory data on pesticide degradation in laboratory soil simulation studies for pesticides registered in the EU (282 degradation pathways, 1535 reactions, 1619 compounds and 4716 biotransformation half-life values with corresponding metadata on study conditions). We provide a thorough description of this novel data resource, and discuss important features of the pesticide soil degradation data that are relevant for model development. Most notably, the variability of half-life values for individual compounds is large and only about one order of magnitude lower than the entire range of median half-life values spanned by all compounds, demonstrating the need to consider study conditions in the development of more accurate models for biotransformation prediction. We further show how the data can be used to find missing rules relevant for predicting soil biotransformation pathways. From this analysis, eight examples of reaction types were presented that should trigger the formulation of new biotransformation rules, e.g., Ar-OH methylation, or the extension of existing rules e.g., hydroxylation in aliphatic rings. The data were also used to exemplarily explore the dependence of half-lives of different amide pesticides on chemical class and experimental parameters. This analysis highlighted the value of considering initial transformation reactions for the development of meaningful quantitative-structure biotransformation relationships (QSBR), which is a novel opportunity offered by the simultaneous encoding of transformation reactions and corresponding half-lives in Eawag-Soil. Overall, Eawag-Soil provides an unprecedentedly rich collection of manually extracted and curated biotransformation data, which should be useful in a great variety of applications.},
  keywords  = {biodegradation, cheminformatics, computational sustainability, data mining, enviPath, multi-label classification, REST, web services},
  pubstate  = {published},
  tppubtype = {article}
}
for pesticides registered in the EU (282 degradation pathways, 1535 reactions, 1619 compounds and 4716 biotransformation half-life values with corresponding metadata on study conditions). We provide a thorough description of this novel data resource, and discuss important features of the pesticide soil degradation data that are relevant for model development. Most notably, the variability of half-life values for individual compounds is large and only about one order of magnitude lower than the entire range of median half-life values spanned by all compounds, demonstrating the need to consider study conditions in the development of more accurate models for biotransformation prediction. We further show how the data can be used to find missing rules relevant for predicting soil biotransformation pathways. From this analysis, eight examples of reaction types were presented that should trigger the formulation of new biotransformation rules, e.g., Ar-OH methylation, or the extension of existing rules e.g., hydroxylation in aliphatic rings. The data were also used to exemplarily explore the dependence of half-lives of different amide pesticides on chemical class and experimental parameters. This analysis highlighted the value of considering initial transformation reactions for the development of meaningful quantitative-structure biotransformation relationships (QSBR), which is a novel opportunity offered by the simultaneous encoding of transformation reactions and corresponding half-lives in Eawag-Soil. Overall, Eawag-Soil provides an unprecedentedly rich collection of manually extracted and curated biotransformation data, which should be useful in a great variety of applications.
2016
Wicker, Jörg; Lorsbach, Tim; Gütlein, Martin; Schmid, Emanuel; Latino, Diogo; Kramer, Stefan; Fenner, Kathrin
enviPath – The Environmental Contaminant Biotransformation Pathway Resource Journal Article
In: Nucleic Acids Research, vol. 44, no. D1, pp. D502-D508, 2016.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: biodegradation, cheminformatics, computational sustainability, data mining, enviPath, linked data, machine learning, metabolic pathways, multi-label classification
@article{wicker2016envipath,
  title     = {enviPath - The Environmental Contaminant Biotransformation Pathway Resource},
  author    = {J{\"o}rg Wicker and Tim Lorsbach and Martin G{\"u}tlein and Emanuel Schmid and Diogo Latino and Stefan Kramer and Kathrin Fenner},
  editor    = {Michael Galperin},
  url       = {http://nar.oxfordjournals.org/content/44/D1/D502.abstract},
  doi       = {10.1093/nar/gkv1229},
  year      = {2016},
  date      = {2016-01-01},
  journal   = {Nucleic Acids Research},
  volume    = {44},
  number    = {D1},
  pages     = {D502--D508},
  abstract  = {The University of Minnesota Biocatalysis/Biodegradation Database and Pathway Prediction System (UM-BBD/PPS) has been a unique resource covering microbial biotransformation pathways of primarily xenobiotic chemicals for over 15 years. This paper introduces the successor system, enviPath (The Environmental Contaminant Biotransformation Pathway Resource), which is a complete redesign and reimplementation of UM-BBD/PPS. enviPath uses the database from the UM-BBD/PPS as a basis, extends the use of this database, and allows users to include their own data to support multiple use cases. Relative reasoning is supported for the refinement of predictions and to allow its extensions in terms of previously published, but not implemented machine learning models. User access is simplified by providing a REST API that simplifies the inclusion of enviPath into existing workflows. An RDF database is used to enable simple integration with other databases. enviPath is publicly available at https://envipath.org with free and open access to its core data.},
  keywords  = {biodegradation, cheminformatics, computational sustainability, data mining, enviPath, linked data, machine learning, metabolic pathways, multi-label classification},
  pubstate  = {published},
  tppubtype = {article}
}
Raza, Atif; Wicker, Jörg; Kramer, Stefan
Trading Off Accuracy for Efficiency by Randomized Greedy Warping Proceedings Article
In: Proceedings of the 31st Annual ACM Symposium on Applied Computing, pp. 883-890, ACM, New York, NY, USA, 2016, ISBN: 978-1-4503-3739-7.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: data mining, dynamic time warping, time series
@inproceedings{raza2016trading,
  title     = {Trading Off Accuracy for Efficiency by Randomized Greedy Warping},
  author    = {Atif Raza and J{\"o}rg Wicker and Stefan Kramer},
  url       = {https://wicker.nz/nwp-acm/authorize.php?id=N10030},
  doi       = {10.1145/2851613.2851651},
  isbn      = {978-1-4503-3739-7},
  year      = {2016},
  date      = {2016-01-01},
  booktitle = {Proceedings of the 31st Annual ACM Symposium on Applied Computing},
  pages     = {883--890},
  publisher = {ACM},
  address   = {New York, NY, USA},
  series    = {SAC '16},
  abstract  = {Dynamic Time Warping (DTW) is a widely used distance measure for time series data mining. Its quadratic complexity requires the application of various techniques (e.g. warping constraints, lower-bounds) for deployment in real-time scenarios. In this paper we propose a randomized greedy warping algorithm for finding similarity between time series instances. We show that the proposed algorithm outperforms the simple greedy approach and also provides very good time series similarity approximation consistently, as compared to DTW. We show that the Randomized Time Warping (RTW) can be used in place of DTW as a fast similarity approximation technique by trading some classification accuracy for very fast classification.},
  keywords  = {data mining, dynamic time warping, time series},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
Williams, Jonathan; Stönner, Christof; Wicker, Jörg; Krauter, Nicolas; Derstorff, Bettina; Bourtsoukidis, Efstratios; Klüpfel, Thomas; Kramer, Stefan
Cinema audiences reproducibly vary the chemical composition of air during films, by broadcasting scene specific emissions on breath Journal Article
In: Scientific Reports, vol. 6, 2016.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: atmospheric chemistry, causality, cheminformatics, data mining, emotional response analysis, smell of fear, sof, time series
@article{williams2015element,
  title     = {Cinema audiences reproducibly vary the chemical composition of air during films, by broadcasting scene specific emissions on breath},
  author    = {Jonathan Williams and Christof St{\"o}nner and J{\"o}rg Wicker and Nicolas Krauter and Bettina Derstorff and Efstratios Bourtsoukidis and Thomas Kl{\"u}pfel and Stefan Kramer},
  url       = {http://www.nature.com/articles/srep25464},
  doi       = {10.1038/srep25464},
  year      = {2016},
  date      = {2016-01-01},
  urldate   = {2016-01-01},
  journal   = {Scientific Reports},
  volume    = {6},
  publisher = {Nature Publishing Group},
  abstract  = {Human beings continuously emit chemicals into the air by breath and through the skin. In order to determine whether these emissions vary predictably in response to audiovisual stimuli, we have continuously monitored carbon dioxide and over one hundred volatile organic compounds in a cinema. It was found that many airborne chemicals in cinema air varied distinctively and reproducibly with time for a particular film, even in different screenings to different audiences. Application of scene labels and advanced data mining methods revealed that specific film events, namely "suspense" or "comedy" caused audiences to change their emission of specific chemicals. These event-type synchronous, broadcasted human chemosignals open the possibility for objective and non-invasive assessment of a human group response to stimuli by continuous measurement of chemicals in air. Such methods can be applied to research fields such as psychology and biology, and be valuable to industries such as film making and advertising.},
  keywords  = {atmospheric chemistry, causality, cheminformatics, data mining, emotional response analysis, smell of fear, sof, time series},
  pubstate  = {published},
  tppubtype = {article}
}
2015
Wicker, Jörg; Krauter, Nicolas; Derstorff, Bettina; Stönner, Christof; Bourtsoukidis, Efstratios; Klüpfel, Thomas; Williams, Jonathan; Kramer, Stefan
Cinema Data Mining: The Smell of Fear Proceedings Article
In: Proceedings of the 21st ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, pp. 1235-1304, ACM, New York, NY, USA, 2015, ISBN: 978-1-4503-3664-2.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: atmospheric chemistry, breath analysis, causality, cheminformatics, cinema data mining, data mining, emotional response analysis, movie analysis, smell of fear, sof, time series
@inproceedings{wicker2015cinema,
  title        = {Cinema Data Mining: The Smell of Fear},
  author       = {J{\"o}rg Wicker and Nicolas Krauter and Bettina Derstorff and Christof St{\"o}nner and Efstratios Bourtsoukidis and Thomas Kl{\"u}pfel and Jonathan Williams and Stefan Kramer},
  url          = {https://wicker.nz/nwp-acm/authorize.php?id=N10031},
  doi          = {10.1145/2783258.2783404},
  isbn         = {978-1-4503-3664-2},
  year         = {2015},
  date         = {2015-01-01},
  booktitle    = {Proceedings of the 21st ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
  pages        = {1235--1304},
  publisher    = {ACM},
  address      = {New York, NY, USA},
  organization = {ACM},
  series       = {KDD '15},
  abstract     = {While the physiological response of humans to emotional events or stimuli is well-investigated for many modalities (like EEG, skin resistance, ...), surprisingly little is known about the exhalation of so-called Volatile Organic Compounds (VOCs) at quite low concentrations in response to such stimuli. VOCs are molecules of relatively small mass that quickly evaporate or sublimate and can be detected in the air that surrounds us. The paper introduces a new field of application for data mining, where trace gas responses of people reacting on-line to films shown in cinemas (or movie theaters) are related to the semantic content of the films themselves. To do so, we measured the VOCs from a movie theatre over a whole month in intervals of thirty seconds, and annotated the screened films by a controlled vocabulary compiled from multiple sources. To gain a better understanding of the data and to reveal unknown relationships, we have built prediction models for so-called forward prediction (the prediction of future VOCs from the past), backward prediction (the prediction of past scene labels from future VOCs) and for some forms of abductive reasoning and Granger causality. Experimental results show that some VOCs and some labels can be predicted with relatively low error, and that hints for causality with low p-values can be detected in the data.},
  keywords     = {atmospheric chemistry, breath analysis, causality, cheminformatics, cinema data mining, data mining, emotional response analysis, movie analysis, smell of fear, sof, time series},
  pubstate     = {published},
  tppubtype    = {inproceedings}
}
Šilc, Jurij; Taškova, Katerina; Korošec, Peter
Data mining-assisted parameter tuning of a search algorithm Journal Article
In: Informatica, vol. 39, no. 2, 2015.
Abstract | Links | BibTeX | Tags: data mining
@article{vsilc2015data,
  title     = {Data mining-assisted parameter tuning of a search algorithm},
  author    = {Jurij {\v{S}}ilc and Katerina Ta{\v{s}}kova and Peter Koro{\v{s}}ec},
  url       = {https://informatica.si/index.php/informatica/article/view/833},
  year      = {2015},
  date      = {2015-01-01},
  urldate   = {2015-01-01},
  journal   = {Informatica},
  volume    = {39},
  number    = {2},
  abstract  = {The main purpose of this paper is to show how using data-mining technique to tackle the problem of tuning the performance of a meta-heuristic search algorithm with respect to its parameters. The operational behavior of typical meta-heuristic search algorithms is determined by a set of control parameters, which have to be fine-tuned in order to obtain a best performance for a given problem. The principle challenge here is how to provide meaningful settings for an algorithm, obtained as result of better insight in its behavior. In this context, we discuss the idea of learning a model of an algorithm behavior by data mining analysis of parameter tuning results. The study was conducted using the Differential Ant-Stigmergy Algorithm as an example meta-heuristic search algorithm.},
  keywords  = {data mining},
  pubstate  = {published},
  tppubtype = {article}
}
2014
Tyukin, Andrey; Kramer, Stefan; Wicker, Jörg
BMaD — A Boolean Matrix Decomposition Framework Proceedings Article
In: Calders, Toon; Esposito, Floriana; Hüllermeier, Eyke; Meo, Rosa (Ed.): Machine Learning and Knowledge Discovery in Databases, pp. 481-484, Springer Berlin Heidelberg, 2014, ISBN: 978-3-662-44844-1.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: Boolean matrix decomposition, data mining, framework
@inproceedings{tyukin2014bmad,
  title     = {BMaD -- A Boolean Matrix Decomposition Framework},
  author    = {Andrey Tyukin and Stefan Kramer and J{\"o}rg Wicker},
  editor    = {Toon Calders and Floriana Esposito and Eyke H{\"u}llermeier and Rosa Meo},
  url       = {http://dx.doi.org/10.1007/978-3-662-44845-8_40},
  doi       = {10.1007/978-3-662-44845-8_40},
  isbn      = {978-3-662-44844-1},
  year      = {2014},
  date      = {2014-01-01},
  booktitle = {Machine Learning and Knowledge Discovery in Databases},
  volume    = {8726},
  pages     = {481--484},
  publisher = {Springer Berlin Heidelberg},
  series    = {Lecture Notes in Computer Science},
  abstract  = {Boolean matrix decomposition is a method to obtain a compressed representation of a matrix with Boolean entries. We present a modular framework that unifies several Boolean matrix decomposition algorithms, and provide methods to evaluate their performance. The main advantages of the framework are its modular approach and hence the flexible combination of the steps of a Boolean matrix decomposition and the capability of handling missing values. The framework is licensed under the GPLv3 and can be downloaded freely at \url{http://projects.informatik.uni-mainz.de/bmad}.},
  keywords  = {Boolean matrix decomposition, data mining, framework},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
representation of a matrix with Boolean entries. We present a modular
framework that unifies several Boolean matrix decomposition algorithms, and
provide methods to evaluate their performance. The main advantages of
the framework are its modular approach and hence the flexible
combination of the steps of a Boolean matrix decomposition and the
capability of handling missing values. The framework is licensed under
the GPLv3 and can be downloaded freely at
http://projects.informatik.uni-mainz.de/bmad.
2013
Wicker, Jörg
Large Classifier Systems in Bio- and Cheminformatics PhD Thesis
Technische Universität München, 2013.
Abstract | Links | BibTeX | Tags: biodegradation, bioinformatics, cheminformatics, computational sustainability, data mining, enviPath, machine learning, multi-label classification, multi-relational learning, toxicity
@phdthesis{wicker2013large,
  title     = {Large Classifier Systems in Bio- and Cheminformatics},
  author    = {J{\"o}rg Wicker},
  url       = {http://mediatum.ub.tum.de/node?id=1165858},
  year      = {2013},
  date      = {2013-01-01},
  school    = {Technische Universit{\"a}t M{\"u}nchen},
  abstract  = {Large classifier systems are machine learning algorithms that use multiple
    classifiers to improve the prediction of target values in advanced
    classification tasks. Although learning problems in bio- and
    cheminformatics commonly provide data in schemes suitable for large
    classifier systems, they are rarely used in these domains. This thesis
    introduces two new classifiers incorporating systems of classifiers
    using Boolean matrix decomposition to handle data in a schema that
    often occurs in bio- and cheminformatics.
    The first approach, called MLC-BMaD (multi-label classification using
    Boolean matrix decomposition), uses Boolean matrix decomposition to
    decompose the labels in a multi-label classification task. The
    decomposed matrices are a compact representation of the information
    in the labels (first matrix) and the dependencies among the labels
    (second matrix). The first matrix is used in a further multi-label
    classification while the second matrix is used to generate the final
    matrix from the predicted values of the first matrix.
    MLC-BMaD was evaluated on six standard multi-label data sets, the
    experiments showed that MLC-BMaD can perform particularly well on data
    sets with a high number of labels and a small number of instances and
    can outperform standard multi-label algorithms.
    Subsequently, MLC-BMaD is extended to a special case of
    multi-relational learning, by considering the labels not as simple
    labels, but instances. The algorithm, called ClassFact
    (Classification factorization), uses both matrices in a multi-label
    classification. Each label represents a mapping between two
    instances.
    Experiments on three data sets from the domain of bioinformatics show
    that ClassFact can outperform the baseline method, which merges the
    relations into one, on hard classification tasks.
    Furthermore, large classifier systems are used on two cheminformatics
    data sets, the first one is used to predict the environmental fate of
    chemicals by predicting biodegradation pathways. The second is a data
    set from the domain of predictive toxicology. In biodegradation
    pathway prediction, I extend a knowledge-based system and incorporate
    a machine learning approach to predict a probability for
    biotransformation products based on the structure- and knowledge-based
    predictions of products, which are based on transformation rules. The
    use of multi-label classification improves the performance of the
    classifiers and extends the number of transformation rules that can be
    covered.
    For the prediction of toxic effects of chemicals, I applied large
    classifier systems to the ToxCast{\texttrademark} data set, which maps
    toxic effects to chemicals. As the given toxic effects are not easy to
    predict due to missing information and a skewed class
    distribution, I introduce a filtering step in the multi-label
    classification, which finds labels that are usable in multi-label
    prediction and does not take the others in the
    prediction into account. Experiments show
    that this approach can improve upon the baseline method using binary
    classification, as well as multi-label approaches using no filtering.
    The presented results show that large classifier systems can play a
    role in future research challenges, especially in bio- and
    cheminformatics, where data sets frequently consist of more complex
    structures and data can be rather small in terms of the number of
    instances compared to other domains.},
  keywords  = {biodegradation, bioinformatics, cheminformatics, computational sustainability, data mining, enviPath, machine learning, multi-label classification, multi-relational learning, toxicity},
  pubstate  = {published},
  tppubtype = {phdthesis}
}
classifiers to improve the prediction of target values in advanced
classification tasks. Although learning problems in bio- and
cheminformatics commonly provide data in schemes suitable for large
classifier systems, they are rarely used in these domains. This thesis
introduces two new classifiers incorporating systems of classifiers
using Boolean matrix decomposition to handle data in a schema that
often occurs in bio- and cheminformatics.
The first approach, called MLC-BMaD (multi-label classification using
Boolean matrix decomposition), uses Boolean matrix decomposition to
decompose the labels in a multi-label classification task. The
decomposed matrices are a compact representation of the information
in the labels (first matrix) and the dependencies among the labels
(second matrix). The first matrix is used in a further multi-label
classification while the second matrix is used to generate the final
matrix from the predicted values of the first matrix.
MLC-BMaD was evaluated on six standard multi-label data sets, the
experiments showed that MLC-BMaD can perform particularly well on data
sets with a high number of labels and a small number of instances and
can outperform standard multi-label algorithms.
Subsequently, MLC-BMaD is extended to a special case of
multi-relational learning, by considering the labels not as simple
labels, but instances. The algorithm, called ClassFact
(Classification factorization), uses both matrices in a multi-label
classification. Each label represents a mapping between two
instances.
Experiments on three data sets from the domain of bioinformatics show
that ClassFact can outperform the baseline method, which merges the
relations into one, on hard classification tasks.
Furthermore, large classifier systems are used on two cheminformatics
data sets, the first one is used to predict the environmental fate of
chemicals by predicting biodegradation pathways. The second is a data
set from the domain of predictive toxicology. In biodegradation
pathway prediction, I extend a knowledge-based system and incorporate
a machine learning approach to predict a probability for
biotransformation products based on the structure- and knowledge-based
predictions of products, which are based on transformation rules. The
use of multi-label classification improves the performance of the
classifiers and extends the number of transformation rules that can be
covered.
For the prediction of toxic effects of chemicals, I applied large
classifier systems to the ToxCast™ data set, which maps
toxic effects to chemicals. As the given toxic effects are not easy to
predict due to missing information and a skewed class
distribution, I introduce a filtering step in the multi-label
classification, which finds labels that are usable in multi-label
prediction and does not take the others in the
prediction into account. Experiments show
that this approach can improve upon the baseline method using binary
classification, as well as multi-label approaches using no filtering.
The presented results show that large classifier systems can play a
role in future research challenges, especially in bio- and
cheminformatics, where data sets frequently consist of more complex
structures and data can be rather small in terms of the number of
instances compared to other domains.
2010
Hardy, Barry; Douglas, Nicki; Helma, Christoph; Rautenberg, Micha; Jeliazkova, Nina; Jeliazkov, Vedrin; Nikolova, Ivelina; Benigni, Romualdo; Tcheremenskaia, Olga; Kramer, Stefan; Girschick, Tobias; Buchwald, Fabian; Wicker, Jörg; Karwath, Andreas; Gütlein, Martin; Maunz, Andreas; Sarimveis, Haralambos; Melagraki, Georgia; Afantitis, Antreas; Sopasakis, Pantelis; Gallagher, David; Poroikov, Vladimir; Filimonov, Dmitry; Zakharov, Alexey; Lagunin, Alexey; Gloriozova, Tatyana; Novikov, Sergey; Skvortsova, Natalia; Druzhilovsky, Dmitry; Chawla, Sunil; Ghosh, Indira; Ray, Surajit; Patel, Hitesh; Escher, Sylvia
Collaborative development of predictive toxicology applications Journal Article
In: Journal of Cheminformatics, vol. 2, no. 1, pp. 7, 2010, ISSN: 1758-2946.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: cheminformatics, computational sustainability, data mining, machine learning, REST, toxicity
@article{hardy2010collaborative,
  title     = {Collaborative development of predictive toxicology applications},
  author    = {Barry Hardy and Nicki Douglas and Christoph Helma and Micha Rautenberg and Nina Jeliazkova and Vedrin Jeliazkov and Ivelina Nikolova and Romualdo Benigni and Olga Tcheremenskaia and Stefan Kramer and Tobias Girschick and Fabian Buchwald and J{\"o}rg Wicker and Andreas Karwath and Martin G{\"u}tlein and Andreas Maunz and Haralambos Sarimveis and Georgia Melagraki and Antreas Afantitis and Pantelis Sopasakis and David Gallagher and Vladimir Poroikov and Dmitry Filimonov and Alexey Zakharov and Alexey Lagunin and Tatyana Gloriozova and Sergey Novikov and Natalia Skvortsova and Dmitry Druzhilovsky and Sunil Chawla and Indira Ghosh and Surajit Ray and Hitesh Patel and Sylvia Escher},
  url       = {http://www.jcheminf.com/content/2/1/7},
  doi       = {10.1186/1758-2946-2-7},
  issn      = {1758-2946},
  year      = {2010},
  date      = {2010-01-01},
  journal   = {Journal of Cheminformatics},
  volume    = {2},
  number    = {1},
  pages     = {7},
  abstract  = {OpenTox provides an interoperable, standards-based Framework for the support of predictive toxicology data management, algorithms, modelling, validation and reporting. It is relevant to satisfying the chemical safety assessment requirements of the REACH legislation as it supports access to experimental data, (Quantitative) Structure-Activity Relationship models, and toxicological information through an integrating platform that adheres to regulatory requirements and OECD validation principles. Initial research defined the essential components of the Framework including the approach to data access, schema and management, use of controlled vocabularies and ontologies, architecture, web service and communications protocols, and selection and integration of algorithms for predictive modelling. OpenTox provides end-user oriented tools to non-computational specialists, risk assessors, and toxicological experts in addition to Application Programming Interfaces (APIs) for developers of new applications. OpenTox actively supports public standards for data representation, interfaces, vocabularies and ontologies, Open Source approaches to core platform components, and community-based collaboration approaches, so as to progress system interoperability goals. The OpenTox approach to ontology allows for efficient mapping of complementary data coming from different datasets into a unifying structure having a shared terminology and representation. Two initial OpenTox applications are presented as an illustration of the potential impact of OpenTox for high-quality and consistent structure-activity relationship modelling of REACH-relevant endpoints: ToxPredict which predicts and reports on toxicities for endpoints for an input chemical structure, and ToxCreate which builds and validates a predictive toxicity model based on an input toxicology dataset. Because of the extensible nature of the standardised Framework design, barriers of interoperability between applications and content are removed, as the user may combine data, models and validation from multiple sources in a dependable and time-effective way.},
  keywords  = {cheminformatics, computational sustainability, data mining, machine learning, REST, toxicity},
  pubstate  = {published},
  tppubtype = {article}
}
Wicker, Jörg; Richter, Lothar; Kramer, Stefan
SINDBAD and SiQL: Overview, Applications and Future Developments Book Section
In: Džeroski, Sašo; Goethals, Bart; Panov, Panče (Ed.): Inductive Databases and Constraint-Based Data Mining, pp. 289-309, Springer New York, 2010, ISBN: 978-1-4419-7737-3.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: data mining, inductive databases, machine learning, query languages
@incollection{wicker2010sindbad,
  title     = {{SINDBAD} and {SiQL}: Overview, Applications and Future Developments},
  author    = {Wicker, J{\"o}rg and Richter, Lothar and Kramer, Stefan},
  editor    = {D{\v{z}}eroski, Sa{\v{s}}o and Goethals, Bart and Panov, Pan{\v{c}}e},
  doi       = {10.1007/978-1-4419-7738-0_12},
  isbn      = {978-1-4419-7737-3},
  year      = {2010},
  date      = {2010-01-01},
  booktitle = {Inductive Databases and Constraint-Based Data Mining},
  pages     = {289--309},
  publisher = {Springer New York},
  abstract  = {The chapter gives an overview of the current state of the Sindbad system and planned extensions. Following an introduction to the system and its query language SiQL, we present application scenarios from the areas of gene expression/regulation and small molecules. Next, we describe a web service interface to Sindbad that enables new possibilities for inductive databases (distributing tasks over multiple servers, language and platform independence, \ldots). Finally, we discuss future plans for the system, in particular, to make the system more `declarative' by the use of signatures, to integrate the useful concept of mining views into the system, and to support specific pattern domains like graphs and strings.},
  keywords  = {data mining, inductive databases, machine learning, query languages},
  pubstate  = {published},
  tppubtype = {incollection}
}
2008
Wicker, Jörg; Richter, Lothar; Kessler, Kristina; Kramer, Stefan
SINDBAD and SiQL: An Inductive Database and Query Language in the Relational Model Proceedings Article
In: Daelemans, Walter; Goethals, Bart; Morik, Katharina (Ed.): Machine Learning and Knowledge Discovery in Databases, pp. 690-694, Springer Berlin Heidelberg, 2008, ISBN: 978-3-540-87480-5.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: data mining, inductive databases, machine learning, query languages
@inproceedings{wicker2008sindbad,
  title     = {{SINDBAD} and {SiQL}: An Inductive Database and Query Language in the Relational Model},
  author    = {Wicker, J{\"o}rg and Richter, Lothar and Kessler, Kristina and Kramer, Stefan},
  editor    = {Daelemans, Walter and Goethals, Bart and Morik, Katharina},
  doi       = {10.1007/978-3-540-87481-2_48},
  isbn      = {978-3-540-87480-5},
  year      = {2008},
  date      = {2008-01-01},
  booktitle = {Machine Learning and Knowledge Discovery in Databases},
  volume    = {5212},
  pages     = {690--694},
  publisher = {Springer Berlin Heidelberg},
  series    = {Lecture Notes in Computer Science},
  abstract  = {In this demonstration, we will present the concepts and an implementation of an inductive database \textendash as proposed by Imielinski and Mannila \textendash in the relational model. The goal is to support all steps of the knowledge discovery process on the basis of queries to a database system. The query language SiQL (structured inductive query language), an SQL extension, offers query primitives for feature selection, discretization, pattern mining, clustering, instance-based learning and rule induction. A prototype system processing such queries was implemented as part of the SINDBAD (structured inductive database development) project. To support the analysis of multi-relational data, we incorporated multi-relational distance measures based on set distances and recursive descent. The inclusion of rule-based classification models made it necessary to extend the data model and software architecture significantly. The prototype is applied to three different data sets: gene expression analysis, gene regulation prediction and structure-activity relationships (SARs) of small molecules.},
  keywords  = {data mining, inductive databases, machine learning, query languages},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
Richter, Lothar; Wicker, Jörg; Kessler, Kristina; Kramer, Stefan
An Inductive Database and Query Language in the Relational Model Proceedings Article
In: Proceedings of the 11th International Conference on Extending Database Technology: Advances in Database Technology, pp. 740–744, ACM, 2008, ISBN: 978-1-59593-926-5.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: data mining, inductive databases, machine learning, query languages
@inproceedings{richter2008inductive,
  title     = {An Inductive Database and Query Language in the Relational Model},
  author    = {Richter, Lothar and Wicker, J{\"o}rg and Kessler, Kristina and Kramer, Stefan},
  url       = {https://wicker.nz/nwp-acm/authorize.php?id=N10033},
  doi       = {10.1145/1353343.1353440},
  isbn      = {978-1-59593-926-5},
  year      = {2008},
  date      = {2008-01-01},
  booktitle = {Proceedings of the 11th International Conference on Extending Database Technology: Advances in Database Technology},
  pages     = {740--744},
  publisher = {ACM},
  series    = {EDBT '08},
  abstract  = {In the demonstration, we will present the concepts and an implementation of an inductive database -- as proposed by Imielinski and Mannila -- in the relational model. The goal is to support all steps of the knowledge discovery process, from pre-processing via data mining to post-processing, on the basis of queries to a database system. The query language SIQL (structured inductive query language), an SQL extension, offers query primitives for feature selection, discretization, pattern mining, clustering, instance-based learning and rule induction. A prototype system processing such queries was implemented as part of the SINDBAD (structured inductive database development) project. Key concepts of this system, among others, are the closure of operators and distances between objects. To support the analysis of multi-relational data, we incorporated multi-relational distance measures based on set distances and recursive descent. The inclusion of rule-based classification models made it necessary to extend the data model and the software architecture significantly. The prototype is applied to three different applications: gene expression analysis, gene regulation prediction and structure-activity relationships (SARs) of small molecules.},
  keywords  = {data mining, inductive databases, machine learning, query languages},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
Wicker, Jörg; Brosdau, Christoph; Richter, Lothar; Kramer, Stefan
SINDBAD SAILS: A Service Architecture for Inductive Learning Schemes Proceedings Article
In: Proceedings of the First Workshop on Third Generation Data Mining: Towards Service-Oriented Knowledge Discovery, 2008.
Abstract | Links | BibTeX | Tags: data mining, inductive databases, machine learning, query languages
@inproceedings{wicker2008sindbadsails,
  title     = {{SINDBAD} {SAILS}: A Service Architecture for Inductive Learning Schemes},
  author    = {Wicker, J{\"o}rg and Brosdau, Christoph and Richter, Lothar and Kramer, Stefan},
  url       = {http://www.ecmlpkdd2008.org/files/pdf/workshops/sokd/2.pdf},
  year      = {2008},
  date      = {2008-01-01},
  booktitle = {Proceedings of the First Workshop on Third Generation Data Mining: Towards Service-Oriented Knowledge Discovery},
  abstract  = {The paper presents SINDBAD SAILS (Service Architecture for Inductive Learning Schemes), a Web Service interface to the inductive database SINDBAD. To the best of our knowledge, it is the first time a Web Service interface is provided for an inductive database. The combination of service-oriented architectures and inductive databases is particularly useful, as it enables distributed data mining without the need to install specialized data mining or machine learning software. Moreover, inductive queries can easily be used in almost any kind of programming language. The paper discusses the underlying concepts and explains a sample program making use of SINDBAD SAILS.},
  keywords  = {data mining, inductive databases, machine learning, query languages},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
2006
Kramer, Stefan; Aufschild, Volker; Hapfelmeier, Andreas; Jarasch, Alexander; Kessler, Kristina; Reckow, Stefan; Wicker, Jörg; Richter, Lothar
Inductive Databases in the Relational Model: The Data as the Bridge Proceedings Article
In: Bonchi, Francesco; Boulicaut, Jean-François (Ed.): Knowledge Discovery in Inductive Databases, pp. 124-138, Springer Berlin Heidelberg, 2006, ISBN: 978-3-540-33292-3.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: data mining, inductive databases, machine learning, query languages
@inproceedings{kramer2006inductive,
  title     = {Inductive Databases in the Relational Model: The Data as the Bridge},
  author    = {Kramer, Stefan and Aufschild, Volker and Hapfelmeier, Andreas and Jarasch, Alexander and Kessler, Kristina and Reckow, Stefan and Wicker, J{\"o}rg and Richter, Lothar},
  editor    = {Bonchi, Francesco and Boulicaut, Jean-Fran{\c{c}}ois},
  doi       = {10.1007/11733492_8},
  isbn      = {978-3-540-33292-3},
  year      = {2006},
  date      = {2006-01-01},
  booktitle = {Knowledge Discovery in Inductive Databases},
  volume    = {3933},
  pages     = {124--138},
  publisher = {Springer Berlin Heidelberg},
  series    = {Lecture Notes in Computer Science},
  abstract  = {We present a new and comprehensive approach to inductive databases in the relational model. The main contribution is a new inductive query language extending SQL, with the goal of supporting the whole knowledge discovery process, from pre-processing via data mining to post-processing. A prototype system supporting the query language was developed in the SINDBAD (structured inductive database development) project. Setting aside models and focusing on distance-based and instance-based methods, closure can easily be achieved. An example scenario from the area of gene expression data analysis demonstrates the power and simplicity of the concept. We hope that this preliminary work will help to bring the fundamental issues, such as the integration of various pattern domains and data mining techniques, to the attention of the inductive database community.},
  keywords  = {data mining, inductive databases, machine learning, query languages},
  pubstate  = {published},
  tppubtype = {inproceedings}
}