2016
Wicker, Jörg; Tyukin, Andrey; Kramer, Stefan
A Nonlinear Label Compression and Transformation Method for Multi-Label Classification using Autoencoders Proceedings Article
In: Bailey, James; Khan, Latifur; Washio, Takashi; Dobbie, Gill; Huang, Zhexue Joshua; Wang, Ruili (Ed.): The 20th Pacific Asia Conference on Knowledge Discovery and Data Mining (PAKDD), pp. 328-340, Springer International Publishing, Switzerland, 2016, ISBN: 978-3-319-31753-3.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: autoencoders, label compression, machine learning, multi-label classification
@inproceedings{wicker2016nonlinear,
  title = {A Nonlinear Label Compression and Transformation Method for Multi-Label Classification using Autoencoders},
  author = {J{\"o}rg Wicker and Andrey Tyukin and Stefan Kramer},
  editor = {James Bailey and Latifur Khan and Takashi Washio and Gill Dobbie and Zhexue Joshua Huang and Ruili Wang},
  url = {http://dx.doi.org/10.1007/978-3-319-31753-3_27},
  doi = {10.1007/978-3-319-31753-3_27},
  isbn = {978-3-319-31753-3},
  year = {2016},
  date = {2016-04-16},
  booktitle = {The 20th Pacific Asia Conference on Knowledge Discovery and Data Mining (PAKDD)},
  volume = {9651},
  pages = {328--340},
  publisher = {Springer International Publishing},
  address = {Switzerland},
  series = {Lecture Notes in Computer Science},
  abstract = {Multi-label classification targets the prediction of multiple interdependent and non-exclusive binary target variables. Transformation-based algorithms transform the data set such that regular single-label algorithms can be applied to the problem. A special type of transformation-based classifiers are label compression methods, that compress the labels and then mostly use single label classifiers to predict the compressed labels. So far, there are no compression-based algorithms follow a problem transformation approach and address non-linear dependencies in the labels. In this paper, we propose a new algorithm, called Maniac (Multi-lAbel classificatioN usIng AutoenCoders), which extracts the non-linear dependencies by compressing the labels using autoencoders. We adapt the training process of autoencoders in a way to make them more suitable for a parameter optimization in the context of this algorithm. The method is evaluated on eight standard multi-label data sets. Experiments show that despite not producing a good ranking, Maniac generates a particularly good bipartition of the labels into positives and negatives. This is caused by rather strong predictions with either really high or low probability. Additionally, the algorithm seems to perform better given more labels and a higher label cardinality in the data set.},
  keywords = {autoencoders, label compression, machine learning, multi-label classification},
  pubstate = {published},
  tppubtype = {inproceedings}
}
Wicker, Jörg; Lorsbach, Tim; Gütlein, Martin; Schmid, Emanuel; Latino, Diogo; Kramer, Stefan; Fenner, Kathrin
enviPath – The Environmental Contaminant Biotransformation Pathway Resource Journal Article
In: Nucleic Acids Research, vol. 44, no. D1, pp. D502-D508, 2016.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: biodegradation, cheminformatics, computational sustainability, data mining, enviPath, linked data, machine learning, metabolic pathways, multi-label classification
@article{wicker2016envipath,
  title = {{enviPath} -- The Environmental Contaminant Biotransformation Pathway Resource},
  author = {J{\"o}rg Wicker and Tim Lorsbach and Martin G{\"u}tlein and Emanuel Schmid and Diogo Latino and Stefan Kramer and Kathrin Fenner},
  editor = {Michael Galperin},
  url = {http://nar.oxfordjournals.org/content/44/D1/D502.abstract},
  doi = {10.1093/nar/gkv1229},
  year = {2016},
  date = {2016-01-01},
  journal = {Nucleic Acids Research},
  volume = {44},
  number = {D1},
  pages = {D502--D508},
  abstract = {The University of Minnesota Biocatalysis/Biodegradation Database and Pathway Prediction System (UM-BBD/PPS) has been a unique resource covering microbial biotransformation pathways of primarily xenobiotic chemicals for over 15 years. This paper introduces the successor system, enviPath (The Environmental Contaminant Biotransformation Pathway Resource), which is a complete redesign and reimplementation of UM-BBD/PPS. enviPath uses the database from the UM-BBD/PPS as a basis, extends the use of this database, and allows users to include their own data to support multiple use cases. Relative reasoning is supported for the refinement of predictions and to allow its extensions in terms of previously published, but not implemented machine learning models. User access is simplified by providing a REST API that simplifies the inclusion of enviPath into existing workflows. An RDF database is used to enable simple integration with other databases. enviPath is publicly available at https://envipath.org with free and open access to its core data.},
  keywords = {biodegradation, cheminformatics, computational sustainability, data mining, enviPath, linked data, machine learning, metabolic pathways, multi-label classification},
  pubstate = {published},
  tppubtype = {article}
}
Raza, Atif; Wicker, Jörg; Kramer, Stefan
Trading Off Accuracy for Efficiency by Randomized Greedy Warping Proceedings Article
In: Proceedings of the 31st Annual ACM Symposium on Applied Computing, pp. 883-890, ACM, New York, NY, USA, 2016, ISBN: 978-1-4503-3739-7.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: data mining, dynamic time warping, time series
@inproceedings{raza2016trading,
  title = {Trading Off Accuracy for Efficiency by Randomized Greedy Warping},
  author = {Atif Raza and J{\"o}rg Wicker and Stefan Kramer},
  url = {https://wicker.nz/nwp-acm/authorize.php?id=N10030},
  doi = {10.1145/2851613.2851651},
  isbn = {978-1-4503-3739-7},
  year = {2016},
  date = {2016-01-01},
  booktitle = {Proceedings of the 31st Annual ACM Symposium on Applied Computing},
  pages = {883--890},
  publisher = {ACM},
  address = {New York, NY, USA},
  series = {SAC '16},
  abstract = {Dynamic Time Warping (DTW) is a widely used distance measure for time series data mining. Its quadratic complexity requires the application of various techniques (e.g. warping constraints, lower-bounds) for deployment in real-time scenarios. In this paper we propose a randomized greedy warping algorithm for finding similarity between time series instances. We show that the proposed algorithm outperforms the simple greedy approach and also provides very good time series similarity approximation consistently, as compared to DTW. We show that the Randomized Time Warping (RTW) can be used in place of DTW as a fast similarity approximation technique by trading some classification accuracy for very fast classification.},
  keywords = {data mining, dynamic time warping, time series},
  pubstate = {published},
  tppubtype = {inproceedings}
}
Williams, Jonathan; Stönner, Christof; Wicker, Jörg; Krauter, Nicolas; Derstorff, Bettina; Bourtsoukidis, Efstratios; Klüpfel, Thomas; Kramer, Stefan
Cinema audiences reproducibly vary the chemical composition of air during films, by broadcasting scene specific emissions on breath Journal Article
In: Scientific Reports, vol. 6, 2016.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: atmospheric chemistry, causality, cheminformatics, data mining, emotional response analysis, smell of fear, sof, time series
@article{williams2015element,
  title = {Cinema audiences reproducibly vary the chemical composition of air during films, by broadcasting scene specific emissions on breath},
  author = {Jonathan Williams and Christof St{\"o}nner and J{\"o}rg Wicker and Nicolas Krauter and Bettina Derstorff and Efstratios Bourtsoukidis and Thomas Kl{\"u}pfel and Stefan Kramer},
  url = {http://www.nature.com/articles/srep25464},
  doi = {10.1038/srep25464},
  year = {2016},
  date = {2016-01-01},
  urldate = {2016-01-01},
  journal = {Scientific Reports},
  volume = {6},
  publisher = {Nature Publishing Group},
  abstract = {Human beings continuously emit chemicals into the air by breath and through the skin. In order to determine whether these emissions vary predictably in response to audiovisual stimuli, we have continuously monitored carbon dioxide and over one hundred volatile organic compounds in a cinema. It was found that many airborne chemicals in cinema air varied distinctively and reproducibly with time for a particular film, even in different screenings to different audiences. Application of scene labels and advanced data mining methods revealed that specific film events, namely ``suspense'' or ``comedy'' caused audiences to change their emission of specific chemicals. These event-type synchronous, broadcasted human chemosignals open the possibility for objective and non-invasive assessment of a human group response to stimuli by continuous measurement of chemicals in air. Such methods can be applied to research fields such as psychology and biology, and be valuable to industries such as film making and advertising.},
  keywords = {atmospheric chemistry, causality, cheminformatics, data mining, emotional response analysis, smell of fear, sof, time series},
  pubstate = {published},
  tppubtype = {article}
}
2015
Wicker, Jörg; Krauter, Nicolas; Derstorff, Bettina; Stönner, Christof; Bourtsoukidis, Efstratios; Klüpfel, Thomas; Williams, Jonathan; Kramer, Stefan
Cinema Data Mining: The Smell of Fear Proceedings Article
In: Proceedings of the 21st ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, pp. 1295-1304, ACM, New York, NY, USA, 2015, ISBN: 978-1-4503-3664-2.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: atmospheric chemistry, breath analysis, causality, cheminformatics, cinema data mining, data mining, emotional response analysis, movie analysis, smell of fear, sof, time series
@inproceedings{wicker2015cinema,
  title = {Cinema Data Mining: The Smell of Fear},
  author = {J{\"o}rg Wicker and Nicolas Krauter and Bettina Derstorff and Christof St{\"o}nner and Efstratios Bourtsoukidis and Thomas Kl{\"u}pfel and Jonathan Williams and Stefan Kramer},
  url = {https://wicker.nz/nwp-acm/authorize.php?id=N10031},
  doi = {10.1145/2783258.2783404},
  isbn = {978-1-4503-3664-2},
  year = {2015},
  date = {2015-01-01},
  booktitle = {Proceedings of the 21st ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
  pages = {1295--1304},
  publisher = {ACM},
  address = {New York, NY, USA},
  series = {KDD '15},
  abstract = {While the physiological response of humans to emotional events or stimuli is well-investigated for many modalities (like EEG, skin resistance, ...), surprisingly little is known about the exhalation of so-called Volatile Organic Compounds (VOCs) at quite low concentrations in response to such stimuli. VOCs are molecules of relatively small mass that quickly evaporate or sublimate and can be detected in the air that surrounds us. The paper introduces a new field of application for data mining, where trace gas responses of people reacting on-line to films shown in cinemas (or movie theaters) are related to the semantic content of the films themselves. To do so, we measured the VOCs from a movie theatre over a whole month in intervals of thirty seconds, and annotated the screened films by a controlled vocabulary compiled from multiple sources. To gain a better understanding of the data and to reveal unknown relationships, we have built prediction models for so-called forward prediction (the prediction of future VOCs from the past), backward prediction (the prediction of past scene labels from future VOCs) and for some forms of abductive reasoning and Granger causality. Experimental results show that some VOCs and some labels can be predicted with relatively low error, and that hints for causality with low p-values can be detected in the data.},
  keywords = {atmospheric chemistry, breath analysis, causality, cheminformatics, cinema data mining, data mining, emotional response analysis, movie analysis, smell of fear, sof, time series},
  pubstate = {published},
  tppubtype = {inproceedings}
}
Tyukin, Andrey; Kramer, Stefan; Wicker, Jörg
Scavenger – A Framework for the Efficient Evaluation of Dynamic and Modular Algorithms Proceedings Article
In: Bifet, Albert; May, Michael; Zadrozny, Bianca; Gavalda, Ricard; Pedreschi, Dino; Cardoso, Jaime; Spiliopoulou, Myra (Ed.): Machine Learning and Knowledge Discovery in Databases, pp. 325-328, Springer International Publishing, 2015, ISBN: 978-3-319-23460-1.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: autoencoders, distributed processing, framework, large-scale, Scavenger
@inproceedings{tyukin2015scavenger,
  title = {Scavenger -- A Framework for the Efficient Evaluation of Dynamic and Modular Algorithms},
  author = {Andrey Tyukin and Stefan Kramer and J{\"o}rg Wicker},
  editor = {Albert Bifet and Michael May and Bianca Zadrozny and Ricard Gavalda and Dino Pedreschi and Jaime Cardoso and Myra Spiliopoulou},
  url = {http://dx.doi.org/10.1007/978-3-319-23461-8_40},
  doi = {10.1007/978-3-319-23461-8_40},
  isbn = {978-3-319-23460-1},
  year = {2015},
  date = {2015-01-01},
  booktitle = {Machine Learning and Knowledge Discovery in Databases},
  volume = {9286},
  pages = {325--328},
  publisher = {Springer International Publishing},
  series = {Lecture Notes in Computer Science},
  abstract = {Machine Learning methods and algorithms are often highly modular in the sense that they rely on a large number of subalgorithms that are in principle interchangeable. For example, it is often possible to use various kinds of pre- and post-processing and various base classifiers or regressors as components of the same modular approach. We propose a framework, called Scavenger, that allows evaluating whole families of conceptually similar algorithms efficiently. The algorithms are represented as compositions, couplings and products of atomic subalgorithms. This allows partial results to be cached and shared between different instances of a modular algorithm, so that potentially expensive partial results need not be recomputed multiple times. Furthermore, our framework deals with issues of the parallel execution, load balancing, and with the backup of partial results for the case of implementation or runtime errors. Scavenger is licensed under the GPLv3 and can be downloaded freely at https://github.com/jorro/scavenger.},
  keywords = {autoencoders, distributed processing, framework, large-scale, Scavenger},
  pubstate = {published},
  tppubtype = {inproceedings}
}
2014
Tyukin, Andrey; Kramer, Stefan; Wicker, Jörg
BMaD — A Boolean Matrix Decomposition Framework Proceedings Article
In: Calders, Toon; Esposito, Floriana; Hüllermeier, Eyke; Meo, Rosa (Ed.): Machine Learning and Knowledge Discovery in Databases, pp. 481-484, Springer Berlin Heidelberg, 2014, ISBN: 978-3-662-44844-1.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: Boolean matrix decomposition, data mining, framework
@inproceedings{tyukin2014bmad,
  title = {{BMaD} -- A {Boolean} Matrix Decomposition Framework},
  author = {Andrey Tyukin and Stefan Kramer and J{\"o}rg Wicker},
  editor = {Toon Calders and Floriana Esposito and Eyke H{\"u}llermeier and Rosa Meo},
  url = {http://dx.doi.org/10.1007/978-3-662-44845-8_40},
  doi = {10.1007/978-3-662-44845-8_40},
  isbn = {978-3-662-44844-1},
  year = {2014},
  date = {2014-01-01},
  booktitle = {Machine Learning and Knowledge Discovery in Databases},
  volume = {8726},
  pages = {481--484},
  publisher = {Springer Berlin Heidelberg},
  series = {Lecture Notes in Computer Science},
  abstract = {Boolean matrix decomposition is a method to obtain a compressed
    representation of a matrix with Boolean entries. We present a modular
    framework that unifies several Boolean matrix decomposition algorithms, and
    provide methods to evaluate their performance. The main advantages of
    the framework are its modular approach and hence the flexible
    combination of the steps of a Boolean matrix decomposition and the
    capability of handling missing values. The framework is licensed under
    the GPLv3 and can be downloaded freely at
    http://projects.informatik.uni-mainz.de/bmad.},
  keywords = {Boolean matrix decomposition, data mining, framework},
  pubstate = {published},
  tppubtype = {inproceedings}
}
Boolean matrix decomposition is a method to obtain a compressed
representation of a matrix with Boolean entries. We present a modular
framework that unifies several Boolean matrix decomposition algorithms, and
provide methods to evaluate their performance. The main advantages of
the framework are its modular approach and hence the flexible
combination of the steps of a Boolean matrix decomposition and the
capability of handling missing values. The framework is licensed under
the GPLv3 and can be downloaded freely at
http://projects.informatik.uni-mainz.de/bmad.
2013
Wicker, Jörg
Large Classifier Systems in Bio- and Cheminformatics PhD Thesis
Technische Universität München, 2013.
Abstract | Links | BibTeX | Tags: biodegradation, bioinformatics, cheminformatics, computational sustainability, data mining, enviPath, machine learning, multi-label classification, multi-relational learning, toxicity
@phdthesis{wicker2013large,
  title = {Large Classifier Systems in Bio- and Cheminformatics},
  author = {J{\"o}rg Wicker},
  url = {http://mediatum.ub.tum.de/node?id=1165858},
  year = {2013},
  date = {2013-01-01},
  school = {Technische Universit{\"a}t M{\"u}nchen},
  abstract = {Large classifier systems are machine learning algorithms that use multiple
    classifiers to improve the prediction of target values in advanced
    classification tasks. Although learning problems in bio- and
    cheminformatics commonly provide data in schemes suitable for large
    classifier systems, they are rarely used in these domains. This thesis
    introduces two new classifiers incorporating systems of classifiers
    using Boolean matrix decomposition to handle data in a schema that
    often occurs in bio- and cheminformatics.
    The first approach, called MLC-BMaD (multi-label classification using
    Boolean matrix decomposition), uses Boolean matrix decomposition to
    decompose the labels in a multi-label classification task. The
    decomposed matrices are a compact representation of the information
    in the labels (first matrix) and the dependencies among the labels
    (second matrix). The first matrix is used in a further multi-label
    classification while the second matrix is used to generate the final
    matrix from the predicted values of the first matrix.
    MLC-BMaD was evaluated on six standard multi-label data sets, the
    experiments showed that MLC-BMaD can perform particularly well on data
    sets with a high number of labels and a small number of instances and
    can outperform standard multi-label algorithms.
    Subsequently, MLC-BMaD is extended to a special case of
    multi-relational learning, by considering the labels not as simple
    labels, but instances. The algorithm, called ClassFact
    (Classification factorization), uses both matrices in a multi-label
    classification. Each label represents a mapping between two
    instances.
    Experiments on three data sets from the domain of bioinformatics show
    that ClassFact can outperform the baseline method, which merges the
    relations into one, on hard classification tasks.
    Furthermore, large classifier systems are used on two cheminformatics
    data sets, the first one is used to predict the environmental fate of
    chemicals by predicting biodegradation pathways. The second is a data
    set from the domain of predictive toxicology. In biodegradation
    pathway prediction, I extend a knowledge-based system and incorporate
    a machine learning approach to predict a probability for
    biotransformation products based on the structure- and knowledge-based
    predictions of products, which are based on transformation rules. The
    use of multi-label classification improves the performance of the
    classifiers and extends the number of transformation rules that can be
    covered.
    For the prediction of toxic effects of chemicals, I applied large
    classifier systems to the ToxCast\texttrademark{} data set, which maps
    toxic effects to chemicals. As the given toxic effects are not easy to
    predict due to missing information and a skewed class
    distribution, I introduce a filtering step in the multi-label
    classification, which finds labels that are usable in multi-label
    prediction and does not take the others in the
    prediction into account. Experiments show
    that this approach can improve upon the baseline method using binary
    classification, as well as multi-label approaches using no filtering.
    The presented results show that large classifier systems can play a
    role in future research challenges, especially in bio- and
    cheminformatics, where data sets frequently consist of more complex
    structures and data can be rather small in terms of the number of
    instances compared to other domains.},
  keywords = {biodegradation, bioinformatics, cheminformatics, computational sustainability, data mining, enviPath, machine learning, multi-label classification, multi-relational learning, toxicity},
  pubstate = {published},
  tppubtype = {phdthesis}
}
Large classifier systems are machine learning algorithms that use multiple
classifiers to improve the prediction of target values in advanced
classification tasks. Although learning problems in bio- and
cheminformatics commonly provide data in schemes suitable for large
classifier systems, they are rarely used in these domains. This thesis
introduces two new classifiers incorporating systems of classifiers
using Boolean matrix decomposition to handle data in a schema that
often occurs in bio- and cheminformatics.
The first approach, called MLC-BMaD (multi-label classification using
Boolean matrix decomposition), uses Boolean matrix decomposition to
decompose the labels in a multi-label classification task. The
decomposed matrices are a compact representation of the information
in the labels (first matrix) and the dependencies among the labels
(second matrix). The first matrix is used in a further multi-label
classification while the second matrix is used to generate the final
matrix from the predicted values of the first matrix.
MLC-BMaD was evaluated on six standard multi-label data sets, the
experiments showed that MLC-BMaD can perform particularly well on data
sets with a high number of labels and a small number of instances and
can outperform standard multi-label algorithms.
Subsequently, MLC-BMaD is extended to a special case of
multi-relational learning, by considering the labels not as simple
labels, but instances. The algorithm, called ClassFact
(Classification factorization), uses both matrices in a multi-label
classification. Each label represents a mapping between two
instances.
Experiments on three data sets from the domain of bioinformatics show
that ClassFact can outperform the baseline method, which merges the
relations into one, on hard classification tasks.
Furthermore, large classifier systems are used on two cheminformatics
data sets, the first one is used to predict the environmental fate of
chemicals by predicting biodegradation pathways. The second is a data
set from the domain of predictive toxicology. In biodegradation
pathway prediction, I extend a knowledge-based system and incorporate
a machine learning approach to predict a probability for
biotransformation products based on the structure- and knowledge-based
predictions of products, which are based on transformation rules. The
use of multi-label classification improves the performance of the
classifiers and extends the number of transformation rules that can be
covered.
For the prediction of toxic effects of chemicals, I applied large
classifier systems to the ToxCast™ data set, which maps
toxic effects to chemicals. As the given toxic effects are not easy to
predict due to missing information and a skewed class
distribution, I introduce a filtering step in the multi-label
classification, which finds labels that are usable in multi-label
prediction and does not take the others in the
prediction into account. Experiments show
that this approach can improve upon the baseline method using binary
classification, as well as multi-label approaches using no filtering.
The presented results show that large classifier systems can play a
role in future research challenges, especially in bio- and
cheminformatics, where data sets frequently consist of more complex
structures and data can be rather small in terms of the number of
instances compared to other domains.
2012
Wicker, Jörg; Pfahringer, Bernhard; Kramer, Stefan
Multi-label Classification Using Boolean Matrix Decomposition Proceedings Article
In: Proceedings of the 27th Annual ACM Symposium on Applied Computing, pp. 179–186, ACM, 2012, ISBN: 978-1-4503-0857-1.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: associations, Boolean matrix decomposition, machine learning, multi-label classification
@inproceedings{wicker2012multi,
  title = {Multi-label Classification Using {Boolean} Matrix Decomposition},
  author = {J{\"o}rg Wicker and Bernhard Pfahringer and Stefan Kramer},
  url = {https://wicker.nz/nwp-acm/authorize.php?id=N10032},
  doi = {10.1145/2245276.2245311},
  isbn = {978-1-4503-0857-1},
  year = {2012},
  date = {2012-01-01},
  booktitle = {Proceedings of the 27th Annual ACM Symposium on Applied Computing},
  pages = {179--186},
  publisher = {ACM},
  series = {SAC '12},
  abstract = {This paper introduces a new multi-label classifier based on Boolean matrix decomposition. Boolean matrix decomposition is used to extract, from the full label matrix, latent labels representing useful Boolean combinations of the original labels. Base level models predict latent labels, which are subsequently transformed into the actual labels by Boolean matrix multiplication with the second matrix from the decomposition. The new method is tested on six publicly available datasets with varying numbers of labels. The experimental evaluation shows that the new method works particularly well on datasets with a large number of labels and strong dependencies among them.},
  keywords = {associations, Boolean matrix decomposition, machine learning, multi-label classification},
  pubstate = {published},
  tppubtype = {inproceedings}
}
2010
Hardy, Barry; Douglas, Nicki; Helma, Christoph; Rautenberg, Micha; Jeliazkova, Nina; Jeliazkov, Vedrin; Nikolova, Ivelina; Benigni, Romualdo; Tcheremenskaia, Olga; Kramer, Stefan; Girschick, Tobias; Buchwald, Fabian; Wicker, Jörg; Karwath, Andreas; Gütlein, Martin; Maunz, Andreas; Sarimveis, Haralambos; Melagraki, Georgia; Afantitis, Antreas; Sopasakis, Pantelis; Gallagher, David; Poroikov, Vladimir; Filimonov, Dmitry; Zakharov, Alexey; Lagunin, Alexey; Gloriozova, Tatyana; Novikov, Sergey; Skvortsova, Natalia; Druzhilovsky, Dmitry; Chawla, Sunil; Ghosh, Indira; Ray, Surajit; Patel, Hitesh; Escher, Sylvia
Collaborative development of predictive toxicology applications Journal Article
In: Journal of Cheminformatics, vol. 2, no. 1, pp. 7, 2010, ISSN: 1758-2946.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: cheminformatics, computational sustainability, data mining, machine learning, REST, toxicity
@article{hardy2010collaborative,
  title = {Collaborative development of predictive toxicology applications},
  author = {Barry Hardy and Nicki Douglas and Christoph Helma and Micha Rautenberg and Nina Jeliazkova and Vedrin Jeliazkov and Ivelina Nikolova and Romualdo Benigni and Olga Tcheremenskaia and Stefan Kramer and Tobias Girschick and Fabian Buchwald and J{\"o}rg Wicker and Andreas Karwath and Martin G{\"u}tlein and Andreas Maunz and Haralambos Sarimveis and Georgia Melagraki and Antreas Afantitis and Pantelis Sopasakis and David Gallagher and Vladimir Poroikov and Dmitry Filimonov and Alexey Zakharov and Alexey Lagunin and Tatyana Gloriozova and Sergey Novikov and Natalia Skvortsova and Dmitry Druzhilovsky and Sunil Chawla and Indira Ghosh and Surajit Ray and Hitesh Patel and Sylvia Escher},
  url = {http://www.jcheminf.com/content/2/1/7},
  doi = {10.1186/1758-2946-2-7},
  issn = {1758-2946},
  year = {2010},
  date = {2010-01-01},
  journal = {Journal of Cheminformatics},
  volume = {2},
  number = {1},
  pages = {7},
  abstract = {OpenTox provides an interoperable, standards-based Framework for the support of predictive toxicology data management, algorithms, modelling, validation and reporting. It is relevant to satisfying the chemical safety assessment requirements of the REACH legislation as it supports access to experimental data, (Quantitative) Structure-Activity Relationship models, and toxicological information through an integrating platform that adheres to regulatory requirements and OECD validation principles. Initial research defined the essential components of the Framework including the approach to data access, schema and management, use of controlled vocabularies and ontologies, architecture, web service and communications protocols, and selection and integration of algorithms for predictive modelling. OpenTox provides end-user oriented tools to non-computational specialists, risk assessors, and toxicological experts in addition to Application Programming Interfaces (APIs) for developers of new applications. OpenTox actively supports public standards for data representation, interfaces, vocabularies and ontologies, Open Source approaches to core platform components, and community-based collaboration approaches, so as to progress system interoperability goals. The OpenTox Framework includes APIs and services for compounds, datasets, features, algorithms, models, ontologies, tasks, validation, and reporting which may be combined into multiple applications satisfying a variety of different user needs. OpenTox applications are based on a set of distributed, interoperable OpenTox API-compliant REST web services. The OpenTox approach to ontology allows for efficient mapping of complementary data coming from different datasets into a unifying structure having a shared terminology and representation. Two initial OpenTox applications are presented as an illustration of the potential impact of OpenTox for high-quality and consistent structure-activity relationship modelling of REACH-relevant endpoints: ToxPredict which predicts and reports on toxicities for endpoints for an input chemical structure, and ToxCreate which builds and validates a predictive toxicity model based on an input toxicology dataset. Because of the extensible nature of the standardised Framework design, barriers of interoperability between applications and content are removed, as the user may combine data, models and validation from multiple sources in a dependable and time-effective way.},
  keywords = {cheminformatics, computational sustainability, data mining, machine learning, REST, toxicity},
  pubstate = {published},
  tppubtype = {article}
}
Wicker, Jörg; Fenner, Kathrin; Ellis, Lynda; Wackett, Larry; Kramer, Stefan
Predicting biodegradation products and pathways: a hybrid knowledge- and machine learning-based approach Journal Article
In: Bioinformatics, vol. 26, no. 6, pp. 814-821, 2010.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: biodegradation, cheminformatics, computational sustainability, enviPath, machine learning, metabolic pathways
@article{wicker2010predicting,
  title = {Predicting biodegradation products and pathways: a hybrid knowledge- and machine learning-based approach},
  author = {J{\"o}rg Wicker and Kathrin Fenner and Lynda Ellis and Larry Wackett and Stefan Kramer},
  url = {http://bioinformatics.oxfordjournals.org/content/26/6/814.full},
  doi = {10.1093/bioinformatics/btq024},
  year = {2010},
  date = {2010-01-01},
  journal = {Bioinformatics},
  volume = {26},
  number = {6},
  pages = {814--821},
  publisher = {Oxford University Press},
  abstract = {Motivation: Current methods for the prediction of biodegradation products and pathways of organic environmental pollutants either do not take into account domain knowledge or do not provide probability estimates. In this article, we propose a hybrid knowledge- and machine learning-based approach to overcome these limitations in the context of the University of Minnesota Pathway Prediction System (UM-PPS). The proposed solution performs relative reasoning in a machine learning framework, and obtains one probability estimate for each biotransformation rule of the system. As the application of a rule then depends on a threshold for the probability estimate, the trade-off between recall (sensitivity) and precision (selectivity) can be addressed and leveraged in practice. Results: Results from leave-one-out cross-validation show that a recall and precision of $\sim$0.8 can be achieved for a subset of 13 transformation rules. Therefore, it is possible to optimize precision without compromising recall. We are currently integrating the results into an experimental version of the UM-PPS server. Availability: The program is freely available on the web at http://wwwkramer.in.tum.de/research/applications/biodegradation/data. Contact: kramer@in.tum.de},
  keywords = {biodegradation, cheminformatics, computational sustainability, enviPath, machine learning, metabolic pathways},
  pubstate = {published},
  tppubtype = {article}
}
Wicker, Jörg; Richter, Lothar; Kramer, Stefan
SINDBAD and SiQL: Overview, Applications and Future Developments Book Section
In: Džeroski, Sašo; Goethals, Bart; Panov, Panče (Ed.): Inductive Databases and Constraint-Based Data Mining, pp. 289-309, Springer New York, 2010, ISBN: 978-1-4419-7737-3.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: data mining, inductive databases, machine learning, query languages
@incollection{wicker2010sindbad,
title = {{SINDBAD} and {SiQL}: Overview, Applications and Future Developments},
author = {J{\"o}rg Wicker and Lothar Richter and Stefan Kramer},
editor = {Sa{\v{s}}o D{\v{z}}eroski and Bart Goethals and Pan{\v{c}}e Panov},
url = {http://dx.doi.org/10.1007/978-1-4419-7738-0_12},
doi = {10.1007/978-1-4419-7738-0_12},
isbn = {978-1-4419-7737-3},
year = {2010},
date = {2010-01-01},
booktitle = {Inductive Databases and Constraint-Based Data Mining},
pages = {289--309},
publisher = {Springer New York},
abstract = {The chapter gives an overview of the current state of the Sindbad system and planned extensions. Following an introduction to the system and its query language SiQL, we present application scenarios from the areas of gene expression/regulation and small molecules. Next, we describe a web service interface to Sindbad that enables new possibilities for inductive databases (distributing tasks over multiple servers, language and platform independence, \ldots). Finally, we discuss future plans for the system, in particular, to make the system more `declarative' by the use of signatures, to integrate the useful concept of mining views into the system, and to support specific pattern domains like graphs and strings.},
keywords = {data mining, inductive databases, machine learning, query languages},
pubstate = {published},
tppubtype = {incollection}
}
2008
Wicker, Jörg; Richter, Lothar; Kessler, Kristina; Kramer, Stefan
SINDBAD and SiQL: An Inductive Database and Query Language in the Relational Model Proceedings Article
In: Daelemans, Walter; Goethals, Bart; Morik, Katharina (Ed.): Machine Learning and Knowledge Discovery in Databases, pp. 690-694, Springer Berlin Heidelberg, 2008, ISBN: 978-3-540-87480-5.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: data mining, inductive databases, machine learning, query languages
@inproceedings{wicker2008sindbad,
title = {{SINDBAD} and {SiQL}: An Inductive Database and Query Language in the Relational Model},
author = {J{\"o}rg Wicker and Lothar Richter and Kristina Kessler and Stefan Kramer},
editor = {Walter Daelemans and Bart Goethals and Katharina Morik},
url = {http://dx.doi.org/10.1007/978-3-540-87481-2_48},
doi = {10.1007/978-3-540-87481-2_48},
isbn = {978-3-540-87480-5},
year = {2008},
date = {2008-01-01},
booktitle = {Machine Learning and Knowledge Discovery in Databases},
volume = {5212},
pages = {690--694},
publisher = {Springer Berlin Heidelberg},
series = {Lecture Notes in Computer Science},
abstract = {In this demonstration, we will present the concepts and an implementation of an inductive database -- as proposed by Imielinski and Mannila -- in the relational model. The goal is to support all steps of the knowledge discovery process on the basis of queries to a database system. The query language SiQL (structured inductive query language), an SQL extension, offers query primitives for feature selection, discretization, pattern mining, clustering, instance-based learning and rule induction. A prototype system processing such queries was implemented as part of the SINDBAD (structured inductive database development) project. To support the analysis of multi-relational data, we incorporated multi-relational distance measures based on set distances and recursive descent. The inclusion of rule-based classification models made it necessary to extend the data model and software architecture significantly. The prototype is applied to three different data sets: gene expression analysis, gene regulation prediction and structure-activity relationships (SARs) of small molecules.},
keywords = {data mining, inductive databases, machine learning, query languages},
pubstate = {published},
tppubtype = {inproceedings}
}
Richter, Lothar; Wicker, Jörg; Kessler, Kristina; Kramer, Stefan
An Inductive Database and Query Language in the Relational Model Proceedings Article
In: Proceedings of the 11th International Conference on Extending Database Technology: Advances in Database Technology, pp. 740–744, ACM, 2008, ISBN: 978-1-59593-926-5.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: data mining, inductive databases, machine learning, query languages
@inproceedings{richter2008inductive,
title = {An Inductive Database and Query Language in the Relational Model},
author = {Lothar Richter and J{\"o}rg Wicker and Kristina Kessler and Stefan Kramer},
url = {https://wicker.nz/nwp-acm/authorize.php?id=N10033},
doi = {10.1145/1353343.1353440},
isbn = {978-1-59593-926-5},
year = {2008},
date = {2008-01-01},
booktitle = {Proceedings of the 11th International Conference on Extending Database Technology: Advances in Database Technology},
pages = {740--744},
publisher = {ACM},
series = {EDBT '08},
abstract = {In the demonstration, we will present the concepts and an implementation of an inductive database -- as proposed by Imielinski and Mannila -- in the relational model. The goal is to support all steps of the knowledge discovery process, from pre-processing via data mining to post-processing, on the basis of queries to a database system. The query language SIQL (structured inductive query language), an SQL extension, offers query primitives for feature selection, discretization, pattern mining, clustering, instance-based learning and rule induction. A prototype system processing such queries was implemented as part of the SINDBAD (structured inductive database development) project. Key concepts of this system, among others, are the closure of operators and distances between objects. To support the analysis of multi-relational data, we incorporated multi-relational distance measures based on set distances and recursive descent. The inclusion of rule-based classification models made it necessary to extend the data model and the software architecture significantly. The prototype is applied to three different applications: gene expression analysis, gene regulation prediction and structure-activity relationships (SARs) of small molecules.},
keywords = {data mining, inductive databases, machine learning, query languages},
pubstate = {published},
tppubtype = {inproceedings}
}
Wicker, Jörg; Brosdau, Christoph; Richter, Lothar; Kramer, Stefan
SINDBAD SAILS: A Service Architecture for Inductive Learning Schemes Proceedings Article
In: Proceedings of the First Workshop on Third Generation Data Mining: Towards Service-Oriented Knowledge Discovery, 2008.
Abstract | Links | BibTeX | Tags: data mining, inductive databases, machine learning, query languages
@inproceedings{wicker2008sindbadsails,
title = {{SINDBAD} {SAILS}: A Service Architecture for Inductive Learning Schemes},
author = {J{\"o}rg Wicker and Christoph Brosdau and Lothar Richter and Stefan Kramer},
url = {http://www.ecmlpkdd2008.org/files/pdf/workshops/sokd/2.pdf},
year = {2008},
date = {2008-01-01},
booktitle = {Proceedings of the First Workshop on Third Generation Data Mining: Towards Service-Oriented Knowledge Discovery},
abstract = {The paper presents SINDBAD SAILS (Service Architecture for Inductive Learning Schemes), a Web Service interface to the inductive database SINDBAD. To the best of our knowledge, it is the first time a Web Service interface is provided for an inductive database. The combination of service-oriented architectures and inductive databases is particularly useful, as it enables distributed data mining without the need to install specialized data mining or machine learning software. Moreover, inductive queries can easily be used in almost any kind of programming language. The paper discusses the underlying concepts and explains a sample program making use of SINDBAD SAILS.},
keywords = {data mining, inductive databases, machine learning, query languages},
pubstate = {published},
tppubtype = {inproceedings}
}
Wicker, Jörg; Fenner, Kathrin; Ellis, Lynda; Wackett, Larry; Kramer, Stefan
Machine Learning and Data Mining Approaches to Biodegradation Pathway Prediction Proceedings Article
In: Bridewell, Will; Calders, Toon; Medeiros, Ana Karla; Kramer, Stefan; Pechenizkiy, Mykola; Todorovski, Ljupco (Ed.): Proceedings of the Second International Workshop on the Induction of Process Models at ECML PKDD 2008, 2008.
Links | BibTeX | Tags: biodegradation, cheminformatics, computational sustainability, enviPath, machine learning, metabolic pathways
@inproceedings{wicker2008machine,
title = {Machine Learning and Data Mining Approaches to Biodegradation Pathway Prediction},
author = {J{\"o}rg Wicker and Kathrin Fenner and Lynda Ellis and Larry Wackett and Stefan Kramer},
editor = {Will Bridewell and Toon Calders and Ana Karla Medeiros and Stefan Kramer and Mykola Pechenizkiy and Ljupco Todorovski},
url = {http://www.ecmlpkdd2008.org/files/pdf/workshops/ipm/9.pdf},
year = {2008},
date = {2008-01-01},
booktitle = {Proceedings of the Second International Workshop on the Induction of Process Models at ECML PKDD 2008},
keywords = {biodegradation, cheminformatics, computational sustainability, enviPath, machine learning, metabolic pathways},
pubstate = {published},
tppubtype = {inproceedings}
}
2006
Kramer, Stefan; Aufschild, Volker; Hapfelmeier, Andreas; Jarasch, Alexander; Kessler, Kristina; Reckow, Stefan; Wicker, Jörg; Richter, Lothar
Inductive Databases in the Relational Model: The Data as the Bridge Proceedings Article
In: Bonchi, Francesco; Boulicaut, Jean-François (Ed.): Knowledge Discovery in Inductive Databases, pp. 124-138, Springer Berlin Heidelberg, 2006, ISBN: 978-3-540-33292-3.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: data mining, inductive databases, machine learning, query languages
@inproceedings{kramer2006inductive,
title = {Inductive Databases in the Relational Model: The Data as the Bridge},
author = {Stefan Kramer and Volker Aufschild and Andreas Hapfelmeier and Alexander Jarasch and Kristina Kessler and Stefan Reckow and J{\"o}rg Wicker and Lothar Richter},
editor = {Francesco Bonchi and Jean-Fran{\c{c}}ois Boulicaut},
url = {http://dx.doi.org/10.1007/11733492_8},
doi = {10.1007/11733492_8},
isbn = {978-3-540-33292-3},
year = {2006},
date = {2006-01-01},
booktitle = {Knowledge Discovery in Inductive Databases},
volume = {3933},
pages = {124--138},
publisher = {Springer Berlin Heidelberg},
series = {Lecture Notes in Computer Science},
abstract = {We present a new and comprehensive approach to inductive databases in the relational model. The main contribution is a new inductive query language extending SQL, with the goal of supporting the whole knowledge discovery process, from pre-processing via data mining to post-processing. A prototype system supporting the query language was developed in the SINDBAD (structured inductive database development) project. Setting aside models and focusing on distance-based and instance-based methods, closure can easily be achieved. An example scenario from the area of gene expression data analysis demonstrates the power and simplicity of the concept. We hope that this preliminary work will help to bring the fundamental issues, such as the integration of various pattern domains and data mining techniques, to the attention of the inductive database community.},
keywords = {data mining, inductive databases, machine learning, query languages},
pubstate = {published},
tppubtype = {inproceedings}
}