2019
Roeslin, Samuel; Ma, Quincy; Wicker, Jörg; Wotherspoon, Liam
Data integration for the development of a seismic loss prediction model for residential buildings in New Zealand Proceedings Article
In: Cellier, Peggy; Driessens, Kurt (Ed.): Machine Learning and Knowledge Discovery in Databases, pp. 88-100, Springer International Publishing, Cham, 2019, ISBN: 978-3-030-43887-6.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: computational sustainability, earthquakes
@inproceedings{roeslin2019data,
  title     = {Data integration for the development of a seismic loss prediction model for residential buildings in {New Zealand}},
  author    = {Samuel Roeslin and Quincy Ma and J{\"o}rg Wicker and Liam Wotherspoon},
  editor    = {Peggy Cellier and Kurt Driessens},
  url       = {https://link.springer.com/chapter/10.1007/978-3-030-43887-6_8},
  doi       = {10.1007/978-3-030-43887-6_8},
  isbn      = {978-3-030-43887-6},
  year      = {2019},
  date      = {2019-09-19},
  booktitle = {Machine Learning and Knowledge Discovery in Databases},
  pages     = {88--100},
  publisher = {Springer International Publishing},
  address   = {Cham},
  abstract  = {In 2010--2011, New Zealand experienced the most damaging earthquakes in its history. It led to extensive damage to Christchurch buildings, infrastructure and its surroundings; affecting commercial and residential buildings. The direct economic losses represented 20\% of New Zealand's GDP in 2011. Owing to New Zealand's particular insurance structure, the insurance sector contributed to over 80\% of losses for a total of more than NZ\$31 billion. Amongst this, over NZ\$11 billion of the losses arose from residential building claims and were covered either partially or entirely from the NZ government backed Earthquake Commission (EQC) cover insurance scheme. In the process of resolving the claims, EQC collected detailed financial loss data, post-event observations and building characteristics for each of the approximately 434,000 claims lodged following the Canterbury Earthquake sequence (CES). Added to this, the active NZ earthquake engineering community treated the event as a large scale outdoor experiment and collected extensive data on the ground shaking levels, soil conditions, and liquefaction occurrence throughout wider Christchurch. This paper discusses the necessary data preparation process preceding the development of a machine learning seismic loss model. The process draws heavily upon using Geographic Information System (GIS) techniques to aggregate relevant information from multiple databases interpolating data between categories and converting data between continuous and categorical forms. Subsequently, the database is processed, and a residential seismic loss prediction model is developed using machine learning. The aim is to develop a `grey-box' model enabling human interpretability of the decision steps.},
  keywords  = {computational sustainability, earthquakes},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
Williams, Jonathan; Stönner, Christof; Edtbauer, Achim; Derstorff, Bettina; Bourtsoukidis, Efstratios; Klüpfel, Thomas; Krauter, Nicolas; Wicker, Jörg; Kramer, Stefan
What can we learn from the air chemistry of crowds? Proceedings Article
In: Hansel, Armin; Dunkl, Jürgen (Ed.): 8th International Conference on Proton Transfer Reaction Mass Spectrometry and its Applications, pp. 121-123, Innsbruck University Press, Innsbruck, 2019.
Abstract | Links | BibTeX | Tags: atmospheric chemistry, breath analysis, cheminformatics, cinema data mining, data mining, emotional response analysis, machine learning, movie analysis, smell of fear, sof, time series
@inproceedings{williams2019what,
  title     = {What can we learn from the air chemistry of crowds?},
  author    = {Jonathan Williams and Christof St{\"o}nner and Achim Edtbauer and Bettina Derstorff and Efstratios Bourtsoukidis and Thomas Kl{\"u}pfel and Nicolas Krauter and J{\"o}rg Wicker and Stefan Kramer},
  editor    = {Armin Hansel and J{\"u}rgen Dunkl},
  url       = {https://www.ionicon.com/sites/default/files/uploads/doc/Contributions_8th-PTR-MS-Conference-2019_web.pdf#page=122},
  year      = {2019},
  date      = {2019-05-10},
  booktitle = {8th International Conference on Proton Transfer Reaction Mass Spectrometry and its Applications},
  pages     = {121--123},
  publisher = {Innsbruck University Press},
  address   = {Innsbruck},
  abstract  = {Current PTR-MS technology allows hundreds of volatile trace gases in air to be measured every second at extremely low levels (parts per trillion). These instruments are often used in atmospheric research on planes and ships and even in the Amazon rainforest. Recently, we have used this technology to examine air composition changes caused by large groups of people (10,000--30,000) under real world conditions at a football match and in a movie theater. In both cases the trace gas signatures measured in ambient air are shown to reflect crowd behavior. By applying advanced data mining techniques we have shown that groups of people reproducibly respond to certain emotional stimuli (e.g. suspense and comedy) by exhaling specific trace gases. Furthermore, we explore whether this information can be used to determine the age classification of films.},
  keywords  = {atmospheric chemistry, breath analysis, cheminformatics, cinema data mining, data mining, emotional response analysis, machine learning, movie analysis, smell of fear, sof, time series},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
Dabiri, Yasamin; Gama-Brambila, Rodrigo A.; Taskova, Katerina; Herold, Kristina; Reuter, Stefanie; Adjaye, James; Utikal, Jochen; Mrowka, Ralf; Wang, Jichang; Andrade-Navarro, Miguel A.; Cheng, Xinlai
Imidazopyridines as Potent KDM5 Demethylase Inhibitors Promoting Reprogramming Efficiency of Human iPSCs Journal Article
In: iScience, vol. 12, pp. 168-181, 2019, ISSN: 2589-0042.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: Biochemistry, Biological Sciences, Molecular Biology
@article{Dabiri2019Imidazopyridines,
  title     = {Imidazopyridines as Potent {KDM5} Demethylase Inhibitors Promoting Reprogramming Efficiency of Human {iPSCs}},
  author    = {Yasamin Dabiri and Rodrigo A. Gama-Brambila and Katerina Taskova and Kristina Herold and Stefanie Reuter and James Adjaye and Jochen Utikal and Ralf Mrowka and Jichang Wang and Miguel A. Andrade-Navarro and Xinlai Cheng},
  url       = {https://www.sciencedirect.com/science/article/pii/S2589004219300124},
  doi       = {10.1016/j.isci.2019.01.012},
  issn      = {2589-0042},
  year      = {2019},
  date      = {2019-01-01},
  urldate   = {2019-01-01},
  journal   = {iScience},
  volume    = {12},
  pages     = {168--181},
  abstract  = {Pioneering human induced pluripotent stem cell (iPSC)-based pre-clinical studies have raised safety concerns and pinpointed the need for safer and more efficient approaches to generate and maintain patient-specific iPSCs. One approach is searching for compounds that influence pluripotent stem cell reprogramming using functional screens of known drugs. Our high-throughput screening of drug-like hits showed that imidazopyridines\textemdash{}analogs of zolpidem, a sedative-hypnotic drug\textemdash{}are able to improve reprogramming efficiency and facilitate reprogramming of resistant human primary fibroblasts. The lead compound (O4I3) showed a remarkable OCT4 induction, which at least in part is due to the inhibition of H3K4 demethylase (KDM5, also known as JARID1). Experiments demonstrated that KDM5A, but not its homolog KDM5B, serves as a reprogramming barrier by interfering with the enrichment of H3K4Me3 at the OCT4 promoter. Thus our results introduce a new class of KDM5 chemical inhibitors and provide further insight into the pluripotency-related properties of KDM5 family members.},
  keywords  = {Biochemistry, Biological Sciences, Molecular Biology},
  pubstate  = {published},
  tppubtype = {article}
}
Pioneering human induced pluripotent stem cell (iPSC)-based pre-clinical studies have raised safety concerns and pinpointed the need for safer and more efficient approaches to generate and maintain patient-specific iPSCs. One approach is searching for compounds that influence pluripotent stem cell reprogramming using functional screens of known drugs. Our high-throughput screening of drug-like hits showed that imidazopyridines—analogs of zolpidem, a sedative-hypnotic drug—are able to improve reprogramming efficiency and facilitate reprogramming of resistant human primary fibroblasts. The lead compound (O4I3) showed a remarkable OCT4 induction, which at least in part is due to the inhibition of H3K4 demethylase (KDM5, also known as JARID1). Experiments demonstrated that KDM5A, but not its homolog KDM5B, serves as a reprogramming barrier by interfering with the enrichment of H3K4Me3 at the OCT4 promoter. Thus our results introduce a new class of KDM5 chemical inhibitors and provide further insight into the pluripotency-related properties of KDM5 family members.
Taškova, Katerina; Fontaine, Jean-Fred; Mrowka, Ralf; Andrade-Navarro, Miguel A.
Literature optimized integration of gene expression for organ-specific evaluation of toxicogenomics datasets Journal Article
In: PLOS ONE, vol. 14, no. 1, pp. 1-21, 2019.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: bioinformatics
@article{10.1371/journal.pone.0210467,
  title     = {Literature optimized integration of gene expression for organ-specific evaluation of toxicogenomics datasets},
  author    = {Katerina Ta{\v{s}}kova and Jean-Fred Fontaine and Ralf Mrowka and Miguel A. Andrade-Navarro},
  url       = {https://doi.org/10.1371/journal.pone.0210467},
  doi       = {10.1371/journal.pone.0210467},
  year      = {2019},
  date      = {2019-01-01},
  urldate   = {2019-01-01},
  journal   = {PLOS ONE},
  volume    = {14},
  number    = {1},
  pages     = {1--21},
  publisher = {Public Library of Science},
  abstract  = {The study of drug toxicity in human organs is complicated by their complex inter-relations and by the obvious difficulty to testing drug effects on biologically relevant material. Animal models and human cell cultures offer alternatives for systematic and large-scale profiling of drug effects on gene expression level, as typically found in the so-called toxicogenomics datasets. However, the complexity of these data, which includes variable drug doses, time points, and experimental setups, makes it difficult to choose and integrate the data, and to evaluate the appropriateness of one or another model system to study drug toxicity (of particular drugs) of particular human organs. Here, we define a protocol to integrate drug-wise rankings of gene expression changes in toxicogenomics data, which we apply to the TG-GATEs dataset, to prioritize genes for association to drug toxicity in liver or kidney. Contrast of the results with sets of known human genes associated to drug toxicity in the literature allows to compare different rank aggregation approaches for the task at hand. Collectively, ranks from multiple models point to genes not previously associated to toxicity, notably, the PCNA clamp associated factor (PCLAF), and genes regulated by the master regulator of the antioxidant response NFE2L2, such as NQO1 and SRXN1. In addition, comparing gene ranks from different models allowed us to evaluate striking differences in terms of toxicity-associated genes between human and rat hepatocytes or between rat liver and rat hepatocytes. We interpret these results to point to the different molecular functions associated to organ toxicity that are best described by each model. We conclude that the expected production of toxicogenomics panels with larger numbers of drugs and models, in combination with the ongoing increase of the experimental literature in organ toxicity, will lead to increasingly better associations of genes for organism toxicity.},
  keywords  = {bioinformatics},
  pubstate  = {published},
  tppubtype = {article}
}
2018
Stönner, Christof; Edtbauer, Achim; Derstorff, Bettina; Bourtsoukidis, Efstratios; Klüpfel, Thomas; Wicker, Jörg; Williams, Jonathan
Proof of concept study: Testing human volatile organic compounds as tools for age classification of films Journal Article
In: PLOS One, vol. 13, no. 10, pp. 1-14, 2018.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: atmospheric chemistry, breath analysis, cheminformatics, cinema data mining, data mining, emotional response analysis, machine learning, movie analysis, smell of fear, sof, time series
@article{Stonner2018,
  title     = {Proof of concept study: Testing human volatile organic compounds as tools for age classification of films},
  author    = {Christof St{\"o}nner and Achim Edtbauer and Bettina Derstorff and Efstratios Bourtsoukidis and Thomas Kl{\"u}pfel and J{\"o}rg Wicker and Jonathan Williams},
  doi       = {10.1371/journal.pone.0203044},
  year      = {2018},
  date      = {2018-10-11},
  journal   = {PLOS One},
  volume    = {13},
  number    = {10},
  pages     = {1--14},
  publisher = {Public Library of Science},
  abstract  = {Humans emit numerous volatile organic compounds (VOCs) through breath and skin. The nature and rate of these emissions are affected by various factors including emotional state. Previous measurements of VOCs and CO2 in a cinema have shown that certain chemicals are reproducibly emitted by audiences reacting to events in a particular film. Using data from films with various age classifications, we have studied the relationship between the emission of multiple VOCs and CO2 and the age classifier (0, 6, 12, and 16) with a view to developing a new chemically based and objective film classification method. We apply a random forest model built with time independent features extracted from the time series of every measured compound, and test predictive capability on subsets of all data. It was found that most compounds were not able to predict all age classifiers reliably, likely reflecting the fact that current classification is based on perceived sensibilities to many factors (e.g. incidences of violence, sex, antisocial behaviour, drug use, and bad language) rather than the visceral biological responses expressed in the data. However, promising results were found for isoprene which reliably predicted 0, 6 and 12 age classifiers for a variety of film genres and audience age groups. Therefore, isoprene emission per person might in future be a valuable aid to national classification boards, or even offer an alternative, objective, metric for rating films based on the reactions of large groups of people.},
  keywords  = {atmospheric chemistry, breath analysis, cheminformatics, cinema data mining, data mining, emotional response analysis, machine learning, movie analysis, smell of fear, sof, time series},
  pubstate  = {published},
  tppubtype = {article}
}
Stönner, Christof; Edtbauer, Achim; Derstorff, Bettina; Bourtsoukidis, Efstratios; Klüpfel, Thomas; Wicker, Jörg; Williams, Jonathan
Investigating human emissions of volatile organic compounds in a cinema, flux rates, links to scene content, and possible applications Proceedings Article
In: 15th Conference of the International Society of Indoor Air Quality and Climate, INDOOR AIR 2018, International Society of Indoor Air Quality and Climate, 2018, ISBN: 978-171382651-4.
Abstract | BibTeX | Tags: atmospheric chemistry, cheminformatics, cinema data mining, sof
@inproceedings{stonner2018investigating,
  title     = {Investigating human emissions of volatile organic compounds in a cinema, flux rates, links to scene content, and possible applications},
  author    = {Christof St{\"o}nner and Achim Edtbauer and Bettina Derstorff and Efstratios Bourtsoukidis and Thomas Kl{\"u}pfel and J{\"o}rg Wicker and Jonathan Williams},
  isbn      = {978-171382651-4},
  year      = {2018},
  date      = {2018-07-22},
  urldate   = {2018-07-22},
  booktitle = {15th Conference of the International Society of Indoor Air Quality and Climate, INDOOR AIR 2018},
  publisher = {International Society of Indoor Air Quality and Climate},
  abstract  = {Humans emit numerous volatile organic compounds (VOCs) into the air via skin and breath. These emissions can depend on various factors such as nutrition, sporting activity and also the emotional state. It is shown that the emission rates of the main endogenous breath gases like CO2, acetone and isoprene are generally lower for children than for adults. In contrast, VOCs from exogenous sources strongly vary over the course of day. Interestingly, small scale variances in emission rates were found to occur reproducibly over multiple screenings of the same film. The peaks occurring in the time series of a compound during the screening of the film were induced by the physiological response of the audience to audio-visual stimuli. Additionally, the question whether this chemical reaction of the audience can be used for the prediction of age classification of films is addressed.},
  keywords  = {atmospheric chemistry, cheminformatics, cinema data mining, sof},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
Theobald, Jannick; Ghanem, Ali; Wallisch, Patrick; Banaeiyan, Amin A.; Andrade-Navarro, Miguel A.; Taškova, Katerina; Haltmeier, Manuela; Kurtz, Andreas; Becker, Holger; Reuter, Stefanie; Mrowka, Ralf; Cheng, Xinlai; Wölfl, Stefan
Liver-Kidney-on-Chip To Study Toxicity of Drug Metabolites Journal Article
In: ACS Biomaterials Science & Engineering, vol. 4, no. 1, pp. 78-89, 2018, (PMID: 33418680).
Links | BibTeX | Altmetric | PlumX | Tags: bioinformatics
@article{Theobald2018Liver,
  title     = {Liver-Kidney-on-Chip To Study Toxicity of Drug Metabolites},
  author    = {Jannick Theobald and Ali Ghanem and Patrick Wallisch and Amin A. Banaeiyan and Miguel A. Andrade-Navarro and Katerina Ta{\v{s}}kova and Manuela Haltmeier and Andreas Kurtz and Holger Becker and Stefanie Reuter and Ralf Mrowka and Xinlai Cheng and Stefan W{\"o}lfl},
  url       = {https://doi.org/10.1021/acsbiomaterials.7b00417},
  doi       = {10.1021/acsbiomaterials.7b00417},
  year      = {2018},
  date      = {2018-01-01},
  urldate   = {2018-01-01},
  journal   = {ACS Biomaterials Science \& Engineering},
  volume    = {4},
  number    = {1},
  pages     = {78--89},
  note      = {PMID: 33418680},
  keywords  = {bioinformatics},
  pubstate  = {published},
  tppubtype = {article}
}
Mah, Nancy; Taskova, Katerina; Amrani, Khadija El; Hariharan, Krithika; Kurtz, Andreas; Andrade-Navarro, Miguel A.
Evaluating Cell Identity from Transcription Profiles Journal Article
In: bioRxiv, 2018.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: bioinformatics
@article{Mah250431,
  title     = {Evaluating Cell Identity from Transcription Profiles},
  author    = {Nancy Mah and Katerina Taskova and Khadija El Amrani and Krithika Hariharan and Andreas Kurtz and Miguel A. Andrade-Navarro},
  url       = {https://www.biorxiv.org/content/early/2018/01/19/250431},
  doi       = {10.1101/250431},
  year      = {2018},
  date      = {2018-01-01},
  urldate   = {2018-01-01},
  journal   = {bioRxiv},
  publisher = {Cold Spring Harbor Laboratory},
  abstract  = {Induced pluripotent stem cells (iPS) and direct lineage programming offer promising autologous and patient-specific sources of cells for personalized drug-testing and cell-based therapy. Before these engineered cells can be widely used, it is important to evaluate how well the engineered cell types resemble their intended target cell types. We have developed a method to generate CellScore, a cell identity score that can be used to evaluate the success of an engineered cell type in relation to both its initial and desired target cell type, which are used as references. Of 20 cell transitions tested, the most successful transitions were the iPS cells (CellScore $>$ 0.9), while other transitions (e.g. induced hepatocytes or motor neurons) indicated incomplete transitions (CellScore $<$ 0.5). In principle, the method can be applied to any engineered cell undergoing a cell transition, where transcription profiles are available for the reference cell types and the engineered cell type. Highlights: A curated standard dataset of transcription profiles from normal cell types was created. CellScore evaluates the cell identity of engineered cell types, using the curated dataset. CellScore considers the initial and desired target cell type. CellScore identifies the most successfully engineered clones for further functional testing.},
  keywords  = {bioinformatics},
  pubstate  = {published},
  tppubtype = {article}
}
Taškova, Katerina; Fontaine, Jean-Fred; Mrowka, Ralf; Andrade-Navarro, Miguel A.
Evaluation of in vivo and in vitro models of toxicity by comparison of toxicogenomics data with the literature Journal Article
In: Methods, vol. 132, pp. 57-65, 2018, ISSN: 1046-2023, (Comparison and Visualization Methods for High-Dimensional Biological Data).
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: Differential expression analysis, Functional enrichment analysis, Literature co-occurrence analysis, model systems, Toxicogenomics data integration
@article{TASKOVA201857,
  title     = {Evaluation of in vivo and in vitro models of toxicity by comparison of toxicogenomics data with the literature},
  author    = {Katerina Ta{\v{s}}kova and Jean-Fred Fontaine and Ralf Mrowka and Miguel A. Andrade-Navarro},
  url       = {https://www.sciencedirect.com/science/article/pii/S1046202317300543},
  doi       = {10.1016/j.ymeth.2017.07.010},
  issn      = {1046-2023},
  year      = {2018},
  date      = {2018-01-01},
  journal   = {Methods},
  volume    = {132},
  pages     = {57--65},
  abstract  = {Toxicity affecting humans is studied by observing the effects of chemical substances in animal organisms (in vivo) or in animal and human cultivated cell lines (in vitro). Toxicogenomics studies collect gene expression profiles and histopathology assessment data for hundreds of drugs and pollutants in standardized experimental designs using different model systems. These data are an invaluable source for analyzing genome-wide drug response in biological systems. However, a problem remains that is how to evaluate the suitability of heterogeneous in vitro and in vivo systems to model the many different aspects of human toxicity. We propose here that a given model system (cell type or animal organ) is supported to appropriately describe a particular aspect of human toxicity if the set of compounds associated in the literature with that aspect of toxicity causes a change in expression of genes with a particular function in the tested model system. This approach provides candidate genes to explain the toxicity effect (the differentially expressed genes) and the compounds whose effect could be modeled (the ones producing both the change of expression in the model system and that are associated with the human phenotype in the literature). Here we present an application of this approach using a computational pipeline that integrates compound-induced gene expression profiles (from the Open TG-GATEs database) and biomedical literature annotations (from the PubMed database) to evaluate the suitability of (human and rat) in vitro systems as well as rat in vivo systems to model human toxicity.},
  note      = {Comparison and Visualization Methods for High-Dimensional Biological Data},
  keywords  = {Differential expression analysis, Functional enrichment analysis, Literature co-occurrence analysis, model systems, Toxicogenomics data integration},
  pubstate  = {published},
  tppubtype = {article}
}
2017
Wicker, Jörg; Kramer, Stefan
The Best Privacy Defense is a Good Privacy Offense: Obfuscating a Search Engine User’s Profile Journal Article
In: Data Mining and Knowledge Discovery, vol. 31, no. 5, pp. 1419-1443, 2017, ISSN: 1573-756X.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: adversarial learning, machine learning, personalized ads, privacy, reinforcement learning, search engines
@article{wicker2017best,
  title     = {The Best Privacy Defense is a Good Privacy Offense: Obfuscating a Search Engine User's Profile},
  author    = {J{\"o}rg Wicker and Stefan Kramer},
  editor    = {Kurt Driessens and Dragi Kocev and Marko Robnik-{\v{S}}ikonja and Myra Spiliopoulou},
  url       = {http://rdcu.be/tL0U},
  doi       = {10.1007/s10618-017-0524-z},
  issn      = {1573-756X},
  year      = {2017},
  date      = {2017-09-01},
  journal   = {Data Mining and Knowledge Discovery},
  volume    = {31},
  number    = {5},
  pages     = {1419--1443},
  abstract  = {User privacy on the internet is an important and unsolved problem. So far, no sufficient and comprehensive solution has been proposed that helps a user to protect his or her privacy while using the internet. Data are collected and assembled by numerous service providers. Solutions so far focused on the side of the service providers to store encrypted or transformed data that can be still used for analysis. This has a major flaw, as it relies on the service providers to do this. The user has no chance of actively protecting his or her privacy. In this work, we suggest a new approach, empowering the user to take advantage of the same tool the other side has, namely data mining to produce data which obfuscates the user's profile. We apply this approach to search engine queries and use feedback of the search engines in terms of personalized advertisements in an algorithm similar to reinforcement learning to generate new queries potentially confusing the search engine. We evaluated the approach using a real-world data set. While evaluation is hard, we achieve results that indicate that it is possible to influence the user's profile that the search engine generates. This shows that it is feasible to defend a user's privacy from a new and more practical perspective.},
  keywords  = {adversarial learning, machine learning, personalized ads, privacy, reinforcement learning, search engines},
  pubstate  = {published},
  tppubtype = {article}
}
Latino, Diogo; Wicker, Jörg; Gütlein, Martin; Schmid, Emanuel; Kramer, Stefan; Fenner, Kathrin
Eawag-Soil in enviPath: a new resource for exploring regulatory pesticide soil biodegradation pathways and half-life data Journal Article
In: Environmental Science: Process & Impact, 2017.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: biodegradation, cheminformatics, computational sustainability, data mining, enviPath, multi-label classification, REST, web services
@article{latino2017eawag,
  title     = {Eawag-Soil in enviPath: a new resource for exploring regulatory pesticide soil biodegradation pathways and half-life data},
  author    = {Diogo Latino and J{\"o}rg Wicker and Martin G{\"u}tlein and Emanuel Schmid and Stefan Kramer and Kathrin Fenner},
  doi       = {10.1039/C6EM00697C},
  year      = {2017},
  date      = {2017-01-01},
  journal   = {Environmental Science: Process \& Impact},
  publisher = {The Royal Society of Chemistry},
  abstract  = {Developing models for the prediction of microbial biotransformation pathways and half-lives of trace organic contaminants in different environments requires as training data easily accessible and sufficiently large collections of respective biotransformation data that are annotated with metadata on study conditions. Here, we present the Eawag-Soil package, a public database that has been developed to contain all freely accessible regulatory data on pesticide degradation in laboratory soil simulation studies for pesticides registered in the EU (282 degradation pathways, 1535 reactions, 1619 compounds and 4716 biotransformation half-life values with corresponding metadata on study conditions). We provide a thorough description of this novel data resource, and discuss important features of the pesticide soil degradation data that are relevant for model development. Most notably, the variability of half-life values for individual compounds is large and only about one order of magnitude lower than the entire range of median half-life values spanned by all compounds, demonstrating the need to consider study conditions in the development of more accurate models for biotransformation prediction. We further show how the data can be used to find missing rules relevant for predicting soil biotransformation pathways. From this analysis, eight examples of reaction types were presented that should trigger the formulation of new biotransformation rules, e.g., Ar-OH methylation, or the extension of existing rules e.g., hydroxylation in aliphatic rings. The data were also used to exemplarily explore the dependence of half-lives of different amide pesticides on chemical class and experimental parameters. This analysis highlighted the value of considering initial transformation reactions for the development of meaningful quantitative-structure biotransformation relationships (QSBR), which is a novel opportunity offered by the simultaneous encoding of transformation reactions and corresponding half-lives in Eawag-Soil. Overall, Eawag-Soil provides an unprecedentedly rich collection of manually extracted and curated biotransformation data, which should be useful in a great variety of applications.},
  keywords  = {biodegradation, cheminformatics, computational sustainability, data mining, enviPath, multi-label classification, REST, web services},
  pubstate  = {published},
  tppubtype = {article}
}
for pesticides registered in the EU (282 degradation pathways, 1535 reactions, 1619 compounds and 4716 biotransformation half-life values with corresponding metadata on study conditions). We provide a thorough description of this novel data resource, and discuss important features of the pesticide soil degradation data that are relevant for model development. Most notably, the variability of half-life values for individual compounds is large and only about one order of magnitude lower than the entire range of median half-life values spanned by all compounds, demonstrating the need to consider study conditions in the development of more accurate models for biotransformation prediction. We further show how the data can be used to find missing rules relevant for predicting soil biotransformation pathways. From this analysis, eight examples of reaction types were presented that should trigger the formulation of new biotransformation rules, e.g., Ar-OH methylation, or the extension of existing rules e.g., hydroxylation in aliphatic rings. The data were also used to exemplarily explore the dependence of half-lives of different amide pesticides on chemical class and experimental parameters. This analysis highlighted the value of considering initial transformation reactions for the development of meaningful quantitative-structure biotransformation relationships (QSBR), which is a novel opportunity offered by the simultaneous encoding of transformation reactions and corresponding half-lives in Eawag-Soil. Overall, Eawag-Soil provides an unprecedentedly rich collection of manually extracted and curated biotransformation data, which should be useful in a great variety of applications.
2016
Wicker, Jörg; Fenner, Kathrin; Kramer, Stefan
A Hybrid Machine Learning and Knowledge Based Approach to Limit Combinatorial Explosion in Biodegradation Prediction Book Section
In: Lässig, Jörg; Kersting, Kristian; Morik, Katharina (Ed.): Computational Sustainability, pp. 75-97, Springer International Publishing, Cham, 2016, ISBN: 978-3-319-31858-5.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: biodegradation, cheminformatics, computational sustainability, enviPath, machine learning, metabolic pathways, multi-label classification
@incollection{wicker2016ahybrid,
  title     = {A Hybrid Machine Learning and Knowledge Based Approach to Limit Combinatorial Explosion in Biodegradation Prediction},
  author    = {J{\"o}rg Wicker and Kathrin Fenner and Stefan Kramer},
  editor    = {J{\"o}rg L{\"a}ssig and Kristian Kersting and Katharina Morik},
  url       = {http://dx.doi.org/10.1007/978-3-319-31858-5_5},
  doi       = {10.1007/978-3-319-31858-5_5},
  isbn      = {978-3-319-31858-5},
  year      = {2016},
  date      = {2016-04-21},
  booktitle = {Computational Sustainability},
  pages     = {75--97},
  publisher = {Springer International Publishing},
  address   = {Cham},
  abstract  = {One of the main tasks in chemical industry regarding the sustainability of a product is the prediction of its environmental fate, i.e., its degradation products and pathways. Current methods for the prediction of biodegradation products and pathways of organic environmental pollutants either do not take into account domain knowledge or do not provide probability estimates. In this chapter, we propose a hybrid knowledge-based and machine learning-based approach to overcome these limitations in the context of the University of Minnesota Pathway Prediction System (UM-PPS). The proposed solution performs relative reasoning in a machine learning framework, and obtains one probability estimate for each biotransformation rule of the system. Since the application of a rule then depends on a threshold for the probability estimate, the trade-off between recall (sensitivity) and precision (selectivity) can be addressed and leveraged in practice. Results from leave-one-out cross-validation show that a recall and precision of approximately 0.8 can be achieved for a subset of 13 transformation rules. The set of used rules is further extended using multi-label classification, where dependencies among the transformation rules are exploited to improve the predictions. While the results regarding recall and precision vary, the area under the ROC curve can be improved using multi-label classification. Therefore, it is possible to optimize precision without compromising recall. Recently, we integrated the presented approach into enviPath, a complete redesign and re-implementation of UM-PPS.},
  keywords  = {biodegradation, cheminformatics, computational sustainability, enviPath, machine learning, metabolic pathways, multi-label classification},
  pubstate  = {published},
  tppubtype = {incollection}
}
Wicker, Jörg; Tyukin, Andrey; Kramer, Stefan
A Nonlinear Label Compression and Transformation Method for Multi-Label Classification using Autoencoders Proceedings Article
In: Bailey, James; Khan, Latifur; Washio, Takashi; Dobbie, Gill; Huang, Zhexue Joshua; Wang, Ruili (Ed.): The 20th Pacific Asia Conference on Knowledge Discovery and Data Mining (PAKDD), pp. 328-340, Springer International Publishing, Switzerland, 2016, ISBN: 978-3-319-31753-3.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: autoencoders, label compression, machine learning, multi-label classification
@inproceedings{wicker2016nonlinear,
title = {A Nonlinear Label Compression and Transformation Method for Multi-Label Classification using Autoencoders},
author = {J{\"o}rg Wicker and Andrey Tyukin and Stefan Kramer},
editor = {James Bailey and Latifur Khan and Takashi Washio and Gill Dobbie and Zhexue Joshua Huang and Ruili Wang},
url = {http://dx.doi.org/10.1007/978-3-319-31753-3_27},
doi = {10.1007/978-3-319-31753-3_27},
isbn = {978-3-319-31753-3},
year = {2016},
date = {2016-04-16},
booktitle = {The 20th Pacific Asia Conference on Knowledge Discovery and Data Mining (PAKDD)},
volume = {9651},
pages = {328--340},
publisher = {Springer International Publishing},
address = {Switzerland},
series = {Lecture Notes in Computer Science},
abstract = {Multi-label classification targets the prediction of multiple interdependent and non-exclusive binary target variables. Transformation-based algorithms transform the data set such that regular single-label algorithms can be applied to the problem. A special type of transformation-based classifiers are label compression methods, that compress the labels and then mostly use single label classifiers to predict the compressed labels. So far, there are no compression-based algorithms follow a problem transformation approach and address non-linear dependencies in the labels. In this paper, we propose a new algorithm, called Maniac (Multi-lAbel classificatioN usIng AutoenCoders), which extracts the non-linear dependencies by compressing the labels using autoencoders. We adapt the training process of autoencoders in a way to make them more suitable for a parameter optimization in the context of this algorithm. The method is evaluated on eight standard multi-label data sets. Experiments show that despite not producing a good ranking, Maniac generates a particularly good bipartition of the labels into positives and negatives. This is caused by rather strong predictions with either really high or low probability. Additionally, the algorithm seems to perform better given more labels and a higher label cardinality in the data set.},
keywords = {autoencoders, label compression, machine learning, multi-label classification},
pubstate = {published},
tppubtype = {inproceedings}
}
Wicker, Jörg; Lorsbach, Tim; Gütlein, Martin; Schmid, Emanuel; Latino, Diogo; Kramer, Stefan; Fenner, Kathrin
enviPath – The Environmental Contaminant Biotransformation Pathway Resource Journal Article
In: Nucleic Acids Research, vol. 44, no. D1, pp. D502-D508, 2016.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: biodegradation, cheminformatics, computational sustainability, data mining, enviPath, linked data, machine learning, metabolic pathways, multi-label classification
@article{wicker2016envipath,
title = {enviPath - The Environmental Contaminant Biotransformation Pathway Resource},
author = {J{\"o}rg Wicker and Tim Lorsbach and Martin G{\"u}tlein and Emanuel Schmid and Diogo Latino and Stefan Kramer and Kathrin Fenner},
editor = {Michael Galperin},
url = {http://nar.oxfordjournals.org/content/44/D1/D502.abstract},
doi = {10.1093/nar/gkv1229},
year = {2016},
date = {2016-01-01},
journal = {Nucleic Acids Research},
volume = {44},
number = {D1},
pages = {D502--D508},
abstract = {The University of Minnesota Biocatalysis/Biodegradation Database and Pathway Prediction System (UM-BBD/PPS) has been a unique resource covering microbial biotransformation pathways of primarily xenobiotic chemicals for over 15 years. This paper introduces the successor system, enviPath (The Environmental Contaminant Biotransformation Pathway Resource), which is a complete redesign and reimplementation of UM-BBD/PPS. enviPath uses the database from the UM-BBD/PPS as a basis, extends the use of this database, and allows users to include their own data to support multiple use cases. Relative reasoning is supported for the refinement of predictions and to allow its extensions in terms of previously published, but not implemented machine learning models. User access is simplified by providing a REST API that simplifies the inclusion of enviPath into existing workflows. An RDF database is used to enable simple integration with other databases. enviPath is publicly available at https://envipath.org with free and open access to its core data.},
keywords = {biodegradation, cheminformatics, computational sustainability, data mining, enviPath, linked data, machine learning, metabolic pathways, multi-label classification},
pubstate = {published},
tppubtype = {article}
}
Raza, Atif; Wicker, Jörg; Kramer, Stefan
Trading Off Accuracy for Efficiency by Randomized Greedy Warping Proceedings Article
In: Proceedings of the 31st Annual ACM Symposium on Applied Computing, pp. 883-890, ACM, New York, NY, USA, 2016, ISBN: 978-1-4503-3739-7.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: data mining, dynamic time warping, time series
@inproceedings{raza2016trading,
title = {Trading Off Accuracy for Efficiency by Randomized Greedy Warping},
author = {Atif Raza and J{\"o}rg Wicker and Stefan Kramer},
url = {https://wicker.nz/nwp-acm/authorize.php?id=N10030
http://doi.acm.org/10.1145/2851613.2851651},
doi = {10.1145/2851613.2851651},
isbn = {978-1-4503-3739-7},
year = {2016},
date = {2016-01-01},
booktitle = {Proceedings of the 31st Annual ACM Symposium on Applied Computing},
pages = {883--890},
publisher = {ACM},
address = {New York, NY, USA},
series = {SAC '16},
abstract = {Dynamic Time Warping (DTW) is a widely used distance measure for time series data mining. Its quadratic complexity requires the application of various techniques (e.g. warping constraints, lower-bounds) for deployment in real-time scenarios. In this paper we propose a randomized greedy warping algorithm for finding similarity between time series instances. We show that the proposed algorithm outperforms the simple greedy approach and also provides very good time series similarity approximation consistently, as compared to DTW. We show that the Randomized Time Warping (RTW) can be used in place of DTW as a fast similarity approximation technique by trading some classification accuracy for very fast classification.},
keywords = {data mining, dynamic time warping, time series},
pubstate = {published},
tppubtype = {inproceedings}
}
Williams, Jonathan; Stönner, Christof; Wicker, Jörg; Krauter, Nicolas; Derstorff, Bettina; Bourtsoukidis, Efstratios; Klüpfel, Thomas; Kramer, Stefan
Cinema audiences reproducibly vary the chemical composition of air during films, by broadcasting scene specific emissions on breath Journal Article
In: Scientific Reports, vol. 6, 2016.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: atmospheric chemistry, causality, cheminformatics, data mining, emotional response analysis, smell of fear, sof, time series
@article{williams2015element,
title = {Cinema audiences reproducibly vary the chemical composition of air during films, by broadcasting scene specific emissions on breath},
author = {Jonathan Williams and Christof St{\"o}nner and J{\"o}rg Wicker and Nicolas Krauter and Bettina Derstorff and Efstratios Bourtsoukidis and Thomas Kl{\"u}pfel and Stefan Kramer},
url = {http://www.nature.com/articles/srep25464},
doi = {10.1038/srep25464},
year = {2016},
date = {2016-01-01},
urldate = {2016-01-01},
journal = {Scientific Reports},
volume = {6},
publisher = {Nature Publishing Group},
abstract = {Human beings continuously emit chemicals into the air by breath and through the skin. In order to determine whether these emissions vary predictably in response to audiovisual stimuli, we have continuously monitored carbon dioxide and over one hundred volatile organic compounds in a cinema. It was found that many airborne chemicals in cinema air varied distinctively and reproducibly with time for a particular film, even in different screenings to different audiences. Application of scene labels and advanced data mining methods revealed that specific film events, namely "suspense" or "comedy" caused audiences to change their emission of specific chemicals. These event-type synchronous, broadcasted human chemosignals open the possibility for objective and non-invasive assessment of a human group response to stimuli by continuous measurement of chemicals in air. Such methods can be applied to research fields such as psychology and biology, and be valuable to industries such as film making and advertising.},
keywords = {atmospheric chemistry, causality, cheminformatics, data mining, emotional response analysis, smell of fear, sof, time series},
pubstate = {published},
tppubtype = {article}
}
2015
Wicker, Jörg; Krauter, Nicolas; Derstorff, Bettina; Stönner, Christof; Bourtsoukidis, Efstratios; Klüpfel, Thomas; Williams, Jonathan; Kramer, Stefan
Cinema Data Mining: The Smell of Fear Proceedings Article
In: Proceedings of the 21st ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, pp. 1235-1304, ACM, New York, NY, USA, 2015, ISBN: 978-1-4503-3664-2.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: atmospheric chemistry, breath analysis, causality, cheminformatics, cinema data mining, data mining, emotional response analysis, movie analysis, smell of fear, sof, time series
@inproceedings{wicker2015cinema,
title = {Cinema Data Mining: The Smell of Fear},
author = {J{\"o}rg Wicker and Nicolas Krauter and Bettina Derstorff and Christof St{\"o}nner and Efstratios Bourtsoukidis and Thomas Kl{\"u}pfel and Jonathan Williams and Stefan Kramer},
url = {https://wicker.nz/nwp-acm/authorize.php?id=N10031
http://doi.acm.org/10.1145/2783258.2783404},
doi = {10.1145/2783258.2783404},
isbn = {978-1-4503-3664-2},
year = {2015},
date = {2015-01-01},
booktitle = {Proceedings of the 21st ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
pages = {1235--1304},
publisher = {ACM},
address = {New York, NY, USA},
organization = {ACM},
series = {KDD '15},
abstract = {While the physiological response of humans to emotional events or stimuli is well-investigated for many modalities (like EEG, skin resistance, ...), surprisingly little is known about the exhalation of so-called Volatile Organic Compounds (VOCs) at quite low concentrations in response to such stimuli. VOCs are molecules of relatively small mass that quickly evaporate or sublimate and can be detected in the air that surrounds us. The paper introduces a new field of application for data mining, where trace gas responses of people reacting on-line to films shown in cinemas (or movie theaters) are related to the semantic content of the films themselves. To do so, we measured the VOCs from a movie theatre over a whole month in intervals of thirty seconds, and annotated the screened films by a controlled vocabulary compiled from multiple sources. To gain a better understanding of the data and to reveal unknown relationships, we have built prediction models for so-called forward prediction (the prediction of future VOCs from the past), backward prediction (the prediction of past scene labels from future VOCs) and for some forms of abductive reasoning and Granger causality. Experimental results show that some VOCs and some labels can be predicted with relatively low error, and that hints for causality with low p-values can be detected in the data.},
keywords = {atmospheric chemistry, breath analysis, causality, cheminformatics, cinema data mining, data mining, emotional response analysis, movie analysis, smell of fear, sof, time series},
pubstate = {published},
tppubtype = {inproceedings}
}
Tyukin, Andrey; Kramer, Stefan; Wicker, Jörg
Scavenger – A Framework for the Efficient Evaluation of Dynamic and Modular Algorithms Proceedings Article
In: Bifet, Albert; May, Michael; Zadrozny, Bianca; Gavalda, Ricard; Pedreschi, Dino; Cardoso, Jaime; Spiliopoulou, Myra (Ed.): Machine Learning and Knowledge Discovery in Databases, pp. 325-328, Springer International Publishing, 2015, ISBN: 978-3-319-23460-1.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: autoencoders, distributed processing, framework, large-scale, Scavenger
@inproceedings{tyukin2015scavenger,
title = {Scavenger - A Framework for the Efficient Evaluation of Dynamic and Modular Algorithms},
author = {Andrey Tyukin and Stefan Kramer and J{\"o}rg Wicker},
editor = {Albert Bifet and Michael May and Bianca Zadrozny and Ricard Gavalda and Dino Pedreschi and Jaime Cardoso and Myra Spiliopoulou},
url = {http://dx.doi.org/10.1007/978-3-319-23461-8_40},
doi = {10.1007/978-3-319-23461-8_40},
isbn = {978-3-319-23460-1},
year = {2015},
date = {2015-01-01},
booktitle = {Machine Learning and Knowledge Discovery in Databases},
volume = {9286},
pages = {325--328},
publisher = {Springer International Publishing},
series = {Lecture Notes in Computer Science},
abstract = {Machine Learning methods and algorithms are often highly modular in the sense that they rely on a large number of subalgorithms that are in principle interchangeable. For example, it is often possible to use various kinds of pre- and post-processing and various base classifiers or regressors as components of the same modular approach. We propose a framework, called Scavenger, that allows evaluating whole families of conceptually similar algorithms efficiently. The algorithms are represented as compositions, couplings and products of atomic subalgorithms. This allows partial results to be cached and shared between different instances of a modular algorithm, so that potentially expensive partial results need not be recomputed multiple times. Furthermore, our framework deals with issues of the parallel execution, load balancing, and with the backup of partial results for the case of implementation or runtime errors. Scavenger is licensed under the GPLv3 and can be downloaded freely at https://github.com/jorro/scavenger.},
keywords = {autoencoders, distributed processing, framework, large-scale, Scavenger},
pubstate = {published},
tppubtype = {inproceedings}
}
Benedik, Blaž; Taškova, Katerina; Tavčar, Jože; Duhovnik, Jože
Prediction of vacuum cleaner motor brush life: a regression approach Journal Article
In: IET Electric Power Applications, vol. 9, no. 9, pp. 569-577, 2015.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: brushes, carbon brush wear modelling, domestic appliances, dominant wear mechanism, electric motors, energy saving, field theory, motor design reliability, multiple regression analysis, vacuum cleaner motor brush life prediction, wear
@article{https://doi.org/10.1049/iet-epa.2014.0437,
title = {Prediction of vacuum cleaner motor brush life: a regression approach},
author = {Bla{\v{z}} Benedik and Katerina Ta{\v{s}}kova and Jo{\v{z}}e Tav{\v{c}}ar and Jo{\v{z}}e Duhovnik},
url = {https://ietresearch.onlinelibrary.wiley.com/doi/abs/10.1049/iet-epa.2014.0437},
doi = {10.1049/iet-epa.2014.0437},
year = {2015},
date = {2015-01-01},
urldate = {2015-01-01},
journal = {IET Electric Power Applications},
volume = {9},
number = {9},
pages = {569--577},
abstract = {The main focus of this paper is the empirical modelling of the wear of carbon brushes. Rather than determining the dominant wear mechanisms, an approach towards the prediction of wear under a range of different conditions was used. The models were obtained by multiple regression analysis using lifetime (LT) data contributed by the biggest European manufacturer of vacuum cleaner motors. This included reliability data for 607 different test populations involving 3980 motors. Exploration of the data revealed that wear-out parameters behaved in accordance with the existing field theory, giving additional confidence to the models. The numerical appreciation of the wear-out parameters and the resulting conclusions will be beneficial to motor design and reliability engineers. Learned knowledge will be used for faster selection of optimal design and operational motor parameters to meet recent EU regulation 666/2013. Along with the more rapid design of the product, a reduced number of LT tests will result in significant energy savings.},
keywords = {brushes, carbon brush wear modelling, domestic appliances, dominant wear mechanism, electric motors, energy saving, field theory, motor design reliability, multiple regression analysis, vacuum cleaner motor brush life prediction, wear},
pubstate = {published},
tppubtype = {article}
}
Dietzen, Matthias; Kalinina, Olga V.; Taškova, Katerina; Kneissl, Benny; Hildebrandt, Anna-Katharina; Jaenicke, Elmar; Decker, Heinz; Lengauer, Thomas; Hildebrandt, Andreas
Large oligomeric complex structures can be computationally assembled by efficiently combining docked interfaces Journal Article
In: Proteins: Structure, Function, and Bioinformatics, vol. 83, no. 10, pp. 1887-1899, 2015.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: 3D-MOSAIC, complex match score, macromolecular assembly, protein–protein interactions, structural modeling, transformation match score
@article{https://doi.org/10.1002/prot.24873,
title = {Large oligomeric complex structures can be computationally assembled by efficiently combining docked interfaces},
author = {Matthias Dietzen and Olga V. Kalinina and Katerina Ta{\v{s}}kova and Benny Kneissl and Anna-Katharina Hildebrandt and Elmar Jaenicke and Heinz Decker and Thomas Lengauer and Andreas Hildebrandt},
url = {https://onlinelibrary.wiley.com/doi/abs/10.1002/prot.24873},
doi = {10.1002/prot.24873},
year = {2015},
date = {2015-01-01},
journal = {Proteins: Structure, Function, and Bioinformatics},
volume = {83},
number = {10},
pages = {1887--1899},
abstract = {ABSTRACT Macromolecular oligomeric assemblies are involved in many biochemical processes of living organisms. The benefits of such assemblies in crowded cellular environments include increased reaction rates, efficient feedback regulation, cooperativity and protective functions. However, an atom-level structural determination of large assemblies is challenging due to the size of the complex and the difference in binding affinities of the involved proteins. In this study, we propose a novel combinatorial greedy algorithm for assembling large oligomeric complexes from information on the approximate position of interaction interfaces of pairs of monomers in the complex. Prior information on complex symmetry is not required but rather the symmetry is inferred during assembly. We implement an efficient geometric score, the transformation match score, that bypasses the model ranking problems of state-of-the-art scoring functions by scoring the similarity between the inferred dimers of the same monomer simultaneously with different binding partners in a (sub)complex with a set of pregenerated docking poses. We compiled a diverse benchmark set of 308 homo and heteromeric complexes containing 6 to 60 monomers. To explore the applicability of the method, we considered 48 sets of parameters and selected those three sets of parameters, for which the algorithm can correctly reconstruct the maximum number, namely 252 complexes (81.8%) in, at least one of the respective three runs. The crossvalidation coverage, that is, the mean fraction of correctly reconstructed benchmark complexes during crossvalidation, was 78.1%, which demonstrates the ability of the presented method to correctly reconstruct topology of a large variety of biological complexes. Proteins 2015; 83:1887\textendash1899. © 2015 The Authors. Proteins: Structure, Function, and Bioinformatics Published by Wiley Periodicals, Inc.},
keywords = {3D-MOSAIC, complex match score, macromolecular assembly, protein\textendashprotein interactions, structural modeling, transformation match score},
pubstate = {published},
tppubtype = {article}
}
Šilc, Jurij; Taškova, Katerina; Korošec, Peter
Data mining-assisted parameter tuning of a search algorithm Journal Article
In: Informatica, vol. 39, no. 2, 2015.
Abstract | Links | BibTeX | Tags: data mining
@article{vsilc2015data,
title = {Data mining-assisted parameter tuning of a search algorithm},
author = {Jurij {\v{S}}ilc and Katerina Ta{\v{s}}kova and Peter Koro{\v{s}}ec},
url = {https://informatica.si/index.php/informatica/article/view/833},
year = {2015},
date = {2015-01-01},
urldate = {2015-01-01},
journal = {Informatica},
volume = {39},
number = {2},
abstract = {The main purpose of this paper is to show how using data-mining technique to tackle the problem of tuning the performance of a meta-heuristic search algorithm with respect to its parameters. The operational behavior of typical meta-heuristic search algorithms is determined by a set of control parameters, which have to be fine-tuned in order to obtain a best performance for a given problem. The principle challenge here is how to provide meaningful settings for an algorithm, obtained as result of better insight in its behavior. In this context, we discuss the idea of learning a model of an algorithm behavior by data mining analysis of parameter tuning results. The study was conducted using the Differential Ant-Stigmergy Algorithm as an example meta-heuristic search algorithm.},
keywords = {data mining},
pubstate = {published},
tppubtype = {article}
}
2014
Tyukin, Andrey; Kramer, Stefan; Wicker, Jörg
BMaD — A Boolean Matrix Decomposition Framework Proceedings Article
In: Calders, Toon; Esposito, Floriana; Hüllermeier, Eyke; Meo, Rosa (Ed.): Machine Learning and Knowledge Discovery in Databases, pp. 481-484, Springer Berlin Heidelberg, 2014, ISBN: 978-3-662-44844-1.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: Boolean matrix decomposition, data mining, framework
@inproceedings{tyukin2014bmad,
title = {BMaD -- A Boolean Matrix Decomposition Framework},
author = {Andrey Tyukin and Stefan Kramer and J{\"o}rg Wicker},
editor = {Toon Calders and Floriana Esposito and Eyke H{\"u}llermeier and Rosa Meo},
url = {http://dx.doi.org/10.1007/978-3-662-44845-8_40},
doi = {10.1007/978-3-662-44845-8_40},
isbn = {978-3-662-44844-1},
year = {2014},
date = {2014-01-01},
booktitle = {Machine Learning and Knowledge Discovery in Databases},
volume = {8726},
pages = {481--484},
publisher = {Springer Berlin Heidelberg},
series = {Lecture Notes in Computer Science},
abstract = {Boolean matrix decomposition is a method to obtain a compressed
representation of a matrix with Boolean entries. We present a modular
framework that unifies several Boolean matrix decomposition algorithms, and
provide methods to evaluate their performance. The main advantages of
the framework are its modular approach and hence the flexible
combination of the steps of a Boolean matrix decomposition and the
capability of handling missing values. The framework is licensed under
the GPLv3 and can be downloaded freely at
\url{http://projects.informatik.uni-mainz.de/bmad}.},
keywords = {Boolean matrix decomposition, data mining, framework},
pubstate = {published},
tppubtype = {inproceedings}
}
representation of a matrix with Boolean entries. We present a modular
framework that unifies several Boolean matrix decomposition algorithms, and
provide methods to evaluate their performance. The main advantages of
the framework are its modular approach and hence the flexible
combination of the steps of a Boolean matrix decomposition and the
capability of handling missing values. The framework is licensed under
the GPLv3 and can be downloaded freely at
http://projects.informatik.uni-mainz.de/bmad.
2013
Wicker, Jörg
Large Classifier Systems in Bio- and Cheminformatics PhD Thesis
Technische Universität München, 2013.
Abstract | Links | BibTeX | Tags: biodegradation, bioinformatics, cheminformatics, computational sustainability, data mining, enviPath, machine learning, multi-label classification, multi-relational learning, toxicity
@phdthesis{wicker2013large,
title = {Large Classifier Systems in Bio- and Cheminformatics},
author = {J{\"o}rg Wicker},
url = {http://mediatum.ub.tum.de/node?id=1165858},
year = {2013},
date = {2013-01-01},
school = {Technische Universit{\"a}t M{\"u}nchen},
abstract = {Large classifier systems are machine learning algorithms that use multiple
classifiers to improve the prediction of target values in advanced
classification tasks. Although learning problems in bio- and
cheminformatics commonly provide data in schemes suitable for large
classifier systems, they are rarely used in these domains. This thesis
introduces two new classifiers incorporating systems of classifiers
using Boolean matrix decomposition to handle data in a schema that
often occurs in bio- and cheminformatics.
The first approach, called MLC-BMaD (multi-label classification using
Boolean matrix decomposition), uses Boolean matrix decomposition to
decompose the labels in a multi-label classification task. The
decomposed matrices are a compact representation of the information
in the labels (first matrix) and the dependencies among the labels
(second matrix). The first matrix is used in a further multi-label
classification while the second matrix is used to generate the final
matrix from the predicted values of the first matrix.
MLC-BMaD was evaluated on six standard multi-label data sets, the
experiments showed that MLC-BMaD can perform particularly well on data
sets with a high number of labels and a small number of instances and
can outperform standard multi-label algorithms.
Subsequently, MLC-BMaD is extended to a special case of
multi-relational learning, by considering the labels not as simple
labels, but instances. The algorithm, called ClassFact
(Classification factorization), uses both matrices in a multi-label
classification. Each label represents a mapping between two
instances.
Experiments on three data sets from the domain of bioinformatics show
that ClassFact can outperform the baseline method, which merges the
relations into one, on hard classification tasks.
Furthermore, large classifier systems are used on two cheminformatics
data sets, the first one is used to predict the environmental fate of
chemicals by predicting biodegradation pathways. The second is a data
set from the domain of predictive toxicology. In biodegradation
pathway prediction, I extend a knowledge-based system and incorporate
a machine learning approach to predict a probability for
biotransformation products based on the structure- and knowledge-based
predictions of products, which are based on transformation rules. The
use of multi-label classification improves the performance of the
classifiers and extends the number of transformation rules that can be
covered.
For the prediction of toxic effects of chemicals, I applied large
classifier systems to the ToxCast\texttrademark{} data set, which maps
toxic effects to chemicals. As the given toxic effects are not easy to
predict due to missing information and a skewed class
distribution, I introduce a filtering step in the multi-label
classification, which finds labels that are usable in multi-label
prediction and does not take the others in the
prediction into account. Experiments show
that this approach can improve upon the baseline method using binary
classification, as well as multi-label approaches using no filtering.
The presented results show that large classifier systems can play a
role in future research challenges, especially in bio- and
cheminformatics, where data sets frequently consist of more complex
structures and data can be rather small in terms of the number of
instances compared to other domains.},
keywords = {biodegradation, bioinformatics, cheminformatics, computational sustainability, data mining, enviPath, machine learning, multi-label classification, multi-relational learning, toxicity},
pubstate = {published},
tppubtype = {phdthesis}
}
classifiers to improve the prediction of target values in advanced
classification tasks. Although learning problems in bio- and
cheminformatics commonly provide data in schemes suitable for large
classifier systems, they are rarely used in these domains. This thesis
introduces two new classifiers incorporating systems of classifiers
using Boolean matrix decomposition to handle data in a schema that
often occurs in bio- and cheminformatics.
The first approach, called MLC-BMaD (multi-label classification using
Boolean matrix decomposition), uses Boolean matrix decomposition to
decompose the labels in a multi-label classification task. The
decomposed matrices are a compact representation of the information
in the labels (first matrix) and the dependencies among the labels
(second matrix). The first matrix is used in a further multi-label
classification while the second matrix is used to generate the final
matrix from the predicted values of the first matrix.
MLC-BMaD was evaluated on six standard multi-label data sets, the
experiments showed that MLC-BMaD can perform particularly well on data
sets with a high number of labels and a small number of instances and
can outperform standard multi-label algorithms.
Subsequently, MLC-BMaD is extended to a special case of
multi-relational learning, by considering the labels not as simple
labels, but instances. The algorithm, called ClassFact
(Classification factorization), uses both matrices in a multi-label
classification. Each label represents a mapping between two
instances.
Experiments on three data sets from the domain of bioinformatics show
that ClassFact can outperform the baseline method, which merges the
relations into one, on hard classification tasks.
Furthermore, large classifier systems are used on two cheminformatics
data sets, the first one is used to predict the environmental fate of
chemicals by predicting biodegradation pathways. The second is a data
set from the domain of predictive toxicology. In biodegradation
pathway prediction, I extend a knowledge-based system and incorporate
a machine learning approach to predict a probability for
biotransformation products based on the structure- and knowledge-based
predictions of products, which are based on transformation rules. The
use of multi-label classification improves the performance of the
classifiers and extends the number of transformation rules that can be
covered.
For the prediction of toxic effects of chemicals, I applied large
classifier systems to the ToxCast™ data set, which maps
toxic effects to chemicals. As the given toxic effects are not easy to
predict due to missing information and a skewed class
distribution, I introduce a filtering step in the multi-label
classification, which finds labels that are usable in multi-label
prediction and does not take the others in the
prediction into account. Experiments show
that this approach can improve upon the baseline method using binary
classification, as well as multi-label approaches using no filtering.
The presented results show that large classifier systems can play a
role in future research challenges, especially in bio- and
cheminformatics, where data sets frequently consist of more complex
structures and data can be rather small in terms of the number of
instances compared to other domains.
2012
Wicker, Jörg; Pfahringer, Bernhard; Kramer, Stefan
Multi-label Classification Using Boolean Matrix Decomposition Proceedings Article
In: Proceedings of the 27th Annual ACM Symposium on Applied Computing, pp. 179–186, ACM, 2012, ISBN: 978-1-4503-0857-1.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: associations, Boolean matrix decomposition, machine learning, multi-label classification
@inproceedings{wicker2012multi,
title = {Multi-label Classification Using Boolean Matrix Decomposition},
author = {J{\"o}rg Wicker and Bernhard Pfahringer and Stefan Kramer},
url = {https://wicker.nz/nwp-acm/authorize.php?id=N10032
http://doi.acm.org/10.1145/2245276.2245311},
doi = {10.1145/2245276.2245311},
isbn = {978-1-4503-0857-1},
year = {2012},
date = {2012-01-01},
booktitle = {Proceedings of the 27th Annual ACM Symposium on Applied Computing},
pages = {179--186},
publisher = {ACM},
series = {SAC '12},
abstract = {This paper introduces a new multi-label classifier based on Boolean matrix decomposition. Boolean matrix decomposition is used to extract, from the full label matrix, latent labels representing useful Boolean combinations of the original labels. Base level models predict latent labels, which are subsequently transformed into the actual labels by Boolean matrix multiplication with the second matrix from the decomposition. The new method is tested on six publicly available datasets with varying numbers of labels. The experimental evaluation shows that the new method works particularly well on datasets with a large number of labels and strong dependencies among them.},
keywords = {associations, Boolean matrix decomposition, machine learning, multi-label classification},
pubstate = {published},
tppubtype = {inproceedings}
}
Čerepnalkoski, Darko; Taškova, Katerina; Todorovski, Ljupčo; Atanasova, Nataša; Džeroski, Sašo
The influence of parameter fitting methods on model structure selection in automated modeling of aquatic ecosystems Journal Article
In: Ecological Modelling, vol. 245, pp. 136-165, 2012, ISSN: 0304-3800, (7th European Conference on Ecological Modelling (ECEM)).
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: Aquatic ecosystems, Dynamical systems, Equation discovery, Meta-heuristic optimization, Parameter estimation, Process-based modeling
@article{CEREPNALKOSKI2012136,
title = {The influence of parameter fitting methods on model structure selection in automated modeling of aquatic ecosystems},
author = {Darko \v{C}erepnalkoski and Katerina Ta\v{s}kova and Ljup\v{c}o Todorovski and Nata\v{s}a Atanasova and Sa\v{s}o D\v{z}eroski},
url = {https://www.sciencedirect.com/science/article/pii/S0304380012002724},
doi = {10.1016/j.ecolmodel.2012.06.001},
issn = {0304-3800},
year = {2012},
date = {2012-01-01},
journal = {Ecological Modelling},
volume = {245},
pages = {136--165},
abstract = {Modeling dynamical systems involves two subtasks: structure identification and parameter estimation. ProBMoT is a tool for automated modeling of dynamical systems that addresses both tasks simultaneously. It takes into account domain knowledge formalized as templates for components of the process-based models: entities and processes. Taking a conceptual model of the system, the library of domain knowledge, and measurements of a particular dynamical system, it identifies both the structure and numerical parameters of the appropriate process-based model. ProBMoT has two main components corresponding to the two subtasks of modeling. The first component is concerned with generating candidate model structures that adhere to the conceptual model specified as input. The second subsystem uses the measured data to find suitable values for the constant parameters of a given model by using parameter estimation methods. ProBMoT uses model error to rank model structures and select the one that fits measured data best. In this paper, we investigate the influence of the selection of the parameter estimation methods on the structure identification. We consider one local (derivative-based) and one global (meta-heuristic) parameter estimation method. As opposed to other comparative studies of parameter estimation methods that focus on identifying parameters of a single model structure, we compare the parameter estimation methods in the context of repetitive parameter estimation for a number of candidate model structures. The results confirm the superiority of the global optimization methods over the local ones in the context of structure identification.},
note = {7th European Conference on Ecological Modelling (ECEM)},
keywords = {Aquatic ecosystems, Dynamical systems, Equation discovery, Meta-heuristic optimization, Parameter estimation, Process-based modeling},
pubstate = {published},
tppubtype = {article}
}
Taskova, Katerina; Šilc, Jurij; Atanasova, Nataša; Džeroski, Sašo
Parameter estimation in a nonlinear dynamic model of an aquatic ecosystem with meta-heuristic optimization Journal Article
In: Ecological Modelling, vol. 226, pp. 36-61, 2012, ISSN: 0304-3800.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: Aquatic ecosystems, Least-squares estimation, Meta-heuristic optimization, Ordinary differential equations, Parameter estimation
@article{TASHKOVA201236,
title = {Parameter estimation in a nonlinear dynamic model of an aquatic ecosystem with meta-heuristic optimization},
author = {Katerina Taskova and Jurij \v{S}ilc and Nata\v{s}a Atanasova and Sa\v{s}o D\v{z}eroski},
url = {https://www.sciencedirect.com/science/article/pii/S0304380011005795},
doi = {10.1016/j.ecolmodel.2011.11.029},
issn = {0304-3800},
year = {2012},
date = {2012-01-01},
urldate = {2012-01-01},
journal = {Ecological Modelling},
volume = {226},
pages = {36--61},
abstract = {Parameter estimation in dynamic models of ecosystems is essentially an optimization task. Due to the characteristics of ecosystems and typical models thereof, such as non-linearity, high dimensionality, and low quantity and quality of observed data, this optimization task can be very hard for traditional (derivative-based or local) optimization methods. This calls for the use of advanced meta-heuristic approaches, such as evolutionary or swarm-based methods. In this paper, we conduct an empirical comparison of four meta-heuristic optimization methods, and one local optimization method as a baseline, on a representative task of parameter estimation in a nonlinear dynamic model of an aquatic ecosystem. The five methods compared are the differential ant-stigmergy algorithm (DASA) and its continuous variant (CDASA), particle swarm optimization (PSO), differential evolution (DE) and algorithm 717 (A717). We use synthetic data, both without and with different levels of noise, as well as real measurements from Lake Bled. We also consider two different simulation approaches: teacher forcing, which makes supervised predictions one (small) time step ahead, and full (multistep) simulation, which makes predictions based on the history predictions for longer time periods. The meta-heuristic global optimization methods for parameter estimation are clearly superior and should be preferred over local optimization methods. While the differences in performance between the different methods within the class of meta-heuristics are not significant across all conditions, differential evolution yields the best results in terms of quality of the reconstructed system dynamics as well as speed of convergence. While the use of teacher forcing simulation makes parameter estimation much faster, the use of full simulation produces much better parameter estimates from real measured data.},
keywords = {Aquatic ecosystems, Least-squares estimation, Meta-heuristic optimization, Ordinary differential equations, Parameter estimation},
pubstate = {published},
tppubtype = {article}
}
Taškova, Katerina
Parameter Identification in Nonlinear Dynamic Systems with Meta-heuristic Approaches PhD Thesis
2012.
@phdthesis{tavskova2012parameter,
title = {Parameter Identification in Nonlinear Dynamic Systems with Meta-heuristic Approaches},
author = {Katerina Ta\v{s}kova},
school = {Jo\v{z}ef Stefan International Postgraduate School},
address = {Ljubljana, Slovenia},
year = {2012},
date = {2012-01-01},
urldate = {2012-01-01},
abstract = {The task of mathematical modeling of dynamic systems from observed system behavior, widely known under the name of system identification, breaks down into two subtasks. The first task, referred to as structure identification, is to specify the model structure, i.e., the functional form of the model. In practice, the model structure is usually given by a human domain expert and reflects prior domain knowledge: this is called knowledge-driven identification (as opposed to data-driven identification, which is based only on data). Structure identification plays an important role in modeling as it defines the choice available for the selection of the ``best model''.
The second task, referred to as parameter identification, aims to estimate the values of the model parameters that define a best possible fit of the model to the measured data. It assumes that the model structure is known and the observed system behavior is given in the form of measured data. Accurate estimation of the model parameters is important for describing and analyzing the behavior of the modeled system. Parameter identification is therefore a crucial step in almost all approaches for reconstructing system dynamics from measured data, including knowledge-driven and data-driven system identification as well as traditional (human) and automated modeling, i.e., the automated discovery of appropriate model structures and model parameter values by equation discovery tools.
In this dissertation, we address the task of parameter identification in dynamic models of real-life systems. The models are represented by ordinary differential equations (ODEs), as considered in the fields of systems biology and ecological modeling. The task is approached as a least-squares estimation problem within the frequentist framework. The latter means that the model parameters have fixed unique values and their optimal values are the ones that minimize a quadratic cost function, i.e., the sum of squared errors between the model prediction and the experimentally measured data. Least-squares estimation is essentially an optimization task. However, it can turn into a difficult problem for traditional (gradient-based) optimization methods when modeling complex system dynamics. Therefore, it should be addressed by advanced meta-heuristic approaches, such as evolutionary or swarm intelligence methods.
Typically, biological and ecosystem models are nonlinear and have many parameters, the studied systems can often be only partially observed, and their measurements are sparse and imperfect due to noise. All of these constraints can lead to identifiability problems, i.e., the inability to uniquely identify the unknown model parameters, making parameter estimation an even harder optimization task. Furthermore, the implicit definition of the cost function requires expensive numerical ODE simulations that have to be performed for every parameter solution investigated during the optimization process. As a result, parameter identification is a challenging and computationally expensive step in the process of reconstructing the structure and behavior of biological and ecological systems.
This dissertation attempts to improve the quality of reconstructed system dynamics by improving parameter identification. In this context, we perform a thorough empirical evaluation of representative meta-heuristic methods on the task of estimating parameters in two nonlinear ODE models. The considered models describe two practically relevant and representative real-life systems, i.e., endosome maturation in endocytosis and a food web of Lake Bled. The compared meta-heuristic methods are the differential ant-stigmergy algorithm, the continuous differential ant-stigmergy algorithm, particle swarm optimization, and differential evolution. As a baseline method for the experimental comparison, we use Algorithm 717, a gradient-based local search method essentially designed for nonlinear least-squares estimation. Different experimental scenarios are considered to investigate the effect of limited observability of the system dynamics, the influence of the ODE simulation method, and the impact of the noise in the data, on the complexity of the parameter identification task, as well as the applicability and performance of different optimization methods in this context.
The empirical evaluation shows that the meta-heuristic global optimization methods for parameter identification are clearly superior and should be preferred over local optimization methods. While the differences in performance between the different methods within the class of meta-heuristics are not significant across all conditions, differential evolution yields the best results in terms of the quality of the reconstructed system dynamics as well as the speed of convergence. The observability of the system shows a strong influence, where less complete observations make the optimization task much more difficult. The results clearly indicate the importance of choosing a relevant cost function when the modeled systems dynamics is only partially observed. While the use of a simple one-step trapezoidal-based integrator for supervised prediction makes parameter identification much faster, the use of a multistep variable-coefficient integrator for unsupervised prediction produces much better parameter estimates from real-measured data.
Furthermore, we consider the problem of parameter identification within the process of automated modeling of dynamic systems, where a large number of model structures is considered. One major drawback of existing automated modeling approaches is the use of local search methods for parameter identification. In this context, we investigate the influence of parameter identification (in terms of a global and a local optimization method) on the outcome of the automated modeling process, i.e., on what models are selected. We consider eight tasks of automated modeling of phytoplankton dynamics in Lake Bled from single-year data measured in eight different years. The outcome of the experiments empirically demonstrate the benefit of estimating model parameters by global optimization methods for the model (structure) selection process, opening the opportunity to model long term system dynamics.
Many challenges still remain concerning the use of optimization methods for parameter identification in dynamic systems, especially in the context of automated modeling by equation discovery methods. Besides the need to extend our study by including additional dynamic systems from different domains, several lines for further improvement of existing automated modeling methods can be followed. These include the use of more appropriate and informative cost functions, as well as more robust and faster methods for parameter identification. Finally, explicit integration of the feedback from identifiability analysis within the process of model selection is highly desirable.},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
widely known under the name of system identification, breaks down into two subtasks.
The first task, referred to as structure identification, is to specify the model structure,
i.e., the functional form of the model. In practice, the model structure is usually given by
a human domain expert and reflects prior domain knowledge: this is called knowledge-
driven identification (as opposed to data-driven identification, which is based only on
data). Structure identification plays an important role in modeling as it defines the
choice available for the selection of the “best model”.
The second task, referred to as parameter identification, aims to estimate the values of
the model parameters that define a best possible fit of the model to the measured data. It
assumes that the model structure is known and the observed system behavior is given in
the form of measured data. Accurate estimation of the model parameters is important for
describing and analyzing the behavior of the modeled system. Parameter identification
is therefore a crucial step in almost all approaches for reconstructing system dynamics
from measured data, including knowledge-driven and data-driven system identification as
well as traditional (human) and automated modeling, i.e., the automated discovery of
appropriate model structures and model parameter values by equation discovery tools.
In this dissertation, we address the task of parameter identification in dynamic mod-
els of real-life systems. The models are represented by ordinary differential equations
(ODEs), as considered in the fields of systems biology and ecological modeling. The task
is approached as a least-squares estimation problem within the frequentist framework.
The latter means that the model parameters have fixed unique values and their optimal
values are the ones that minimize a quadratic cost function, i.e., the sum of squared errors
between the model prediction and the experimentally measured data. Least-squares esti-
mation is essentially an optimization task. However, it can turn into a difficult problem
for traditional (gradient-based) optimization methods when modeling complex system dy-
namics. Therefore, it should be addressed by advanced meta-heuristic approaches, such
as evolutionary or swarm intelligence methods.
Typically, biological and ecosystem models are nonlinear and have many parameters,
the studied systems can often be only partially observed, and their measurements are
sparse and imperfect due to noise. All of these constraints can lead to identifiability
problems, i.e., the inability to uniquely identify the unknown model parameters, making
parameter estimation an even harder optimization task. Furthermore, the implicit def-
inition of the cost function requires expensive numerical ODE simulations that have to
be performed for every parameter solution investigated during the optimization process.
As a result, parameter identification is a challenging and computationally expensive step
in the process of reconstructing the structure and behavior of biological and ecological
systems.
This dissertation attempts to improve the quality of reconstructed system dynamics
by improving parameter identification. In this context, we perform a thorough empirical
evaluation of representative meta-heuristic methods on the task of estimating parameters
in two nonlinear ODE models. The considered models describe two practically rele-
x Abstract
vant and representative real-life systems, i.e., endosome maturation in endocytosis and a
food web of Lake Bled. The compared meta-heuristic methods are the differential ant-
stigmergy algorithm, the continuous differential ant-stigmergy algorithm, particle swarm
optimization, and differential evolution. As a baseline method for the experimental com-
parison, we use Algorithm 717, a gradient-based local search method essentially designed
for nonlinear least-squares estimation. Different experimental scenarios are considered to
investigate the effect of limited observability of the system dynamics, the influence of the
ODE simulation method, and the impact of the noise in the data, on the complexity of
the parameter identification task, as well as the applicability and performance of different
optimization methods in this context.
The empirical evaluation shows that the meta-heuristic global optimization methods
for parameter identification are clearly superior and should be preferred over local opti-
mization methods. While the differences in performance between the different methods
within the class of meta-heuristics are not significant across all conditions, differential
evolution yields the best results in terms of the quality of the reconstructed system dy-
namics as well as the speed of convergence. The observability of the system shows a
strong influence, where less complete observations make the optimization task much more
difficult. The results clearly indicate the importance of choosing a relevant cost function
when the modeled systems dynamics is only partially observed. While the use of a simple
one-step trapezoidal-based integrator for supervised prediction makes parameter identifi-
cation much faster, the use of a multistep variable-coefficient integrator for unsupervised
prediction produces much better parameter estimates from real-measured data.
Furthermore, we consider the problem of parameter identification within the process
of automated modeling of dynamic systems, where a large number of model structures
is considered. One major drawback of existing automated modeling approaches is the
use of local search methods for parameter identification. In this context, we investigate
the influence of parameter identification (in terms of a global and a local optimization
method) on the outcome of the automated modeling process, i.e., on what models are
selected. We consider eight tasks of automated modeling of phytoplankton dynamics in
Lake Bled from single-year data measured in eight different years. The outcome of the
experiments empirically demonstrate the benefit of estimating model parameters by global
optimization methods for the model (structure) selection process, opening the opportunity
to model long term system dynamics.
Many challenges still remain concerning the use of optimization methods for parameter
identification in dynamic systems, especially in the context of automated modeling by
equation discovery methods. Besides the need to extend our study by including additional
dynamic systems from different domains, several lines for further improvement of existing
automated modeling methods can be followed. These include the use of more appropriate
and informative cost functions, as well as more robust and faster methods for parameter
identification. Finally, explicit integration of the feedback from identifiability analysis
within the process of model selection is highly desirable.
2011
Taskova, Katerina; Korošec, Peter; Šilc, Jurij; Džeroski, Sašo
Parameter estimation with bio-inspired meta-heuristic optimization: modeling the dynamics of endocytosis Journal Article
In: BMC Systems Biology, vol. 5, iss. 1, pp. 159, 2011.
Links | BibTeX | Altmetric | PlumX | Tags: machine learning, Parameter estimation
@article{Taskova2011Parameter,
title = {Parameter estimation with bio-inspired meta-heuristic optimization: modeling the dynamics of endocytosis},
author = {Katerina Taskova and Peter Koro\v{s}ec and Jurij \v{S}ilc and Sa\v{s}o D\v{z}eroski},
doi = {10.1186/1752-0509-5-159},
year = {2011},
date = {2011-10-11},
journal = {BMC Systems Biology},
volume = {5},
issue = {1},
pages = {159},
keywords = {machine learning, Parameter estimation},
pubstate = {published},
tppubtype = {article}
}
Taskova, Katerina; Korošec, Peter; Šilc, Jurij
A distributed multilevel ant-colony algorithm for the multi-way graph partitioning Journal Article
In: International Journal of Bio-Inspired Computation, vol. 3, no. 5, pp. 286-296, 2011.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags:
@article{Tashkova2011distributed,
title = {A distributed multilevel ant-colony algorithm for the multi-way graph partitioning},
author = {Katerina Taskova and Peter Koro\v{s}ec and Jurij \v{S}ilc},
url = {https://www.inderscienceonline.com/doi/abs/10.1504/IJBIC.2011.042257},
doi = {10.1504/IJBIC.2011.042257},
year = {2011},
date = {2011-01-01},
urldate = {2011-01-01},
journal = {International Journal of Bio-Inspired Computation},
volume = {3},
number = {5},
pages = {286--296},
abstract = {The graph-partitioning problem arises as a fundamental problem in many important scientific and engineering applications. A variety of optimisation methods are used for solving this problem and among them the meta-heuristics outstand for its efficiency and robustness. Here, we address the performance of the distributed multilevel ant-colony algorithm (DMACA), a meta-heuristic approach for solving the multi-way graph partitioning problem, which is based on the ant-colony optimisation paradigm and is integrated with a multilevel procedure. The basic idea of the DMACA consists of parallel, independent runs enhanced with cooperation in the form of a solution exchange among the concurrent searches. The objective of the DMACA is to reduce the overall computation time, while preserving the quality of the solutions obtained by the sequential version. The experimental evaluation on a two-way and four-way partitioning with 1% and 5% imbalance confirms that with respect to the sequential version, the DMACA obtains statistically, equally good solutions at a 99% confidence level within a reduced overall computation time.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
2010
Hardy, Barry; Douglas, Nicki; Helma, Christoph; Rautenberg, Micha; Jeliazkova, Nina; Jeliazkov, Vedrin; Nikolova, Ivelina; Benigni, Romualdo; Tcheremenskaia, Olga; Kramer, Stefan; Girschick, Tobias; Buchwald, Fabian; Wicker, Jörg; Karwath, Andreas; Gütlein, Martin; Maunz, Andreas; Sarimveis, Haralambos; Melagraki, Georgia; Afantitis, Antreas; Sopasakis, Pantelis; Gallagher, David; Poroikov, Vladimir; Filimonov, Dmitry; Zakharov, Alexey; Lagunin, Alexey; Gloriozova, Tatyana; Novikov, Sergey; Skvortsova, Natalia; Druzhilovsky, Dmitry; Chawla, Sunil; Ghosh, Indira; Ray, Surajit; Patel, Hitesh; Escher, Sylvia
Collaborative development of predictive toxicology applications Journal Article
In: Journal of Cheminformatics, vol. 2, no. 1, pp. 7, 2010, ISSN: 1758-2946.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: cheminformatics, computational sustainability, data mining, machine learning, REST, toxicity
@article{hardy2010collaborative,
title = {Collaborative development of predictive toxicology applications},
author = {Barry Hardy and Nicki Douglas and Christoph Helma and Micha Rautenberg and Nina Jeliazkova and Vedrin Jeliazkov and Ivelina Nikolova and Romualdo Benigni and Olga Tcheremenskaia and Stefan Kramer and Tobias Girschick and Fabian Buchwald and J\"{o}rg Wicker and Andreas Karwath and Martin G\"{u}tlein and Andreas Maunz and Haralambos Sarimveis and Georgia Melagraki and Antreas Afantitis and Pantelis Sopasakis and David Gallagher and Vladimir Poroikov and Dmitry Filimonov and Alexey Zakharov and Alexey Lagunin and Tatyana Gloriozova and Sergey Novikov and Natalia Skvortsova and Dmitry Druzhilovsky and Sunil Chawla and Indira Ghosh and Surajit Ray and Hitesh Patel and Sylvia Escher},
url = {http://www.jcheminf.com/content/2/1/7},
doi = {10.1186/1758-2946-2-7},
issn = {1758-2946},
year = {2010},
date = {2010-01-01},
journal = {Journal of Cheminformatics},
volume = {2},
number = {1},
pages = {7},
abstract = {OpenTox provides an interoperable, standards-based Framework for the support of predictive toxicology data management, algorithms, modelling, validation and reporting. It is relevant to satisfying the chemical safety assessment requirements of the REACH legislation as it supports access to experimental data, (Quantitative) Structure-Activity Relationship models, and toxicological information through an integrating platform that adheres to regulatory requirements and OECD validation principles. Initial research defined the essential components of the Framework including the approach to data access, schema and management, use of controlled vocabularies and ontologies, architecture, web service and communications protocols, and selection and integration of algorithms for predictive modelling. OpenTox provides end-user oriented tools to non-computational specialists, risk assessors, and toxicological experts in addition to Application Programming Interfaces (APIs) for developers of new applications. OpenTox actively supports public standards for data representation, interfaces, vocabularies and ontologies, Open Source approaches to core platform components, and community-based collaboration approaches, so as to progress system interoperability goals.The OpenTox Framework includes APIs and services for compounds, datasets, features, algorithms, models, ontologies, tasks, validation, and reporting which may be combined into multiple applications satisfying a variety of different user needs. OpenTox applications are based on a set of distributed, interoperable OpenTox API-compliant REST web services. The OpenTox approach to ontology allows for efficient mapping of complementary data coming from different datasets into a unifying structure having a shared terminology and representation.Two initial OpenTox applications are presented as an illustration of the potential impact of OpenTox for high-quality and consistent structure-activity relationship modelling of REACH-relevant endpoints: ToxPredict which predicts and reports on toxicities for endpoints for an input chemical structure, and ToxCreate which builds and validates a predictive toxicity model based on an input toxicology dataset. Because of the extensible nature of the standardised Framework design, barriers of interoperability between applications and content are removed, as the user may combine data, models and validation from multiple sources in a dependable and time-effective way.},
keywords = {cheminformatics, computational sustainability, data mining, machine learning, REST, toxicity},
pubstate = {published},
tppubtype = {article}
}
Wicker, Jörg; Fenner, Kathrin; Ellis, Lynda; Wackett, Larry; Kramer, Stefan
Predicting biodegradation products and pathways: a hybrid knowledge- and machine learning-based approach Journal Article
In: Bioinformatics, vol. 26, no. 6, pp. 814-821, 2010.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: biodegradation, cheminformatics, computational sustainability, enviPath, machine learning, metabolic pathways
@article{wicker2010predicting,
title = {Predicting biodegradation products and pathways: a hybrid knowledge- and machine learning-based approach},
author = {J\"{o}rg Wicker and Kathrin Fenner and Lynda Ellis and Larry Wackett and Stefan Kramer},
url = {http://bioinformatics.oxfordjournals.org/content/26/6/814.full},
doi = {10.1093/bioinformatics/btq024},
year = {2010},
date = {2010-01-01},
journal = {Bioinformatics},
volume = {26},
number = {6},
pages = {814--821},
publisher = {Oxford University Press},
abstract = {Motivation: Current methods for the prediction of biodegradation products and pathways of organic environmental pollutants either do not take into account domain knowledge or do not provide probability estimates. In this article, we propose a hybrid knowledge- and machine learning-based approach to overcome these limitations in the context of the University of Minnesota Pathway Prediction System (UM-PPS). The proposed solution performs relative reasoning in a machine learning framework, and obtains one probability estimate for each biotransformation rule of the system. As the application of a rule then depends on a threshold for the probability estimate, the trade-off between recall (sensitivity) and precision (selectivity) can be addressed and leveraged in practice. Results: Results from leave-one-out cross-validation show that a recall and precision of $\sim$0.8 can be achieved for a subset of 13 transformation rules. Therefore, it is possible to optimize precision without compromising recall. We are currently integrating the results into an experimental version of the UM-PPS server. Availability: The program is freely available on the web at http://wwwkramer.in.tum.de/research/applications/biodegradation/data. Contact: kramer@in.tum.de},
keywords = {biodegradation, cheminformatics, computational sustainability, enviPath, machine learning, metabolic pathways},
pubstate = {published},
tppubtype = {article}
}
Wicker, Jörg; Richter, Lothar; Kramer, Stefan
SINDBAD and SiQL: Overview, Applications and Future Developments Book Section
In: Džeroski, Sašo; Goethals, Bart; Panov, Panče (Ed.): Inductive Databases and Constraint-Based Data Mining, pp. 289-309, Springer New York, 2010, ISBN: 978-1-4419-7737-3.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: data mining, inductive databases, machine learning, query languages
@incollection{wicker2010sindbad,
title = {SINDBAD and SiQL: Overview, Applications and Future Developments},
author = {J\"{o}rg Wicker and Lothar Richter and Stefan Kramer},
editor = {Sa\v{s}o D\v{z}eroski and Bart Goethals and Pan\v{c}e Panov},
doi = {10.1007/978-1-4419-7738-0_12},
isbn = {978-1-4419-7737-3},
year = {2010},
date = {2010-01-01},
booktitle = {Inductive Databases and Constraint-Based Data Mining},
pages = {289--309},
publisher = {Springer New York},
abstract = {The chapter gives an overview of the current state of the Sindbad system and planned extensions. Following an introduction to the system and its query language SiQL, we present application scenarios from the areas of gene expression/regulation and small molecules. Next, we describe a web service interface to Sindbad that enables new possibilities for inductive databases (distributing tasks over multiple servers, language and platform independence, \ldots). Finally, we discuss future plans for the system, in particular, to make the system more `declarative' by the use of signatures, to integrate the useful concept of mining views into the system, and to support specific pattern domains like graphs and strings.},
keywords = {data mining, inductive databases, machine learning, query languages},
pubstate = {published},
tppubtype = {incollection}
}
Korošec, Peter; Taskova, Katerina; Šilc, Jurij
The differential Ant-Stigmergy Algorithm for large-scale global optimization Proceedings Article
In: IEEE Congress on Evolutionary Computation, pp. 1-8, 2010.
Links | BibTeX | Altmetric | PlumX | Tags:
@inproceedings{5586201,
title = {The differential Ant-Stigmergy Algorithm for large-scale global optimization},
author = {Peter Koro{\v{s}}ec and Katerina Ta{\v{s}}kova and Jurij {\v{S}}ilc},
doi = {10.1109/CEC.2010.5586201},
year = {2010},
date = {2010-01-01},
urldate = {2010-01-01},
booktitle = {IEEE Congress on Evolutionary Computation},
pages = {1--8},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Taškova, Katerina; Korošec, Peter; Šilc, Jurij
A Distributed Multilevel Ant-Colony Approach for Finite Element Mesh Decomposition Proceedings Article
In: Wyrzykowski, Roman; Dongarra, Jack; Karczewski, Konrad; Wasniewski, Jerzy (Ed.): Parallel Processing and Applied Mathematics, pp. 398–407, Springer Berlin Heidelberg, Berlin, Heidelberg, 2010, ISBN: 978-3-642-14403-5.
@inproceedings{taskova2010distributed,
title = {A Distributed Multilevel Ant-Colony Approach for Finite Element Mesh Decomposition},
author = {Katerina Ta{\v{s}}kova and Peter Koro{\v{s}}ec and Jurij {\v{S}}ilc},
editor = {Roman Wyrzykowski and Jack Dongarra and Konrad Karczewski and Jerzy Wasniewski},
isbn = {978-3-642-14403-5},
year = {2010},
date = {2010-01-01},
urldate = {2010-01-01},
booktitle = {Parallel Processing and Applied Mathematics},
pages = {398--407},
publisher = {Springer Berlin Heidelberg},
address = {Berlin, Heidelberg},
abstract = {The k-way finite element mesh (FEM) decomposition problem is an NP-complete problem, which consists of finding a decomposition of a FEM into k balanced submeshes such that the number of cut edges is minimized. The multilevel ant-colony algorithm (MACA) is quite new and promising hybrid approach for solving different type of FEM-decomposition problems. The MACA is a swarm-based algorithm and therefore inherently suitable for parallel processing on many levels. Motivated by the good performance of the MACA and the possibility to improve its performance (computational cost and/or solution quality), in this paper we discuss the results of parallelizing the MACA on largest scale (on colony level). Explicitly, we present the distributed MACA (DMACA) approach, which is based on the idea of parallel independent runs enhanced with cooperation in form of a solution exchange among the concurrent searches. Experimental evaluation of the DMACA on a larger set of benchmark FEM-decomposition problems shows that the DMACA compared to the MACA can obtain solutions of equal quality in less computational time.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
2008
Wicker, Jörg; Richter, Lothar; Kessler, Kristina; Kramer, Stefan
SINDBAD and SiQL: An Inductive Database and Query Language in the Relational Model Proceedings Article
In: Daelemans, Walter; Goethals, Bart; Morik, Katharina (Ed.): Machine Learning and Knowledge Discovery in Databases, pp. 690–694, Springer Berlin Heidelberg, 2008, ISBN: 978-3-540-87480-5.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: data mining, inductive databases, machine learning, query languages
@inproceedings{wicker2008sindbad,
title = {SINDBAD and SiQL: An Inductive Database and Query Language in the Relational Model},
author = {J{\"o}rg Wicker and Lothar Richter and Kristina Kessler and Stefan Kramer},
editor = {Walter Daelemans and Bart Goethals and Katharina Morik},
url = {http://dx.doi.org/10.1007/978-3-540-87481-2_48},
doi = {10.1007/978-3-540-87481-2_48},
isbn = {978-3-540-87480-5},
year = {2008},
date = {2008-01-01},
booktitle = {Machine Learning and Knowledge Discovery in Databases},
volume = {5212},
pages = {690--694},
publisher = {Springer Berlin Heidelberg},
series = {Lecture Notes in Computer Science},
abstract = {In this demonstration, we will present the concepts and an implementation of an inductive database \textendash as proposed by Imielinski and Mannila \textendash in the relational model. The goal is to support all steps of the knowledge discovery process on the basis of queries to a database system. The query language SiQL (structured inductive query language), an SQL extension, offers query primitives for feature selection, discretization, pattern mining, clustering, instance-based learning and rule induction. A prototype system processing such queries was implemented as part of the SINDBAD (structured inductive database development) project. To support the analysis of multi-relational data, we incorporated multi-relational distance measures based on set distances and recursive descent. The inclusion of rule-based classification models made it necessary to extend the data model and software architecture significantly. The prototype is applied to three different data sets: gene expression analysis, gene regulation prediction and structure-activity relationships (SARs) of small molecules.},
keywords = {data mining, inductive databases, machine learning, query languages},
pubstate = {published},
tppubtype = {inproceedings}
}
Richter, Lothar; Wicker, Jörg; Kessler, Kristina; Kramer, Stefan
An Inductive Database and Query Language in the Relational Model Proceedings Article
In: Proceedings of the 11th International Conference on Extending Database Technology: Advances in Database Technology, pp. 740–744, ACM, 2008, ISBN: 978-1-59593-926-5.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: data mining, inductive databases, machine learning, query languages
@inproceedings{richter2008inductive,
title = {An Inductive Database and Query Language in the Relational Model},
author = {Lothar Richter and J{\"o}rg Wicker and Kristina Kessler and Stefan Kramer},
url = {https://wicker.nz/nwp-acm/authorize.php?id=N10033
http://doi.acm.org/10.1145/1353343.1353440},
doi = {10.1145/1353343.1353440},
isbn = {978-1-59593-926-5},
year = {2008},
date = {2008-01-01},
booktitle = {Proceedings of the 11th International Conference on Extending Database Technology: Advances in Database Technology},
pages = {740--744},
publisher = {ACM},
series = {EDBT '08},
abstract = {In the demonstration, we will present the concepts and an implementation of an inductive database -- as proposed by Imielinski and Mannila -- in the relational model. The goal is to support all steps of the knowledge discovery process, from pre-processing via data mining to post-processing, on the basis of queries to a database system. The query language SIQL (structured inductive query language), an SQL extension, offers query primitives for feature selection, discretization, pattern mining, clustering, instance-based learning and rule induction. A prototype system processing such queries was implemented as part of the SINDBAD (structured inductive database development) project. Key concepts of this system, among others, are the closure of operators and distances between objects. To support the analysis of multi-relational data, we incorporated multi-relational distance measures based on set distances and recursive descent. The inclusion of rule-based classification models made it necessary to extend the data model and the software architecture significantly. The prototype is applied to three different applications: gene expression analysis, gene regulation prediction and structure-activity relationships (SARs) of small molecules.},
keywords = {data mining, inductive databases, machine learning, query languages},
pubstate = {published},
tppubtype = {inproceedings}
}
Wicker, Jörg; Brosdau, Christoph; Richter, Lothar; Kramer, Stefan
SINDBAD SAILS: A Service Architecture for Inductive Learning Schemes Proceedings Article
In: Proceedings of the First Workshop on Third Generation Data Mining: Towards Service-Oriented Knowledge Discovery, 2008.
Abstract | Links | BibTeX | Tags: data mining, inductive databases, machine learning, query languages
@inproceedings{wicker2008sindbadsails,
title = {SINDBAD SAILS: A Service Architecture for Inductive Learning Schemes},
author = {J{\"o}rg Wicker and Christoph Brosdau and Lothar Richter and Stefan Kramer},
url = {http://www.ecmlpkdd2008.org/files/pdf/workshops/sokd/2.pdf},
year = {2008},
date = {2008-01-01},
booktitle = {Proceedings of the First Workshop on Third Generation Data Mining: Towards Service-Oriented Knowledge Discovery},
abstract = {The paper presents SINDBAD SAILS (Service Architecture for Inductive Learning Schemes), a Web Service interface to the inductive database SINDBAD. To the best of our knowledge, it is the first time a Web Service interface is provided for an inductive database. The combination of service-oriented architectures and inductive databases is particularly useful, as it enables distributed data mining without the need to install specialized data mining or machine learning software. Moreover, inductive queries can easily be used in almost any kind of programming language. The paper discusses the underlying concepts and explains a sample program making use of SINDBAD SAILS.},
keywords = {data mining, inductive databases, machine learning, query languages},
pubstate = {published},
tppubtype = {inproceedings}
}
Wicker, Jörg; Fenner, Kathrin; Ellis, Lynda; Wackett, Larry; Kramer, Stefan
Machine Learning and Data Mining Approaches to Biodegradation Pathway Prediction Proceedings Article
In: Bridewell, Will; Calders, Toon; Medeiros, Ana Karla; Kramer, Stefan; Pechenizkiy, Mykola; Todorovski, Ljupco (Ed.): Proceedings of the Second International Workshop on the Induction of Process Models at ECML PKDD 2008, 2008.
Links | BibTeX | Tags: biodegradation, cheminformatics, computational sustainability, enviPath, machine learning, metabolic pathways
@inproceedings{wicker2008machine,
title = {Machine Learning and Data Mining Approaches to Biodegradation Pathway Prediction},
author = {J{\"o}rg Wicker and Kathrin Fenner and Lynda Ellis and Larry Wackett and Stefan Kramer},
editor = {Will Bridewell and Toon Calders and Ana Karla Medeiros and Stefan Kramer and Mykola Pechenizkiy and Ljupco Todorovski},
url = {http://www.ecmlpkdd2008.org/files/pdf/workshops/ipm/9.pdf},
year = {2008},
date = {2008-01-01},
booktitle = {Proceedings of the Second International Workshop on the Induction of Process Models at ECML PKDD 2008},
keywords = {biodegradation, cheminformatics, computational sustainability, enviPath, machine learning, metabolic pathways},
pubstate = {published},
tppubtype = {inproceedings}
}
2006
Kramer, Stefan; Aufschild, Volker; Hapfelmeier, Andreas; Jarasch, Alexander; Kessler, Kristina; Reckow, Stefan; Wicker, Jörg; Richter, Lothar
Inductive Databases in the Relational Model: The Data as the Bridge Proceedings Article
In: Bonchi, Francesco; Boulicaut, Jean-François (Ed.): Knowledge Discovery in Inductive Databases, pp. 124–138, Springer Berlin Heidelberg, 2006, ISBN: 978-3-540-33292-3.
Abstract | Links | BibTeX | Altmetric | PlumX | Tags: data mining, inductive databases, machine learning, query languages
@inproceedings{kramer2006inductive,
title = {Inductive Databases in the Relational Model: The Data as the Bridge},
author = {Stefan Kramer and Volker Aufschild and Andreas Hapfelmeier and Alexander Jarasch and Kristina Kessler and Stefan Reckow and J{\"o}rg Wicker and Lothar Richter},
editor = {Francesco Bonchi and Jean-Fran{\c{c}}ois Boulicaut},
url = {http://dx.doi.org/10.1007/11733492_8},
doi = {10.1007/11733492_8},
isbn = {978-3-540-33292-3},
year = {2006},
date = {2006-01-01},
booktitle = {Knowledge Discovery in Inductive Databases},
volume = {3933},
pages = {124--138},
publisher = {Springer Berlin Heidelberg},
series = {Lecture Notes in Computer Science},
abstract = {We present a new and comprehensive approach to inductive databases in the relational model. The main contribution is a new inductive query language extending SQL, with the goal of supporting the whole knowledge discovery process, from pre-processing via data mining to post-processing. A prototype system supporting the query language was developed in the SINDBAD (structured inductive database development) project. Setting aside models and focusing on distance-based and instance-based methods, closure can easily be achieved. An example scenario from the area of gene expression data analysis demonstrates the power and simplicity of the concept. We hope that this preliminary work will help to bring the fundamental issues, such as the integration of various pattern domains and data mining techniques, to the attention of the inductive database community.},
keywords = {data mining, inductive databases, machine learning, query languages},
pubstate = {published},
tppubtype = {inproceedings}
}
