% yano_pres/out/main.tex

% Preamble: document class, fonts, title metadata, theme and package setup.
% \UseRawInputEncoding switches off LaTeX's default UTF-8 input handling.
\UseRawInputEncoding
%\documentclass[hyperref={pdfpagelabels=false}]{beamer}
\documentclass[hyperref={pdfpagelabels=false},aspectratio=169]{beamer}
% The hyperref option hyperref={pdfpagelabels=false} suppresses the warning:
% Package hyperref Warning: Option `pdfpagelabels' is turned off
% (hyperref)                because \thepage is undefined.
% Hyperref stopped early
%

\usepackage{lmodern}
% Loading lmodern avoids the following warnings:
% LaTeX Font Warning: Font shape `OT1/cmss/m/n' in size <4> not available
% (Font)              size <5> substituted on input line 22.
% LaTeX Font Warning: Size substitutions with differences
% (Font)              up to 1.0pt have occurred.
%

% If \title{\ldots} \author{\ldots} only appear after \begin{document},
% the following warning is issued:
% Package hyperref Warning: Option `pdfauthor' has already been used,
% (hyperref) ...
% Therefore they are placed here, before \begin{document}

\title[yano]{pip install yano}   
\author{Simon Kluettermann} 
\date{\today} 


 \institute{ls9 tu Dortmund}


% Prevents the navigation symbols from being displayed.
\setbeamertemplate{navigation symbols}{}

% Additionally, the package beamerthemeshadow is loaded.
% NOTE(review): \usetheme{CambridgeUS} is selected further down as well;
% confirm that loading both themes is intended.
\usepackage{beamerthemeshadow}

\hypersetup{pdfstartview={Fit}} % fits the presentation to the window when first displayed

\usepackage{appendixnumberbeamer}
\usepackage{listings}


\usetheme{CambridgeUS}
% NOTE(review): the slide text is English; confirm that German language
% support (ngerman) is really wanted here.
\usepackage{ngerman}
\usecolortheme{dolphin}


%  \beamersetuncovermixins{\opaqueness<1>{25}}{\opaqueness<2$\Rightarrow${15}}
%  makes elements that are still to come (in the future)
%  appear only faintly
%\beamersetuncovermixins{\opaqueness<1>{25}}{\opaqueness<2$\Rightarrow${15}}%here disabled
% also works for tables when teTeX is used\ldots
% Redefine the figure caption name to be empty.
\renewcommand{\figurename}{}

% Custom footline: three beamercolorboxes placed side by side in one \hbox.
%   left   (0.4\paperwidth,    colour `author in head/foot'): short title
%   centre (0.25\paperwidth,   colour `title in head/foot'):  current section name
%   right  (0.3499\paperwidth, colour `date in head/foot'):   short date and
%          "frame number / total frame number"
% The three widths add up to 0.9999\paperwidth.
% NOTE(review): the frame counter is wrapped in \hyperlink{toc}{...}, but no
% target named `toc' is defined anywhere in this file -- confirm whether a
% table-of-contents frame with that label is supposed to exist.
\setbeamertemplate{footline}
{
  \leavevmode%
  \hbox{%
  \begin{beamercolorbox}[wd=.4\paperwidth,ht=2.25ex,dp=1ex,center]{author in head/foot}%
    \usebeamerfont{author in head/foot}\insertshorttitle
  \end{beamercolorbox}%
  \begin{beamercolorbox}[wd=.25\paperwidth,ht=2.25ex,dp=1ex,center]{title in head/foot}%
    \usebeamerfont{title in head/foot}\insertsection
  \end{beamercolorbox}%
  \begin{beamercolorbox}[wd=.3499\paperwidth,ht=2.25ex,dp=1ex,right]{date in head/foot}%
    \usebeamerfont{date in head/foot}\insertshortdate{}\hspace*{2em}
    \hyperlink{toc}{\insertframenumber{} / \inserttotalframenumber\hspace*{2ex}} 
  \end{beamercolorbox}}%
  \vskip0pt%
}

\usepackage[absolute,overlay]{textpos}
\usepackage{graphicx}

% \source{<text>}: prints "Source: <text>" inside a 9cm wide textblock* that is
% positioned at (0.1cm,8.9cm); textpos is loaded with [absolute,overlay].
% The text is set with the beamer font and colour named `framesource', using
% the colour expression fg!66.
% NOTE(review): no \setbeamerfont / \setbeamercolor for `framesource' is visible
% in this file, and \source is not called in any frame of this file -- confirm
% whether the macro is still needed.
\newcommand{\source}[1]{\begin{textblock*}{9cm}(0.1cm,8.9cm)
    \begin{beamercolorbox}[ht=0.5cm,left]{framesource}
        \usebeamerfont{framesource}\usebeamercolor[fg!66]{framesource} Source: {#1}
    \end{beamercolorbox}
\end{textblock*}}


\begin{document}


%from file ../yano//data/000.txt
\begin{frame}[label=]
\frametitle{}
% Title slide.
% No \pagenumbering, \thispagestyle, \newpage or \setcounter{page} in here:
% those are page-layout commands for article/report style documents, whereas
% beamer breaks and numbers the slides itself.
% NOTE(review): the text below (master thesis at RWTH Aachen, November 2020)
% does not match the \title/\institute/\date set in the preamble -- presumably
% left over from a thesis template; confirm whether it should stay.
\begin{titlepage}

	\centering
	{\huge\bfseries \par}
	\vspace{2cm}
	{\LARGE\itshape Simon Kluettermann\par}
	\vspace{1.5cm}
	{\scshape\Large Master Thesis in Physics\par}
	\vspace{0.2cm}
	{\Large submitted to the \par}
	\vspace{0.2cm}
	{\scshape\Large Faculty of Mathematics Computer Science and Natural Sciences \par}
	\vspace{0.2cm}
	{\Large \par}
	\vspace{0.2cm}
	{\scshape\Large RWTH Aachen University}
	\vspace{1cm}
	
	\vfill
	{\scshape\Large Department of Physics\par}
	\vspace{0.2cm}
	{\scshape\Large Institute for theoretical Particle Physics and Cosmology\par}
	\vspace{0.2cm}
	{ \Large\par}
	\vspace{0.2cm}
	{\Large First Referee: Prof. Dr. Michael Kraemer \par}
	{\Large Second Referee: Prof. Dr. Felix Kahlhoefer}

	\vfill

% Bottom of the page
	{\large November 2020 \par}
\end{titlepage}
\end{frame}


%from file ../yano//data/001Problem.txt
\begin{frame}[label=Problem]
\frametitle{Problem}
\begin{itemize}

    \item Paper with Benedikt

    \item require multiple very specific datasets

\begin{itemize}

    \item many but not too many features

    \item at least some samples (for the NN)

    \item Only numerical attributes best

    \item specific quality

    \item unrelated datasets


\end{itemize}
    \item Requires you to search for many datasets and filter them


\end{itemize}
\end{frame}


%from file ../yano//data/002Students.txt
\begin{frame}[label=Students]
\frametitle{Students}
\begin{itemize}

    \item Not clear what you can use

    \item Many different formats

    \item train/test splits

    \item So for Students I just do this work and send them archives directly

    \item $\Rightarrow$Not a good solution


\end{itemize}
\end{frame}


%from file ../yano//data/003yano.txt
\begin{frame}[label=yano]
\frametitle{yano}
\begin{itemize}

    \item So I have been packaging all my scripts

    \item I had surprisingly much fun doing this

\begin{itemize}

    \item More than just standard functions

    \item A couple of weird decisions

    \item And this will likely grow further


\end{itemize}
    \item $\Rightarrow$So I would like to discuss some parts with you and maybe you even have more features you might want


\end{itemize}
\end{frame}


%from file ../yano//data/004yano.txt
\begin{frame}[label=yano]
\frametitle{yano}
\begin{columns}[c] % align columns
\begin{column}{0.48\textwidth}%.48
\begin{itemize}

    \item Simply install it over pip

    \item Contains 187 real-World Datasets

    \item $\Rightarrow$biggest library of datasets explicitly for anomaly detection

    \item not yet happy with this

    \item in particular, it mostly contains only numerical and nominal attributes

    \item $\Rightarrow$few categorical and no time-series attributes


\end{itemize}
\end{column}%
\hfill%
    \begin{column}{0.48\textwidth}%.48
\begin{figure}[H] 
  \centering
\includegraphics[width=0.9\textwidth]{../prep/04yano/a.png}
\label{fig:prep04yanoapng}
  \end{figure}


\end{column}%
\hfill%
\end{columns}

\end{frame}


%from file ../yano//data/005selector.txt
\newpage
\section{Basics}\label{sec:Basics}
%{{{for_Basics}}}

\begin{frame}[label=selector,containsverbatim]
\frametitle{selector}
% The whole condition is wrapped in one pair of outer parentheses: Python does
% not continue a statement after a trailing `&', so without them the listing
% is a SyntaxError when it is copied from the slide.
\begin{lstlisting}[language=Python]
import yano
from yano.symbols import *
condition = ((number_of_features>5) &
             (number_of_features<100) &
             (number_of_samples>100) &
             (number_of_samples<10000) &
             (number_of_samples>2*number_of_features) &
             ~index)
print(len(condition), "Datasets found")
\end{lstlisting}

$\Rightarrow$33 Datasets found
\end{frame}


%from file ../yano//data/006selectors.txt
\begin{frame}[label=selectors]
\frametitle{selectors}
\begin{itemize}

    \item Lots of symbols like this

\begin{itemize}

    \item name

    \item number\_of\_features

    \item number\_of\_samples

    \item index (correlated datasets)


\end{itemize}
    \item Feature types

\begin{itemize}

    \item numeric

    \item nominal

    \item categorical

    \item (textual)


\end{itemize}
    \item Count based

\begin{itemize}

    \item number\_anomalies

    \item number\_normals

    \item fraction\_anomalies


\end{itemize}
    \item Specific ones

\begin{itemize}

    \item image\_based

    \item (linearly\_seperable)


\end{itemize}

\end{itemize}
\end{frame}


%from file ../yano//data/007iterating.txt
\begin{frame}[label=iterating,containsverbatim]
\frametitle{iterating}
% The loop prints the loop variable (one dataset per iteration), which is what
% the list of dataset names below the listing illustrates.
\begin{lstlisting}[language=Python]
for dataset in condition:
    print(dataset)
\end{lstlisting}

% The brackets are literal text. They are wrapped in braces so that \item does
% not read them as its optional argument (and they are not display math).
\begin{itemize}

    \item {[}annthyroid{]}

    \item {[}breastw{]}

    \item {[}cardio{]}

    \item {[}\dots{]}

    \item {[}Housing\_low{]}


\end{itemize}
\end{frame}


%from file ../yano//data/008iterating.txt
\begin{frame}[label=iterating,containsverbatim]
\frametitle{iterating}
\begin{lstlisting}[language=Python]
for dataset in condition:
    x=dataset.getx()
    y=dataset.gety()
\end{lstlisting}

\end{frame}


%from file ../yano//data/009pipeline.txt
\begin{frame}[label=pipeline,containsverbatim]
\frametitle{pipeline}
\begin{lstlisting}[language=Python]
from yano.iter import *
for dataset, x,tx,ty in pipeline(condition, 
                                 split,
                                 shuffle,
                                 normalize("minmax")):
    ...
\end{lstlisting}

\end{frame}


%from file ../yano//data/010pipeline.txt
\begin{frame}[label=pipeline]
\frametitle{pipeline}
\begin{itemize}

    \item Again there are a couple modifiers possible

\begin{itemize}

    \item nonconst$\Rightarrow$remove constant features

    \item shuffle

    \item normalize('zscore'/'minmax')

    \item cut(10)$\Rightarrow$at most 10 datasets

    \item split$\Rightarrow$train test split, all anomalies in test set

    \item crossval(5)$\Rightarrow$similar to split, but do multiple times (crossvalidation)


\end{itemize}
    \item modifiers interact with each other

    \item For example: normalize('minmax'), split

    \item $\Rightarrow$train set always below 1, but no guarantees for the test set


\end{itemize}
\end{frame}


%from file ../yano//data/011CrossValidation.txt
\begin{frame}[label=CrossValidation]
\frametitle{CrossValidation}
\begin{itemize}

    \item Learned from DMC: Crossvalidation is important

    \item Rarely found in Anomaly Detection, why?

    \item A bit more complicated (not all samples are equal), but no reason why not

    \item $\Rightarrow$So I implemented it into yano

\begin{itemize}

    \item folding only on normal data

    \item How to handle anomalies?

    \item If not folding them, cross-validation less useful

    \item if folding them, often rare anomalies even more rare

    \item $\Rightarrow$test set always 50\% anomalous

    \item $\Rightarrow$Also improves simple evaluation metrics (accuracy)


\end{itemize}
    \item Do you know a reason why Cross Validation is not common in AD?

    \item Are there Problems with the way I fold my Anomalies?


\end{itemize}
\end{frame}


%from file ../yano//data/012Logging.txt
\begin{frame}[label=Logging,containsverbatim]
\frametitle{Logging}
\begin{lstlisting}[language=Python]
from yano.logging import Logger
from pyod.models.iforest import IForest
from extended_iforest import train_extended_ifor
l=Logger({"IFor":IForest(n_estimators=100),
          "eIFor":train_extended_ifor})
for dataset, folds in pipeline(condition,
                               crossval(5),
                               normalize("minmax"),
                               shuffle):
    l.run_cross(dataset, folds)
latex=l.to_latex()
\end{lstlisting}

\end{frame}


%from file ../yano//data/013Seeding.txt
\begin{frame}[label=Seeding]
\frametitle{Seeding}
\begin{itemize}

    \item If you don't do anything, everything is seeded.

    \item Makes rerunning a Model until the performance is good quite obvious.

    \item But as every Run is seeded itself, this might induce bias.

    \item Do you think this is worth it?

    \item Are there any Problems with this?


\end{itemize}
\end{frame}


%from file ../yano//data/014.txt
\begin{frame}[label=]
\frametitle{}
\begin{tabular}{lll}
\hline
 Dataset                   & eIFor                         & IFor                          \\
\hline
 $pc3$                     & $\textbf{0.7231}  \pm 0.0153$ & $\textbf{0.7223}  \pm 0.0178$ \\
 $pima$                    & $\textbf{0.7405}  \pm 0.0110$ & $\textbf{0.7347}  \pm 0.0126$ \\
 $Diabetes\_present$       & $\textbf{0.7414}  \pm 0.0195$ & $\textbf{0.7344}  \pm 0.0242$ \\
 $waveform-5000$           & $\textbf{0.7687}  \pm 0.0123$ & $\textbf{0.7592}  \pm 0.0206$ \\
 $vowels$                  & $\textbf{0.7843}  \pm 0.0298$ & $\textbf{0.7753}  \pm 0.0334$ \\
 $Vowel\_0$                & $\textbf{0.8425}  \pm 0.0698$ & $0.7193 \pm 0.0817$           \\
 $Abalone\_1\_8$           & $\textbf{0.8525}  \pm 0.0263$ & $0.8452 \pm 0.0257$           \\
 $annthyroid$              & $0.8399 \pm 0.0135$           & $\textbf{0.9087}  \pm 0.0090$ \\
 $Vehicle\_van$            & $\textbf{0.8792}  \pm 0.0265$ & $\textbf{0.8697}  \pm 0.0383$ \\
 $ionosphere$              & $\textbf{0.9320}  \pm 0.0069$ & $0.9086 \pm 0.0142$           \\
 $breastw$                 & $\textbf{0.9948}  \pm 0.0031$ & $\textbf{0.9952}  \pm 0.0033$ \\
 $segment$                 & $\textbf{1.0}$                & $\textbf{0.9993}  \pm 0.0015$ \\
 $$                        & $$                            & $$                            \\
 $Average$                 & $\textbf{0.8005}$ & $\textbf{0.7957}$ \\
\hline
\end{tabular}
\end{frame}


%from file ../yano//data/015statistics.txt
\begin{frame}[label=statistics]
\frametitle{statistics}
\begin{itemize}

    \item Friedman test to see if there is a difference between models

    \item Nemenyi test to see which models are equal, mark those equal to the maximum

    \item For 2 models, Friedman not defined $\Rightarrow$ use Wilcoxon test

    \item Does this match your expectation from the table?

    \item Two models are `equal' if their probability $p$ of being from the same distribution satisfies $p \geq p_{b}$. What value should $p_{b}$ have (currently $p_{b} = 0.1$)?

    \item Do I need to correct for p-hacking (n experiments, so increase the difficulty for each), or is that clear from the table?


\end{itemize}
\end{frame}


%from file ../yano//data/016Extended Isolation Forests.txt
\newpage
\section{Experiments 1}\label{sec:Experiments 1}
%{{{for_Experiments 1}}}

\begin{frame}[label=Extended Isolation Forests]
\frametitle{Extended Isolation Forests}
\begin{columns}[c] % align columns
\begin{column}{0.48\textwidth}%.48
\begin{itemize}

    \item Isolation Forests are one algorithm for AD

    \item Tries to isolate abnormal (rare) points instead of modelling normal ones

    \item Creative approach$\Rightarrow$fairly successful (3000 Citations)

    \item Many follow up papers

    \item Extended Isolation Forest (Hariri et al.\ 2018, 140 Citations)

    \item Remove bias from the Isolation Forests

    \item Also claim to improve their anomaly detection quality


\end{itemize}
\end{column}%
\hfill%

\begin{column}{0.48\textwidth}%.48
\begin{figure}[H] 
  \centering
\includegraphics[width=0.9\textwidth]{../imgs/ifor}
\label{fig:ifor}
  \end{figure}


\end{column}%
\hfill%
\end{columns}

\end{frame}


\begin{frame}[label=Extended Isolation Forests]
\frametitle{Extended Isolation Forests}
\begin{columns}[c] % align columns
\begin{column}{0.48\textwidth}%.48
\begin{itemize}

    \item Isolation Forests are one algorithm for AD

    \item Tries to isolate abnormal (rare) points instead of modelling normal ones

    \item Creative approach$\Rightarrow$fairly successful (3000 Citations)

    \item Many follow up papers

    \item Extended Isolation Forest (Hariri et al.\ 2018, 140 Citations)

    \item Remove bias from the Isolation Forests

    \item Also claim to improve their anomaly detection quality


\end{itemize}
\end{column}%
\hfill%

\begin{column}{0.48\textwidth}%.48
\begin{figure}[H] 
  \centering
\includegraphics[width=0.9\textwidth]{../imgs/eifor}
\label{fig:eifor}
  \end{figure}


\end{column}%
\hfill%
\end{columns}

\end{frame}


\begin{frame}[label=Extended Isolation Forests]
\frametitle{Extended Isolation Forests}
\begin{columns}[c] % align columns
\begin{column}{0.48\textwidth}%.48
\begin{itemize}

    \item Isolation Forests are one algorithm for AD

    \item Tries to isolate abnormal (rare) points instead of modelling normal ones

    \item Creative approach$\Rightarrow$fairly successful (3000 Citations)

    \item Many follow up papers

    \item Extended Isolation Forest (Hariri et al.\ 2018, 140 Citations)

    \item Remove bias from the Isolation Forests

    \item Also claim to improve their anomaly detection quality


\end{itemize}
\end{column}%
\hfill%

\begin{column}{0.48\textwidth}%.48
\begin{figure}[H] 
  \centering
\includegraphics[width=0.9\textwidth]{../imgs/qual}
\label{fig:qual}
  \end{figure}


\end{column}%
\hfill%
\end{columns}

\end{frame}


%from file ../yano//data/017.txt
\begin{frame}[label=]
\frametitle{}
\begin{tabular}{lll}
\hline
 Dataset                   & eIFor                         & IFor                          \\
\hline
 $Delft\_pump\_5x3\_noisy$ & $\textbf{0.3893}  \pm 0.0345$ & $\textbf{0.4272}  \pm 0.0680$ \\
 $vertebral$               & $\textbf{0.4260}  \pm 0.0111$ & $\textbf{0.4554}  \pm 0.0416$ \\
 $Liver\_1$                & $0.5367 \pm 0.0508$           & $\textbf{0.5474}  \pm 0.0541$ \\
 $Sonar\_mines$            & $\textbf{0.6882}  \pm 0.1264$ & $0.6189 \pm 0.1301$           \\
 $letter$                  & $\textbf{0.6756}  \pm 0.0119$ & $0.6471 \pm 0.0111$           \\
 $Glass\_building\_float$  & $\textbf{0.6480}  \pm 0.1012$ & $\textbf{0.6755}  \pm 0.1117$ \\
 $pc3$                     & $\textbf{0.7231}  \pm 0.0153$ & $\textbf{0.7223}  \pm 0.0178$ \\
 $pima$                    & $\textbf{0.7405}  \pm 0.0110$ & $\textbf{0.7347}  \pm 0.0126$ \\
 $Diabetes\_present$       & $\textbf{0.7414}  \pm 0.0195$ & $\textbf{0.7344}  \pm 0.0242$ \\
 $waveform-5000$           & $\textbf{0.7687}  \pm 0.0123$ & $\textbf{0.7592}  \pm 0.0206$ \\
 $steel-plates-fault$      & $\textbf{0.7735}  \pm 0.0351$ & $\textbf{0.7682}  \pm 0.0402$ \\
 $vowels$                  & $\textbf{0.7843}  \pm 0.0298$ & $\textbf{0.7753}  \pm 0.0334$ \\
\hline
\end{tabular}
\end{frame}


%from file ../yano//data/018.txt
\begin{frame}[label=]
\frametitle{}
\begin{tabular}{lll}
\hline
 Dataset                   & eIFor                         & IFor                          \\
\hline
 $Vowel\_0$                & $\textbf{0.8425}  \pm 0.0698$ & $0.7193 \pm 0.0817$           \\
 $Housing\_low$            & $\textbf{0.7807}  \pm 0.0333$ & $\textbf{0.7862}  \pm 0.0336$ \\
 $ozone-level-8hr$         & $\textbf{0.7904}  \pm 0.0207$ & $\textbf{0.7768}  \pm 0.0118$ \\
 $Spectf\_0$               & $\textbf{0.8155}  \pm 0.0255$ & $0.7535 \pm 0.0239$           \\
 $HeartC$                  & $0.7795 \pm 0.0258$           & $\textbf{0.8079}  \pm 0.0255$ \\
 $satellite$               & $\textbf{0.8125}  \pm 0.0170$ & $\textbf{0.8103}  \pm 0.0061$ \\
 $optdigits$               & $\textbf{0.8099}  \pm 0.0310$ & $\textbf{0.8142}  \pm 0.0267$ \\
 $spambase$                & $\textbf{0.8085}  \pm 0.0110$ & $\textbf{0.8202}  \pm 0.0042$ \\
 $Abalone\_1\_8$           & $\textbf{0.8525}  \pm 0.0263$ & $0.8452 \pm 0.0257$           \\
 $qsar-biodeg$             & $\textbf{0.8584}  \pm 0.0119$ & $\textbf{0.8628}  \pm 0.0135$ \\
 $annthyroid$              & $0.8399 \pm 0.0135$           & $\textbf{0.9087}  \pm 0.0090$ \\
 $Vehicle\_van$            & $\textbf{0.8792}  \pm 0.0265$ & $\textbf{0.8697}  \pm 0.0383$ \\
\hline
\end{tabular}
\end{frame}


%from file ../yano//data/019.txt
\begin{frame}[label=]
\frametitle{}
\begin{tabular}{lll}
\hline
 Dataset                   & eIFor                         & IFor                          \\
\hline
 $ionosphere$              & $\textbf{0.9320}  \pm 0.0069$ & $0.9086 \pm 0.0142$           \\
 $page-blocks$             & $0.9189 \pm 0.0061$           & $\textbf{0.9299}  \pm 0.0016$ \\
 $Ecoli$                   & $\textbf{0.9418}  \pm 0.0292$ & $0.9192 \pm 0.0332$           \\
 $cardio$                  & $\textbf{0.9564}  \pm 0.0043$ & $\textbf{0.9535}  \pm 0.0036$ \\
 $wbc$                     & $\textbf{0.9611}  \pm 0.0121$ & $\textbf{0.9607}  \pm 0.0107$ \\
 $pendigits$               & $\textbf{0.9641}  \pm 0.0097$ & $\textbf{0.9652}  \pm 0.0076$ \\
 $thyroid$                 & $0.9818 \pm 0.0024$           & $\textbf{0.9871}  \pm 0.0025$ \\
 $breastw$                 & $\textbf{0.9948}  \pm 0.0031$ & $\textbf{0.9952}  \pm 0.0033$ \\
 $segment$                 & $\textbf{1.0}$                & $\textbf{0.9993}  \pm 0.0015$ \\
 $$                        & $$                            & $$                            \\
 $Average$                 & $\textbf{0.8005}  \pm 0.1458$ & $\textbf{0.7957}  \pm 0.1431$ \\
\hline
\end{tabular}
\end{frame}


%from file ../yano//data/020highdim.txt
\newpage
\section{Experiments 2}\label{sec:Experiments 2}
%{{{for_Experiments 2}}}

\begin{frame}[label=highdim]
\frametitle{highdim}
\begin{figure}[H] 
  \centering
\includegraphics[width=0.9\textwidth]{../prep/19highdim/a.png}
\label{fig:prep19highdimapng}
  \end{figure}


\end{frame}


%from file ../yano//data/021New Condition.txt
\begin{frame}[label=New Condition,containsverbatim]
\frametitle{New Condition}
% The whole condition is wrapped in one pair of outer parentheses: Python does
% not continue a statement after a trailing `&', so without them the listing
% is a SyntaxError when it is copied from the slide.
\begin{lstlisting}[language=Python]
condition = ((number_of_samples>200) &
             (number_of_samples<10000) &
             (number_of_features>50) &
             (number_of_features<500) &
             ~index)
print(len(condition),"Datasets found")
\end{lstlisting}

$\Rightarrow$13 Datasets found
\end{frame}


%from file ../yano//data/022New Models.txt
\begin{frame}[label=New Models,containsverbatim]
\frametitle{New Models}
% The class imported on the first line is spelled IForest, so the Logger
% dictionary has to use that spelling too (Iforest would be a NameError).
% Logger itself is imported on the earlier `Logging' slide.
\begin{lstlisting}[language=Python]
from pyod.models.iforest import IForest
from pyod.models.knn import KNN
from pyod.models.lof import LOF
l=Logger({"IFor":IForest(n_estimators=100),
          "Lof":LOF(),
          "Knn": KNN()}, addfeat=True)
\end{lstlisting}

\end{frame}


%from file ../yano//data/023.txt
\begin{frame}[label=]
\frametitle{}
\begin{tabular}{llll}
\hline
 Dataset                       & Knn                           & Lof                           & IFor                          \\
\hline
 $Delft\_pump\_5x3\_noisy(64)$ & $0.3800 \pm 0.0475$           & $0.3462 \pm 0.0327$           & $\textbf{0.4272}  \pm 0.0680$ \\
 $hill-valley(100)$            & $0.4744 \pm 0.0269$           & $\textbf{0.5060}  \pm 0.0327$ & $0.4720 \pm 0.0288$           \\
 $speech(400)$                 & $0.4903 \pm 0.0103$           & $\textbf{0.5104}  \pm 0.0115$ & $0.4872 \pm 0.0184$           \\
 $Sonar\_mines(60)$            & $\textbf{0.7284}  \pm 0.0939$ & $0.6769 \pm 0.0933$           & $0.6189 \pm 0.1301$           \\
 $ozone-level-8hr(72)$         & $\textbf{0.8051}  \pm 0.0288$ & $0.7738 \pm 0.0292$           & $\textbf{0.7768}  \pm 0.0118$ \\
 $spambase(57)$                & $0.8038 \pm 0.0125$           & $0.7712 \pm 0.0055$           & $\textbf{0.8202}  \pm 0.0042$ \\
 $arrhythmia(274)$             & $\textbf{0.8137}  \pm 0.0185$ & $0.8042 \pm 0.0186$           & $\textbf{0.8086}  \pm 0.0099$ \\
 $mnist(100)$                  & $0.9345 \pm 0.0039$           & $\textbf{0.9548}  \pm 0.0037$ & $0.8732 \pm 0.0069$           \\
 $Concordia3\_32(256)$         & $0.9246 \pm 0.0107$           & $\textbf{0.9486}  \pm 0.0099$ & $\textbf{0.9322}  \pm 0.0178$ \\
 $optdigits(64)$               & $0.9966 \pm 0.0012$           & $\textbf{0.9975}  \pm 0.0012$ & $0.8142 \pm 0.0267$           \\
 $gas-drift(128)$              & $\textbf{0.9790}  \pm 0.0018$ & $0.9585 \pm 0.0055$           & $0.8764 \pm 0.0166$           \\
 $Delft\_pump\_AR(160)$        & $\textbf{0.9965}$             & $\textbf{0.9953}  \pm 0.0019$ & $0.9665 \pm 0.0096$           \\
 $musk(166)$                   & $\textbf{1.0}$                & $\textbf{1.0}$                & $0.9808 \pm 0.0117$           \\
 $$                            & $$                            & $$                            & $$                            \\
 $Average$                     & $\textbf{0.7944}$ & $\textbf{0.7879}$ & $0.7580$           \\
\hline
\end{tabular}
\end{frame}


%from file ../yano//data/024.txt
\begin{frame}[label=,containsverbatim]
\frametitle{}
\begin{itemize}

    \item Hypothesis: Isolation Forests are better when there are numerical and nominal attributes

    \item Easy to test


\end{itemize}
\begin{lstlisting}[language=Python]
condition=condition & (numeric & nominal)
\end{lstlisting}

\end{frame}


%from file ../yano//data/025.txt
\begin{frame}[label=]
\frametitle{}
\begin{tabular}{llll}
\hline
 Dataset               & Knn                           & IFor                          & Lof                           \\
\hline
 $ozone-level-8hr(72)$ & $\textbf{0.8051}  \pm 0.0288$ & $\textbf{0.7768}  \pm 0.0118$ & $0.7738 \pm 0.0292$           \\
 $spambase(57)$        & $0.8038 \pm 0.0125$           & $\textbf{0.8202}  \pm 0.0042$ & $0.7712 \pm 0.0055$           \\
 $arrhythmia(274)$     & $\textbf{0.8137}  \pm 0.0185$ & $\textbf{0.8086}  \pm 0.0099$ & $0.8042 \pm 0.0186$           \\
 $musk(166)$           & $\textbf{1.0}$                & $0.9808 \pm 0.0117$           & $\textbf{1.0}$                \\
 $$                    & $$                            & $$                            & $$                            \\
 $Average$             & $\textbf{0.8556}$ & $\textbf{0.8466}$ & $\textbf{0.8373}$ \\
\hline
\end{tabular}
\begin{itemize}

    \item Only 4 datasets, so not clear at all

    \item $\Rightarrow$More datasets


\end{itemize}
\end{frame}


%from file ../yano//data/026Unsupervised Optimization.txt
\newpage
\section{Experiments 3}\label{sec:Experiments 3}
%{{{for_Experiments 3}}}

\begin{frame}[label=Unsupervised Optimization]
\frametitle{Unsupervised Optimization}
\begin{itemize}

    \item There are analyses that are only possible with many datasets

    \item Here: unsupervised optimization

    \item Given multiple AD models, find which is best:

    \item Use AUC score? Requires Anomalies$\Rightarrow$Overfitting

    \item Can you find an unsupervised Method?

    \item In general very complicated, so here only focus on very small differences in the model.

    \item So each model is an autoencoder, trained on the same dataset, where the difference is only in the initialisation


\end{itemize}
\end{frame}


%from file ../yano//data/027Loss Optimization.txt
\begin{frame}[label=Loss Optimization]
\frametitle{Loss Optimization}
\begin{columns}[c] % align columns
\begin{column}{0.48\textwidth}%.48
\begin{itemize}

    \item First guess: Loss of the Model on the training Data

    \item How to evaluate this?

    \item Train many models, look at the average AUC score.

    \item For the alternative, take groups of 20 models, and look at the AUC score of the best model.

    \item Is there a meaningful difference between results? Give result as z\_score ($\frac{m_{1} - m_{2}}{\sqrt{s_{1}^{2} + s_{2}^{2}}}$)

    \item This difference depends a lot on the dataset

    \item $\Rightarrow$even $30 \leq z$ does not mean much


\end{itemize}
\end{column}%
\hfill%
        \begin{column}{0.48\textwidth}%.48
\begin{figure}[H] 
  \centering
\includegraphics[width=0.9\textwidth]{../imgs/histone_page-blocks}
\label{fig:histone_page-blocks}
  \end{figure}


\end{column}%
\hfill%
\end{columns}

\end{frame}


\begin{frame}[label=Loss Optimization]
\frametitle{Loss Optimization}
\begin{columns}[c] % align columns
\begin{column}{0.48\textwidth}%.48
\begin{itemize}

    \item First guess: Loss of the Model on the training Data

    \item How to evaluate this?

    \item Train many models, look at the average AUC score.

    \item For the alternative, take groups of 20 models, and look at the AUC score of the best model.

    \item Is there a meaningful difference between results? Give result as z\_score ($\frac{m_{1} - m_{2}}{\sqrt{s_{1}^{2} + s_{2}^{2}}}$)

    \item This difference depends a lot on the dataset

    \item $\Rightarrow$even $30 \leq z$ does not mean much


\end{itemize}
\end{column}%
\hfill%
        \begin{column}{0.48\textwidth}%.48
\begin{figure}[H] 
  \centering
\includegraphics[width=0.9\textwidth]{../imgs/histone_pima}
\label{fig:histone_pima}
  \end{figure}


\end{column}%
\hfill%
\end{columns}

\end{frame}


%from file ../yano//data/028loss.txt
\begin{frame}[label=loss]
\frametitle{loss}
\begin{columns}[c] % align columns
\begin{column}{0.48\textwidth}%.48
\begin{itemize}

    \item Pick the Model with the lowest l2-loss


\end{itemize}
\end{column}%
\hfill%
    \begin{column}{0.48\textwidth}%.48
\begin{figure}[H] 
  \centering
\includegraphics[width=0.9\textwidth]{../prep/27loss/z_loss.pdf}
\label{fig:prep27lossz_losspdf}
  \end{figure}


\end{column}%
\hfill%
\end{columns}

\end{frame}


%from file ../yano//data/029Robustness.txt
\begin{frame}[label=Robustness]
\frametitle{Robustness}
\begin{columns}[c] % align columns
\begin{column}{0.48\textwidth}%.48
\begin{itemize}

    \item Pick points with 1\% width difference in input space around each point.

    \item for each point, find the maximum difference in output space.

    \item average this difference


\end{itemize}
\end{column}%
\hfill%
    \begin{column}{0.48\textwidth}%.48
\begin{figure}[H] 
  \centering
\includegraphics[width=0.9\textwidth]{../prep/28Robustness/z_robu.pdf}
\label{fig:prep28Robustnessz_robupdf}
  \end{figure}


\end{column}%
\hfill%
\end{columns}

\end{frame}


%from file ../yano//data/030Distance Correlation.txt
\begin{frame}[label=Distance Correlation]
\frametitle{Distance Correlation}
\begin{columns}[c] % align columns
\begin{column}{0.48\textwidth}%.48
\begin{itemize}

    \item Pick random points in the input space.

    \item measure the distance in input and output space

    \item a low correlation is a good model


\end{itemize}
\end{column}%
\hfill%
    \begin{column}{0.48\textwidth}%.48
\begin{figure}[H] 
  \centering
\includegraphics[width=0.9\textwidth]{../prep/29Distance_Correlation/z_dist.pdf}
\label{fig:prep29Distance_Correlationz_distpdf}
  \end{figure}


\end{column}%
\hfill%
\end{columns}

\end{frame}


%from file ../yano//data/031Other.txt
\newpage
\section{Conclusion}\label{sec:Conclusion}
%{{{for_Conclusion}}}

\begin{frame}[label=Other]
\frametitle{Other}
\begin{itemize}

    \item Things I still want to add:

\begin{itemize}

    \item Ensemble Methods

    \item Visualisation options

    \item Alternative Evaluations

    \item Hyperparameter optimisation (with crossvalidation)

    \item Parallelisation

    \item Contamination

    \item Documentation


\end{itemize}

\end{itemize}
\end{frame}


%from file ../yano//data/032Feedback.txt
\begin{frame}[label=Feedback]
\frametitle{Feedback}
\begin{itemize}

    \item What do you think about this?

    \item Is there something I should also add?

    \item What would you need for you to actually use this?


\end{itemize}
\end{frame}


%from folder ../yano//data/Forests.txt


%from folder ../yano//data/Isolation


%from folder ../yano//data/Optimization.txt


\end{document}