diff options
| author | ZHENG Yanqin <[email protected]> | 2023-05-25 07:37:53 +0000 |
|---|---|---|
| committer | ZHENG Yanqin <[email protected]> | 2023-05-25 07:37:53 +0000 |
| commit | e9896bd62bb29da00ec00a121374167ad91bfe47 (patch) | |
| tree | d94845574c8ef7473d0204d28b4efd4038035463 /evaluation/affiliation_bin/generics.py | |
| parent | fad9aa875c84b38cbb5a6010e104922b1eea7291 (diff) | |
| parent | 4c5734c624705449c6b21c4b2bc5554e7259fdba (diff) | |
readme
See merge request zyq/time_series_anomaly_detection!1
Diffstat (limited to 'evaluation/affiliation_bin/generics.py')
| -rw-r--r-- | evaluation/affiliation_bin/generics.py | 143 |
1 files changed, 143 insertions, 0 deletions
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# File: evaluation/affiliation_bin/generics.py
"""
Generic helpers for event-based evaluation of binary anomaly labels:
conversion of 0/1 vectors to events, range inference, NaN-aware
aggregation, loading of gz-compressed label files and the F1 function.
"""
from itertools import groupby
from operator import itemgetter
import math
import gzip
import glob
import os


def convert_vector_to_events(vector=(0, 1, 1, 0, 0, 1, 0)):
    """
    Convert a binary vector (indicating 1 for the anomalous instances)
    to a list of events. The events are considered as durations,
    i.e. setting 1 at index i corresponds to an anomalous interval [i, i+1).

    :param vector: a sequence of elements belonging to {0, 1}
    :return: a list of couples, each couple representing the start and stop of
    each event
    """
    positive_indexes = [idx for idx, val in enumerate(vector) if val > 0]
    events = []
    # Within a run of consecutive indexes, (position - index) is constant,
    # so grouping on that difference splits the indexes into maximal runs.
    for _, run in groupby(enumerate(positive_indexes),
                          lambda ix: ix[0] - ix[1]):
        cur_cut = [idx for _, idx in run]
        # A positive index i is considered as the interval [i, i+1),
        # so the stop is the last index of the run moved by 1.
        events.append((cur_cut[0], cur_cut[-1] + 1))
    return events


def infer_Trange(events_pred, events_gt):
    """
    Given the list of events events_pred and events_gt, get the
    smallest possible Trange corresponding to the start and stop indexes
    of the whole series.
    Trange will not influence the measure of distances, but will impact the
    measures of probabilities.

    :param events_pred: a list of couples corresponding to predicted events
    :param events_gt: a list of couples corresponding to ground truth events
    :return: a couple corresponding to the smallest range containing the events
    :raises ValueError: if events_gt is empty
    """
    if len(events_gt) == 0:
        raise ValueError('The gt events should contain at least one event')
    if len(events_pred) == 0:
        # empty prediction, base Trange only on events_gt (which is non empty)
        events_pred = events_gt

    all_events = list(events_pred) + list(events_gt)
    start = min(x[0] for x in all_events)
    stop = max(x[1] for x in all_events)
    return (start, stop)


def has_point_anomalies(events):
    """
    Checking whether events contain point anomalies, i.e.
    events starting and stopping at the same time.

    :param events: a list of couples corresponding to predicted events
    :return: True if the shortest event has a zero duration, False otherwise
    (including when events is empty)
    """
    if len(events) == 0:
        return False
    return min(x[1] - x[0] for x in events) == 0


def _sum_wo_nan(vec):
    """
    Sum of elements, ignoring math.isnan ones

    :param vec: vector of floating numbers
    :return: sum of the elements, ignoring math.isnan ones
    """
    return sum(e for e in vec if not math.isnan(e))


def _len_wo_nan(vec):
    """
    Count of elements, ignoring math.isnan ones

    :param vec: vector of floating numbers
    :return: count of the elements, ignoring math.isnan ones
    """
    return sum(1 for e in vec if not math.isnan(e))


def read_gz_data(filename='data/machinetemp_groundtruth.gz'):
    """
    Load a file compressed with gz, such that each line of the
    file is either 0 (representing a normal instance) or 1 (representing
    an anomalous instance).

    :param filename: file path to the gz compressed file
    :return: list of integers, one per line of the file
    """
    with gzip.open(filename, 'rb') as f:
        lines = f.read().splitlines()
    # int() accepts the raw bytes of each line directly.
    return [int(x) for x in lines]


def read_all_as_events():
    """
    Load the files contained in the folder `data/` and convert
    to events. The length of the series is kept.
    The convention for the file name is: `dataset_algorithm.gz`
    (a file name without any `_` raises an IndexError).

    :return: two dictionaries:
        - the first containing the list of events for each dataset and algorithm,
        - the second containing the range of the series for each dataset
    """
    datasets = dict()
    Tranges = dict()
    # glob returns paths in arbitrary order; sorting makes it deterministic
    # which file of a dataset defines its Trange.
    for filepath in sorted(glob.glob('data/*.gz')):
        vector = read_gz_data(filepath)
        events = convert_vector_to_events(vector)
        # ad hoc cut for those files
        cut_filepath = os.path.split(filepath)[1].split('_')
        data_name = cut_filepath[0]
        algo_name = cut_filepath[1].split('.')[0]
        if data_name not in datasets:
            datasets[data_name] = dict()
            # The range comes from the first file seen for this dataset.
            Tranges[data_name] = (0, len(vector))
        datasets[data_name][algo_name] = events
    return (datasets, Tranges)


def f1_func(p, r):
    """
    Compute the f1 function

    :param p: precision numeric value
    :param r: recall numeric value
    :return: f1 numeric value
    :raises ZeroDivisionError: if p + r equals 0
    """
    return 2 * p * r / (p + r)
\ No newline at end of file |
