diff options
Diffstat (limited to 'preprocess')
| -rw-r--r-- | preprocess/__init__.py | 1 | ||||
| -rw-r--r-- | preprocess/standardization.py | 119 | ||||
| -rw-r--r-- | preprocess/template.py | 61 |
3 files changed, 181 insertions, 0 deletions
diff --git a/preprocess/__init__.py b/preprocess/__init__.py new file mode 100644 index 0000000..7ca1f60 --- /dev/null +++ b/preprocess/__init__.py @@ -0,0 +1 @@ +from .standardization import MyDataset
# ---------------------------------------------------------------------------
# preprocess/standardization.py
# ---------------------------------------------------------------------------
from torch.utils.data import Dataset
from torch import float32, Tensor
from numpy import array, where


class MyDataset(Dataset):
    """Sliding-window dataset built from comma-separated numeric files.

    Each file is read as rows of comma-separated numbers.  Rows are
    standardized column-wise (z-score) and cut into windows of
    ``input_size`` rows (the model input) followed by ``output_size`` rows
    (the model target).  The label of a window is the maximum label found
    in its target rows.

    After construction the instance exposes ``train_inputs``,
    ``train_labels``, ``train_outputs``, ``test_inputs``, ``test_labels``
    and ``test_outputs``; each is a ``float32`` tensor, or ``None`` when the
    corresponding path was not supplied.  ``mode`` selects which split
    ``__len__`` and ``__getitem__`` serve.
    """

    def __init__(self, name: str, train_path: str = None, test_path: str = None, input_size: int = 1,
                 output_size: int = 1, step: int = 1, mode: str = 'train', time_index: bool = True,
                 del_column_name: bool = True):
        """
        Load the csv file(s) and convert them to tensors.

        :param name: name of the dataset.
        :param train_path: path of the training file.
        :param test_path: path of the test file; its last column is the label.
        :param input_size: number of rows in each input window.
        :param output_size: number of rows in each output window.
        :param step: stride of the sliding window.
        :param mode: 'train' or 'test'; selects which split is served.
        :param time_index: True when the first column is a timestamp.
        :param del_column_name: True when the first line of the file holds column names.
        :raises ValueError: if both paths are None, if a file has no data rows,
            or if ``step`` / ``output_size`` is smaller than 1.
        """
        self.name = name
        self.input_size = input_size
        self.output_size = output_size
        self.del_column_name = del_column_name
        self.step = step
        self.mode = mode
        self.time_index = time_index
        (train_inputs, train_labels, train_outputs,
         test_inputs, test_labels, test_outputs) = self.parse_data(train_path, test_path)
        self.train_inputs = self._to_tensor(train_inputs)
        self.train_labels = self._to_tensor(train_labels)
        self.train_outputs = self._to_tensor(train_outputs)
        self.test_inputs = self._to_tensor(test_inputs)
        self.test_labels = self._to_tensor(test_labels)
        self.test_outputs = self._to_tensor(test_outputs)

    @staticmethod
    def _to_tensor(values):
        """Return ``values`` as a float32 tensor, or None when ``values`` is None."""
        if values is None:
            return None
        return Tensor(values).to(float32)

    def _read_rows(self, path: str) -> "list[list[str]]":
        """Read ``path`` and return its data rows as lists of raw string fields.

        The first line is dropped when ``self.del_column_name`` is set, and
        blank lines (e.g. a trailing empty line) are skipped.

        :raises ValueError: if the file contains no data rows.
        """
        with open(path, 'r', encoding='utf8') as f:
            lines = f.readlines()
        if self.del_column_name:
            lines = lines[1:]
        rows = [line.strip().split(',') for line in lines if line.strip()]
        if not rows:
            raise ValueError(f"no data rows found in {path!r}")
        return rows

    def parse_data(self, train_path: str = None, test_path: str = None):
        """
        Read, standardize and window the training and/or test file.

        Every column of the training file is treated as a feature and every
        training row gets label 0.  In the test file the last column is the
        label and the remaining columns are features.

        Test data is standardized with the training mean / standard deviation
        when a training file is given; otherwise the statistics of the test
        data itself are used.

        :param train_path: path of the training file, or None.
        :param test_path: path of the test file, or None.
        :return: tuple ``(train_inputs, train_labels, train_outputs,
            test_inputs, test_labels, test_outputs)`` of nested lists; the
            entries of a split that was not supplied are None.
        :raises ValueError: if both paths are None.
        """
        if train_path is None and test_path is None:
            raise ValueError("train_path is None and test_path is None.")

        mean = None
        deviation = None
        train_data_input, train_label, train_data_output = None, None, None
        test_data_input, test_label, test_data_output = None, None, None

        # Training split.
        if train_path:
            rows = self._read_rows(train_path)
            train_np = array([[float(value) for value in row] for row in rows])
            train_label = [0] * len(rows)
            if self.time_index:
                # Reduce the timestamp to seconds within a day.
                train_np[:, 0] = train_np[:, 0] % 86400
            mean = train_np.mean(axis=0)
            deviation = train_np.std(axis=0)
            # Constant columns have zero deviation; divide by 1 instead.
            deviation = where(deviation != 0, deviation, 1)
            train_np = (train_np - mean) / deviation
            train_data_input, train_data_output, train_label = self.cut_data(train_np.tolist(), train_label)

        # Test split.
        if test_path:
            rows = self._read_rows(test_path)
            test_np = array([[float(value) for value in row[:-1]] for row in rows])
            test_label = [int(float(row[-1])) for row in rows]
            if self.time_index:
                test_np[:, 0] = test_np[:, 0] % 86400
            if mean is None:
                # No training statistics available: standardize the test data
                # with its own statistics instead of failing on ``None``.
                mean = test_np.mean(axis=0)
                deviation = test_np.std(axis=0)
                deviation = where(deviation != 0, deviation, 1)
            test_np = (test_np - mean) / deviation
            # Anomalies are expected to be labelled 1.  When more than half of
            # the labels are 1 the file is assumed to use the opposite
            # convention, so the labels are flipped.
            if sum(test_label) > 0.5 * len(test_label):
                test_label = (1 - array(test_label)).tolist()
            test_data_input, test_data_output, test_label = self.cut_data(test_np.tolist(), test_label)

        return train_data_input, train_label, train_data_output, test_data_input, test_label, test_data_output

    def cut_data(self, data: "list[list[float]]", label: "list[int]"):
        """
        Cut ``data`` into sliding windows.

        A window starting at row ``n`` yields rows ``[n, n + input_size)`` as
        input and rows ``[n + input_size, n + input_size + output_size)`` as
        output.  Its label is the maximum of ``label`` over the output rows.
        Windows start at 0, ``step``, ``2 * step``, ... and only complete
        windows are produced.

        :param data: rows of feature values.
        :param label: one label per row of ``data``.
        :return: tuple ``(inputs, outputs, labels)``; ``labels`` holds one
            single-element list per window.
        :raises ValueError: if ``self.step`` or ``self.output_size`` is smaller than 1.
        """
        if self.step < 1:
            # A non-positive step would never advance the window.
            raise ValueError(f"step must be >= 1, got {self.step}")
        if self.output_size < 1:
            # The label is the max over the output rows, so at least one is needed.
            raise ValueError(f"output_size must be >= 1, got {self.output_size}")

        input_data, output_data, anomaly_label = [], [], []
        window = self.input_size + self.output_size
        for start in range(0, len(data) - window + 1, self.step):
            split = start + self.input_size
            stop = split + self.output_size
            input_data.append(data[start:split])
            output_data.append(data[split:stop])
            anomaly_label.append([max(label[split:stop])])
        return input_data, output_data, anomaly_label

    def _active_split(self):
        """Return ``(inputs, labels, outputs)`` of the split selected by ``self.mode``.

        :raises ValueError: if ``self.mode`` is neither 'train' nor 'test', or
            if the selected split was not loaded.
        """
        if self.mode == 'train':
            split = (self.train_inputs, self.train_labels, self.train_outputs)
        elif self.mode == 'test':
            split = (self.test_inputs, self.test_labels, self.test_outputs)
        else:
            raise ValueError(f"mode must be 'train' or 'test', got {self.mode!r}")
        if split[0] is None:
            raise ValueError(f"no {self.mode} data was loaded; pass {self.mode}_path to the constructor")
        return split

    def __len__(self):
        """Return the number of windows in the split selected by ``self.mode``."""
        inputs, _, _ = self._active_split()
        return inputs.shape[0]

    def __getitem__(self, idx):
        """Return ``(input, label, output)`` of window ``idx`` in the selected split."""
        inputs, labels, outputs = self._active_split()
        return inputs[idx], labels[idx], outputs[idx]


if __name__ == "__main__":
    # The first positional parameter is the dataset name, so the training
    # path has to be passed by keyword.
    app = MyDataset('SWAT', train_path='../dataset/SWAT/train.csv', test_path='../dataset/SWAT/test.csv',
                    input_size=3)
    print(app)


# ---------------------------------------------------------------------------
# preprocess/template.py
#
# Template file.  A custom preprocessing method for a dataset can be
# implemented by editing this file.  After editing, rename the file, register
# it in config.ini and add it to the __init__ file in the same directory.
# ---------------------------------------------------------------------------
from torch.utils.data import Dataset


class DataSet(Dataset):
    def __init__(self, train_path: str = None, test_path: str = None, input_size: int = 1, output_size: int = 1,
                 step: int = 1, mode: str = 'train', del_time: bool = True, del_column_name: bool = True,
                 reverse_label: bool = True):
        """
        Convert csv files to tensors.

        Note: an implementation must provide the following attributes and methods.
        Attributes: self.train_inputs, self.train_labels, self.train_outputs,
                    self.test_inputs, self.test_labels, self.test_outputs, self.mode
        Methods: __len__(), __getitem__()

        :param train_path: str. Path of the training file.
        :param test_path: str. Path of the test file.
        :param input_size: int. Length of the input data.
        :param output_size: int. Length of the output data.
        :param step: int. Stride of the sliding window.
        :param mode: str. 'train' or 'test'; selects which split is served.
        :param del_time: bool. True to delete the timestamp column, False to keep it.
        :param del_column_name: bool. True when the first line of the file holds column names.
        :param reverse_label: bool. Swap labels 0 and 1. Labels are expected to be 0 for normal
            and 1 for anomalous; use True when the source file does not follow that convention.
        """

        self.mode = mode
        # Training inputs, Tensor of shape [N, L, D]: N is the number of samples,
        # L the length of each sample (number of time points), D the number of dimensions.
        self.train_inputs = None
        self.train_labels = None  # Training labels, Tensor of shape [N, 1].
        self.train_outputs = None  # Training outputs, Tensor of shape [N, L, D].
        self.test_inputs = None  # Test inputs, Tensor of shape [N, L, D].
        self.test_labels = None  # Test labels, Tensor of shape [N, 1].
        self.test_outputs = None  # Test outputs, Tensor of shape [N, L, D].

    def __len__(self):
        """
        Return the length of the dataset.

        :return: number of samples N of the training or test split.
        :raises ValueError: if ``self.mode`` is neither 'train' nor 'test'.
        """
        if self.mode == 'train':
            return self.train_inputs.shape[0]
        elif self.mode == 'test':
            return self.test_inputs.shape[0]
        raise ValueError(f"mode must be 'train' or 'test', got {self.mode!r}")

    def __getitem__(self, idx):
        """
        Fetch one sample.

        :param idx: index of the sample.
        :return: the corresponding input, label and output.
        :raises ValueError: if ``self.mode`` is neither 'train' nor 'test'.
        """
        if self.mode == 'train':
            return self.train_inputs[idx], self.train_labels[idx], self.train_outputs[idx]
        elif self.mode == 'test':
            return self.test_inputs[idx], self.test_labels[idx], self.test_outputs[idx]
        raise ValueError(f"mode must be 'train' or 'test', got {self.mode!r}")
