Spaces:
Runtime error
Runtime error
| """ | |
| The interface for data preprocessing. | |
| Authors: | |
| LogPAI Team | |
| """ | |
| import pandas as pd | |
| import os | |
| import numpy as np | |
| import re | |
| from collections import Counter | |
| from scipy.special import expit | |
| from itertools import compress | |
class FeatureExtractor(object):
    """Convert log event sequences into an event-count feature matrix.

    ``fit_transform`` learns the event vocabulary (and, depending on the
    options, the idf weights and the per-column means) from training
    sequences; ``transform`` re-applies those learned parameters to new
    sequences.
    """

    def __init__(self):
        self.idf_vec = None         # idf weight per column, set when term_weighting == 'tf-idf'
        self.mean_vec = None        # 1-by-num_event column means, set when normalization == 'zero-mean'
        self.events = None          # event ids kept as feature columns (OOV column excluded)
        self.term_weighting = None  # term weighting chosen in fit_transform
        self.normalization = None   # normalization chosen in fit_transform
        self.oov = None             # whether an extra OOV column is appended

    @staticmethod
    def _count_events(X_seq):
        """Build a DataFrame of per-sequence event counts (one row per sequence)."""
        counts = [Counter(X_seq[i]) for i in range(X_seq.shape[0])]
        # Events missing from a sequence come out as NaN; they mean "zero occurrences".
        return pd.DataFrame(counts).fillna(0)

    def fit_transform(self, X_seq, term_weighting=None, normalization=None, oov=False, min_count=1):
        """ Fit and transform the data matrix

        Arguments
        ---------
            X_seq: ndarray, log sequences matrix
            term_weighting: None or `tf-idf`
            normalization: None, `zero-mean` or `sigmoid`
            oov: bool, whether to use OOV event
            min_count: int, the minimal number of sequences an event must occur in to
                keep its own column (default 1), only valid when oov=True.

        Returns
        -------
            X_new: The transformed data matrix (float ndarray). When oov=True the last
                column is the OOV column.
        """
        print('====== Transformed train data summary ======')
        self.term_weighting = term_weighting
        self.normalization = normalization
        self.oov = oov

        X_df = self._count_events(X_seq)
        self.events = X_df.columns
        # Always work on a writable float copy: the in-place sigmoid below would be
        # truncated to 0 on an integer matrix, and `.values` may be a read-only view.
        X = np.array(X_df.values, dtype=float)

        if self.oov:
            oov_vec = np.zeros(X.shape[0])
            if min_count > 1:
                # Keep only events seen in at least `min_count` sequences; for each
                # sequence, count how many distinct dropped events it contains.
                idx = np.sum(X > 0, axis=0) >= min_count
                oov_vec = np.sum(X[:, ~idx] > 0, axis=1)
                X = X[:, idx]
                self.events = np.array(X_df.columns)[idx].tolist()
            X = np.hstack([X, oov_vec.reshape(X.shape[0], 1)])

        num_instance, num_event = X.shape
        if self.term_weighting == 'tf-idf':
            df_vec = np.sum(X > 0, axis=0)
            # The small epsilon avoids a division by zero for columns that never
            # occur (e.g. an all-zero OOV column).
            self.idf_vec = np.log(num_instance / (df_vec + 1e-8))
            X = X * self.idf_vec  # broadcasts over rows
        if self.normalization == 'zero-mean':
            self.mean_vec = X.mean(axis=0).reshape(1, num_event)
            X = X - self.mean_vec
        elif self.normalization == 'sigmoid':
            # Only non-zero entries are squashed; zeros stay zero.
            nonzero = X != 0
            X[nonzero] = expit(X[nonzero])
        X_new = X

        print('Train data shape: {}-by-{}\n'.format(X_new.shape[0], X_new.shape[1]))
        return X_new

    def transform(self, X_seq):
        """ Transform the data matrix with trained parameters

        The term weighting, normalization and OOV settings stored by
        ``fit_transform`` are re-applied.

        Arguments
        ---------
            X_seq: log sequences matrix

        Returns
        -------
            X_new: The transformed data matrix (float ndarray)
            events: the event ids stored by ``fit_transform`` (the OOV column, if
                any, is not listed)
        """
        print('====== Transformed test data summary ======')
        X_df = self._count_events(X_seq)
        # Align columns with the training vocabulary in one step: events unseen in
        # this data become zero columns, events unknown to training are dropped here.
        X = np.array(X_df.reindex(columns=self.events, fill_value=0).values, dtype=float)
        if self.oov:
            # Per sequence, count the distinct events that are not in the vocabulary.
            unknown = X_df.columns.difference(self.events)
            oov_vec = np.sum(X_df[unknown].values > 0, axis=1)
            X = np.hstack([X, oov_vec.reshape(X.shape[0], 1)])

        if self.term_weighting == 'tf-idf':
            X = X * self.idf_vec  # broadcasts over rows
        if self.normalization == 'zero-mean':
            X = X - self.mean_vec
        elif self.normalization == 'sigmoid':
            nonzero = X != 0
            X[nonzero] = expit(X[nonzero])
        X_new = X

        print('Test data shape: {}-by-{}\n'.format(X_new.shape[0], X_new.shape[1]))
        return X_new, self.events