author    | Karanraj Chauhan <chauhank@bu.edu> | 2019-09-10 17:20:45 +0200
committer | Karanraj Chauhan <chauhank@bu.edu> | 2019-11-07 14:38:44 +0100
commit    | ddb80d979a5783f7af9c84e7a47e25e27ed45446
tree      | 7c4cb27a837c4ca30e9574c40ec30cac75c9d960 /src/pybind/mgr/diskprediction_local
parent    | mgr/diskprediction_local: Replaced old models and updated predictor.
mgr/diskprediction_local: Updated prediction models to use only supported python packages.
Removed unsupported Python packages from requirements.txt.
Added scikit-learn based models and removed the rgf-python based models.
Updated config.json and DiskPredictor.__preprocess accordingly.
Also added manufacturer as an argument to DiskPredictor.__preprocess.
Updated the manufacturer lookup: first check whether it is available as a
smartctl field; if not, try to infer it from the model name.
Updated the predicted class to be the prediction for the most recent day in
the given time series data.
Updated the naming convention from "preprocessor" to "scaler".
Signed-off-by: Karanraj Chauhan <kachauha@redhat.com>
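
For reference, a minimal standalone sketch of the lookup order described above (smartctl `vendor` field first, then a model-name prefix fallback). This is not the committed code — the real change is in `DiskFailurePredictor.predict` and `__get_manufacturer` in the diff below — and the sample device dict plus the trimmed prefix table are hypothetical:

```python
# Sketch of the manufacturer lookup order from the commit message:
# prefer the smartctl "vendor" field, then fall back to inferring the
# vendor from the model name prefix. Sample data below is made up.
MANUFACTURER_MODELNAME_PREFIXES = {
    "WDC": "WDC",
    "S": "Seagate",   # e.g. "ST12000NM0007"
    "ZA": "Seagate",
    "HGST": "HGST",
}

def lookup_manufacturer(device_info):
    # 1) use the smartctl field if it is present
    vendor = device_info.get("vendor")
    if vendor:
        return vendor.lower()
    # 2) otherwise infer from the model name prefix
    model_name = device_info.get("model_name", "")
    for prefix, manufacturer in MANUFACTURER_MODELNAME_PREFIXES.items():
        if model_name.startswith(prefix):
            return manufacturer.lower()
    return None  # unknown -> caller reports the "Unknown" prediction class

print(lookup_manufacturer({"model_name": "ST12000NM0007"}))  # -> "seagate"
```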
Diffstat (limited to 'src/pybind/mgr/diskprediction_local')
```
-rw-r--r--  src/pybind/mgr/diskprediction_local/models/config.json                 |   3
-rw-r--r--  src/pybind/mgr/diskprediction_local/models/hgst_predictor.joblib       | bin 1183526 -> 0 bytes
-rw-r--r--  src/pybind/mgr/diskprediction_local/models/hgst_preprocessor.joblib    | bin 1055 -> 0 bytes
-rw-r--r--  src/pybind/mgr/diskprediction_local/models/seagate_predictor.joblib    | bin 1189825 -> 37062495 bytes
-rw-r--r--  src/pybind/mgr/diskprediction_local/models/seagate_preprocessor.joblib | bin 1023 -> 0 bytes
-rw-r--r--  src/pybind/mgr/diskprediction_local/models/seagate_scaler.joblib       | bin 0 -> 1535 bytes
-rw-r--r--  src/pybind/mgr/diskprediction_local/predictor.py                       | 161
-rw-r--r--  src/pybind/mgr/diskprediction_local/requirements.txt                   |  11
8 files changed, 112 insertions, 63 deletions
```
```diff
diff --git a/src/pybind/mgr/diskprediction_local/models/config.json b/src/pybind/mgr/diskprediction_local/models/config.json
index f77cb11e06d..127d6ea4d8b 100644
--- a/src/pybind/mgr/diskprediction_local/models/config.json
+++ b/src/pybind/mgr/diskprediction_local/models/config.json
@@ -1,4 +1,3 @@
 {
-"hgst": ["user_capacity", "smart_1_normalized", "smart_1_raw", "smart_2_normalized", "smart_2_raw", "smart_3_normalized", "smart_3_raw", "smart_4_normalized", "smart_4_raw", "smart_5_normalized", "smart_5_raw", "smart_7_normalized", "smart_7_raw", "smart_8_normalized", "smart_8_raw", "smart_9_normalized", "smart_9_raw", "smart_10_normalized", "smart_10_raw", "smart_12_normalized", "smart_12_raw", "smart_22_normalized", "smart_22_raw", "smart_192_normalized", "smart_192_raw", "smart_193_normalized", "smart_193_raw", "smart_194_normalized", "smart_194_raw", "smart_196_normalized", "smart_196_raw", "smart_197_normalized", "smart_197_raw", "smart_198_normalized", "smart_198_raw", "smart_199_normalized", "smart_199_raw"],
-"seagate": ["user_capacity", "smart_1_normalized", "smart_1_raw", "smart_5_normalized", "smart_5_raw", "smart_7_normalized", "smart_7_raw", "smart_9_normalized", "smart_9_raw", "smart_10_normalized", "smart_10_raw", "smart_184_normalized", "smart_184_raw", "smart_187_normalized", "smart_187_raw", "smart_188_normalized", "smart_188_raw", "smart_189_normalized", "smart_189_raw", "smart_190_normalized", "smart_190_raw", "smart_193_normalized", "smart_193_raw", "smart_194_normalized", "smart_194_raw", "smart_197_normalized", "smart_197_raw", "smart_198_normalized", "smart_198_raw", "smart_240_normalized", "smart_240_raw", "smart_241_normalized", "smart_241_raw", "smart_242_normalized", "smart_242_raw"]
+"seagate": ["user_capacity", "smart_1_raw", "smart_5_raw", "smart_7_raw", "smart_10_raw", "smart_187_raw", "smart_188_raw", "smart_190_raw", "smart_193_raw", "smart_197_raw", "smart_198_raw", "smart_241_raw", "smart_1_normalized", "smart_5_normalized", "smart_7_normalized", "smart_10_normalized", "smart_187_normalized", "smart_188_normalized", "smart_190_normalized", "smart_193_normalized", "smart_197_normalized", "smart_198_normalized", "smart_241_normalized"]
 }
diff --git a/src/pybind/mgr/diskprediction_local/models/hgst_predictor.joblib b/src/pybind/mgr/diskprediction_local/models/hgst_predictor.joblib
deleted file mode 100644
index 9e1c51f0479..00000000000
--- a/src/pybind/mgr/diskprediction_local/models/hgst_predictor.joblib
+++ /dev/null
Binary files differ
diff --git a/src/pybind/mgr/diskprediction_local/models/hgst_preprocessor.joblib b/src/pybind/mgr/diskprediction_local/models/hgst_preprocessor.joblib
deleted file mode 100644
index 2d94963ecd4..00000000000
--- a/src/pybind/mgr/diskprediction_local/models/hgst_preprocessor.joblib
+++ /dev/null
Binary files differ
diff --git a/src/pybind/mgr/diskprediction_local/models/seagate_predictor.joblib b/src/pybind/mgr/diskprediction_local/models/seagate_predictor.joblib
index 574223e668b..ee7d420a2fb 100644
--- a/src/pybind/mgr/diskprediction_local/models/seagate_predictor.joblib
+++ b/src/pybind/mgr/diskprediction_local/models/seagate_predictor.joblib
Binary files differ
diff --git a/src/pybind/mgr/diskprediction_local/models/seagate_preprocessor.joblib b/src/pybind/mgr/diskprediction_local/models/seagate_preprocessor.joblib
deleted file mode 100644
index 34f96ab99d8..00000000000
--- a/src/pybind/mgr/diskprediction_local/models/seagate_preprocessor.joblib
+++ /dev/null
Binary files differ
diff --git a/src/pybind/mgr/diskprediction_local/models/seagate_scaler.joblib b/src/pybind/mgr/diskprediction_local/models/seagate_scaler.joblib
new file mode 100644
index 00000000000..0b769b983f4
--- /dev/null
+++ b/src/pybind/mgr/diskprediction_local/models/seagate_scaler.joblib
Binary files differ
diff --git a/src/pybind/mgr/diskprediction_local/predictor.py b/src/pybind/mgr/diskprediction_local/predictor.py
index 3ddd9346662..5464b2ec429 100644
--- a/src/pybind/mgr/diskprediction_local/predictor.py
+++ b/src/pybind/mgr/diskprediction_local/predictor.py
@@ -38,14 +38,23 @@ class DiskFailurePredictor(object):
     This class implements a disk failure prediction module.
     """
+
     # json with manufacturer names as keys
     # and features used for prediction as values
     CONFIG_FILE = "config.json"
 
-    PREDICTION_CLASSES = {-1: "Unknown",
-                          0: "Good",
-                          1: "Warning",
-                          2: "Bad"}
-
+    PREDICTION_CLASSES = {-1: "Unknown", 0: "Good", 1: "Warning", 2: "Bad"}
+
+    # model name prefixes to identify vendor
+    MANUFACTURER_MODELNAME_PREFIXES = {
+        "WDC": "WDC",
+        "Toshiba": "Toshiba",  # for cases like "Toshiba xxx"
+        "TOSHIBA": "Toshiba",  # for cases like "TOSHIBA xxx"
+        "toshiba": "Toshiba",  # for cases like "toshiba xxx"
+        "S": "Seagate",  # for cases like "STxxxx" and "Seagate BarraCuda ZAxxx"
+        "ZA": "Seagate",  # for cases like "ZAxxxx"
+        "Hitachi": "Hitachi",
+        "HGST": "HGST",
+    }
 
     def __init__(self):
         """
@@ -54,7 +63,6 @@ class DiskFailurePredictor(object):
         """
         self.model_dirpath = ""
         self.model_context = {}
-
 
     def initialize(self, model_dirpath):
         """Initialize all models. Save paths of all trained model files to list
@@ -73,18 +81,17 @@ class DiskFailurePredictor(object):
            self.model_context = json.load(f_conf)
 
         # ensure all manufacturers whose context is defined in config file
-        # have models and preprocessors saved inside model_dirpath
+        # have models and scalers saved inside model_dirpath
         for manufacturer in self.model_context:
-            preprocessor_path = os.path.join(model_dirpath, manufacturer + '_preprocessor.joblib')
-            if not os.path.isfile(preprocessor_path):
-                return "Missing preprocessor file: {}".format(preprocessor_path)
-            model_path = os.path.join(model_dirpath, manufacturer + '_predictor.joblib')
+            scaler_path = os.path.join(model_dirpath, manufacturer + "_scaler.joblib")
+            if not os.path.isfile(scaler_path):
+                return "Missing scaler file: {}".format(scaler_path)
+            model_path = os.path.join(model_dirpath, manufacturer + "_predictor.joblib")
             if not os.path.isfile(model_path):
                 return "Missing model file: {}".format(model_path)
 
         self.model_dirpath = model_dirpath
 
-
     def __format_raw_data(self, disk_days):
         """Massages the input raw data into a form that can be used by the
         predictor for preprocessing, feeding to model, etc. Specifically,
@@ -103,53 +110,87 @@ class DiskFailurePredictor(object):
         df = pd.DataFrame(disk_days)
 
         # change from dict type {'bytes': 123} to just float64 type 123
-        df['user_capacity'] = df['user_capacity'].apply(lambda x: x['bytes'])
+        df["user_capacity"] = df["user_capacity"].apply(lambda x: x["bytes"])
 
         # change from dict type {'table': [{}, {}, {}]} to list type [{}, {}, {}]
-        df['ata_smart_attributes'] = df['ata_smart_attributes'].apply(lambda x: x['table'])
+        df["ata_smart_attributes"] = df["ata_smart_attributes"].apply(
+            lambda x: x["table"]
+        )
 
         # make a separate column for raw and normalized values of each smart id
         for day_idx in range(len(disk_days)):
-            for attr_dict in df.iloc[0]['ata_smart_attributes']:
-                smart_id = attr_dict['id']
-                df.at[day_idx, 'smart_{}_raw'.format(smart_id)] = int(attr_dict['raw']['value'])
-                df.at[day_idx, 'smart_{}_normalized'.format(smart_id)] = int(attr_dict['value'])
+            for attr_dict in df.iloc[0]["ata_smart_attributes"]:
+                smart_id = attr_dict["id"]
+                df.at[day_idx, "smart_{}_raw".format(smart_id)] = int(
+                    attr_dict["raw"]["value"]
+                )
+                df.at[day_idx, "smart_{}_normalized".format(smart_id)] = int(
+                    attr_dict["value"]
+                )
 
         # drop the now-redundant column
-        df = df.drop('ata_smart_attributes', axis=1)
+        df = df.drop("ata_smart_attributes", axis=1)
 
         return df
 
-
-    def __preprocess(self, disk_days_df):
+    def __preprocess(self, disk_days_df, manufacturer):
         """Scales and transforms input dataframe to feed it to prediction model
 
         Arguments:
             disk_days_df {pandas.DataFrame} -- df where each row holds drive features
                                                from one day.
+            manufacturer {str} -- manufacturer of the hard drive
 
         Returns:
             numpy.ndarray -- (n, d) shaped array of n days worth of data and d
                              features, scaled
         """
-        # preprocessing may vary across manufactueres. so get manufacturer
-        manufacturer = DiskFailurePredictor.__get_manufacturer(disk_days_df['model_name'].iloc[0]).lower()
-
-        # keep only the features used for prediction for current manufacturer
+        # get the attributes that were used to train model for current manufacturer
         try:
-            disk_days_df = disk_days_df[self.model_context[manufacturer]]
+            model_smart_attr = self.model_context[manufacturer]
         except KeyError as e:
-            # TODO: change to log.error
-            print("Either SMART attributes mismatch for hard drive and prediction model,\
-                or 'model_name' not available in input data")
-            print(e)
+            print("No context (SMART attributes on which model has been trained) found for manufacturer: {}"\
+                .format(manufacturer)
+            )
             return None
 
-        # scale raw data
-        preprocessor_path = os.path.join(self.model_dirpath, manufacturer + '_preprocessor.joblib')
-        preprocessor = joblib.load(preprocessor_path)
-        disk_days_df = preprocessor.transform(disk_days_df)
-        return disk_days_df
+        # keep only the required features
+        try:
+            disk_days_df = disk_days_df[model_smart_attr]
+        except KeyError as e:
+            print("Mismatch in SMART attributes used to train model and SMART attributes available")
+            return None
+        # featurize n (6 to 12) days data - mean,std,coefficient of variation
+        # current model is trained on 6 days of data because that is what will be
+        # available at runtime
+        # NOTE: ensure unique indices so that features can be merged w/ pandas errors
+        disk_days_df = disk_days_df.reset_index(drop=True)
+        means = disk_days_df.drop("user_capacity", axis=1).rolling(6).mean()
+        stds = disk_days_df.drop("user_capacity", axis=1).rolling(6).std()
+        cvs = stds.divide(means, fill_value=0)
+
+        # rename and combine features into one df
+        means = means.rename(columns={col: "mean_" + col for col in means.columns})
+        stds = stds.rename(columns={col: "std_" + col for col in stds.columns})
+        cvs = cvs.rename(columns={col: "cv_" + col for col in cvs.columns})
+        featurized_df = means.merge(stds, left_index=True, right_index=True)
+        featurized_df = featurized_df.merge(cvs, left_index=True, right_index=True)
+
+        # drop rows where all features (mean,std,cv) are nans
+        featurized_df = featurized_df.dropna(how="all")
+
+        # fill nans created by cv calculation
+        featurized_df = featurized_df.fillna(0)
+
+        # capacity is not a feature that varies over time
+        # FIXME: will this values roll over
+        featurized_df["user_capacity"] = disk_days_df["user_capacity"]
+
+        # scale features
+        scaler_path = os.path.join(self.model_dirpath, manufacturer + "_scaler.joblib")
+        scaler = joblib.load(scaler_path)
+        featurized_df = scaler.transform(featurized_df)
+        return featurized_df
 
     @staticmethod
     def __get_manufacturer(model_name):
@@ -161,35 +202,45 @@ class DiskFailurePredictor(object):
         Returns:
             str -- manufacturer name
         """
-        if model_name.startswith("W"):
-            return "WDC"
-        elif model_name.startswith("T"):
-            return "Toshiba"
-        elif model_name.startswith("S"):
-            return "Seagate"
-        elif model_name.startswith("Hi"):
-            return "Hitachi"
-        else:
-            return "HGST"
-
+        for prefix, manufacturer in DiskFailurePredictor.MANUFACTURER_MODELNAME_PREFIXES.items():
+            if model_name.startswith(prefix):
+                return manufacturer
+        # print error message
+        print("Could not infer manufacturer from model name {}".format(model_name))
 
     def predict(self, disk_days):
         # massage data into a format that can be fed to models
         raw_df = self.__format_raw_data(disk_days)
 
-        # preprocess
-        preprocessed_data = self.__preprocess(raw_df)
+        # get manufacturer preferably as a smartctl attribute
+        # if not available then infer using model name
+        try:
+            manufacturer = raw_df["vendor"].iloc[0]
+        except KeyError as e:
+            print('"vendor" field not found in smartctl output. Will try to infer manufacturer from model name.')
+            manufacturer = DiskFailurePredictor.__get_manufacturer(raw_df["model_name"].iloc[0]).lower()
+
+        # print error message, return Unknown, and continue execution
+        if manufacturer is None:
+            print(
+                "Manufacturer could not be determiend. This may be because \
+                DiskPredictor has never encountered this manufacturer before, \
+                or the model name is not according to the manufacturer's \
+                naming conventions known to DiskPredictor"
+            )
+            return DiskFailurePredictor.PREDICTION_CLASSES[-1]
+
+        # preprocess for feeding to model
+        preprocessed_data = self.__preprocess(raw_df, manufacturer)
         if preprocessed_data is None:
             return DiskFailurePredictor.PREDICTION_CLASSES[-1]
 
         # get model for current manufacturer
-        manufacturer = self.__get_manufacturer(raw_df['model_name'].iloc[0]).lower()
-        model_path = os.path.join(self.model_dirpath, manufacturer + '_predictor.joblib')
+        model_path = os.path.join(
+            self.model_dirpath, manufacturer + "_predictor.joblib"
+        )
         model = joblib.load(model_path)
 
-        # predictions for each day
-        preds = model.predict(preprocessed_data)
-
-        # use majority vote to decide class. raise if a nan prediction exists
-        pred_class_id = stats.mode(preds, nan_policy='raise').mode[0]
+        # use prediction for last day
+        pred_class_id = model.predict(preprocessed_data)[-1]
         return DiskFailurePredictor.PREDICTION_CLASSES[pred_class_id]
diff --git a/src/pybind/mgr/diskprediction_local/requirements.txt b/src/pybind/mgr/diskprediction_local/requirements.txt
index 8769b42e601..4bfcec0fcc9 100644
--- a/src/pybind/mgr/diskprediction_local/requirements.txt
+++ b/src/pybind/mgr/diskprediction_local/requirements.txt
@@ -1,6 +1,5 @@
-numpy==1.16.4
-scipy==1.2.1
-pandas==0.25.0
-joblib==0.13.2
-scikit-learn==0.21.2
-rgf-python==3.6.0
+numpy==1.15.1
+scipy==1.1.0
+pandas==0.23.4
+joblib==0.11
+scikit-learn==0.19.2
```
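
For context, a rough usage sketch of the updated module, assuming `predictor.py` is importable and a `models` directory containing `config.json`, `seagate_scaler.joblib`, and `seagate_predictor.joblib` is available; the smartctl-style field values below are invented, not real drive data:

```python
from predictor import DiskFailurePredictor  # assuming predictor.py is on the path

# One dict per day of smartctl-style output; values here are made up.
disk_days = [
    {
        "vendor": "seagate",
        "model_name": "ST12000NM0007",             # hypothetical drive
        "user_capacity": {"bytes": 12000138625024},
        "ata_smart_attributes": {
            "table": [
                {"id": 1, "value": 83, "raw": {"value": 213403934}},
                {"id": 5, "value": 100, "raw": {"value": 0}},
                # ... remaining SMART attributes listed in config.json
            ]
        },
    },
    # ... at least 6 consecutive days, per the rolling(6) featurization above
]

predictor = DiskFailurePredictor()
# directory holding config.json, seagate_scaler.joblib, seagate_predictor.joblib
err = predictor.initialize("models")
if err is None:
    # returns "Good", "Warning", "Bad", or "Unknown"
    print(predictor.predict(disk_days))
else:
    print(err)
```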