Source code for iforest_for_bpms

# coding: utf-8
'''
Created on May 4, 2018
Application of Isolation Forest algorithm for the diagnostics of BPM signal.
Original paper: Liu, Fei Tony, Ting, Kai Ming and Zhou, Zhi-Hua. “Isolation forest.” Data Mining, 2008. ICDM‘08. Eighth IEEE International Conference.

@author: Elena Fol
'''
from __future__ import print_function
import os
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import mpl_toolkits.mplot3d.axes3d as p3
import pandas
import argparse
from sklearn.ensemble import IsolationForest
from utils import logging_tools
from tfs_files import tfs_pandas
from model.accelerators import lhc


LOGGER = logging_tools.get_logger(__name__)
ARCS_CONT = 0.01
IRS_CONT = 0.025
FEATURES = "TUNE{0},NOISE_SCALED,AMP{0}"
FEATURES_WITH_NAME = "NAME,TUNE{0},NOISE_SCALED,AMP{0}"
PLANE = ("x", "y")


def get_bad_bpms(files, remove_bpms):
    files_list = files.split(',')
    files_x, files_y = separate_by_plane(files_list)
    bad_dfs = {"x": [], "y": []}
    bad_bpms_to_write = {"x": None, "y": None}
    for files, plane in ((files_x, "x"), (files_y, "y")):
        bad_dfs[plane].append(get_bad_bpms_from_measurement(files, plane))
    bad_bpms_to_write["x"] = pandas.concat(bad_dfs["x"])
    bad_bpms_to_write["y"] = pandas.concat(bad_dfs["y"])
    write_bad_bpms(files_list[0], bad_bpms_to_write)
    if remove_bpms:
        remove_bpms_from_file(files_x, set(bad_bpms_to_write["x"].NAME), "x")
        remove_bpms_from_file(files_y, set(bad_bpms_to_write["y"].NAME), "y")


def write_bad_bpms(first_file, bad_bpms_to_write):
    meas_dir = os.path.abspath(os.path.join(first_file, os.pardir))
    for plane in PLANE:
        bad_bpms_summary_path = os.path.join(meas_dir, "bad_bpms_iforest_{}.tfs".format(plane))
        tfs_pandas.write_tfs(bad_bpms_summary_path, bad_bpms_to_write[plane])
    LOGGER.info("Bad BPMs summary from Isolation Forest written to: %s", meas_dir)


def get_bad_bpms_from_measurement(files, plane):
    uplane = plane.upper()
    bpm_tfs_data = _create_tfs_data(files, plane)
    arc_bpm_data, ir_bpm_data = get_data_for_clustering(bpm_tfs_data, plane)
    dataframes = []
    for data_for_clustering, cont, title in ((arc_bpm_data, ARCS_CONT, "Arcs"),
                                      (ir_bpm_data, IRS_CONT, "IRs")):
        bad_bpms, good_bpms, all_bpms_scores, bad_bpms_scores =\
             detect_anomalies(cont, data_for_clustering, uplane)
        bpm_tfs_data, data_for_clustering, bad_bpms, good_bpms =\
            [reassign_index(data)
             for data in (bpm_tfs_data, data_for_clustering, bad_bpms, good_bpms)]
        signif_feature = get_significant_features(bpm_tfs_data, data_for_clustering, bad_bpms, good_bpms, plane)
        signif_feature.loc[:, "SCORE"] = bad_bpms_scores
        dataframes.append(signif_feature)
        if plot:
            # plot_scores_threshold(all_bpms_scores, cont, title)
            plot_bpms_3d(good_bpms, bad_bpms, uplane, title, bad_bpms_scores)
#             plot_two_dim(good_bpms, bad_bpms,"TUNE", "NOISE_SCALED", uplane, title)
#             plot_two_dim(good_bpms, bad_bpms, "TUNE", "AMP", uplane, title)
#             plot_two_dim(good_bpms, bad_bpms, "AMP", "NOISE_SCALED", uplane, title)
    return pandas.concat(dataframes)


def reassign_index(data):
    data["NEW_INDEX"] = range(len(data.NAME))
    return data.set_index("NEW_INDEX")


def get_significant_features(bpm_tfs_data, data_for_clustering, bad_bpms, good_bpms, plane):
    features_df = pandas.DataFrame(index=bad_bpms.index)
    columns = FEATURES.format(plane.upper()).split(",")
    for index in bad_bpms.index:
        max_dist = max([(abs(data_for_clustering.loc[index, col] -
                             good_bpms.loc[:, col].mean()), col)
                        for col in columns])
        max_dist, sig_col = max_dist
        features_df.loc[index, "NAME"] = bad_bpms.loc[index, "NAME"]
        features_df.loc[index, "FEATURE"] = sig_col
        features_df.loc[index, "VALUE"] = bpm_tfs_data.loc[index, sig_col]
        features_df.loc[index, "AVG"] = np.mean(bpm_tfs_data.loc[good_bpms.index][sig_col])
    return features_df


def detect_anomalies(contamination, data, uplane):
    iforest = IsolationForest(n_estimators=100, max_samples='auto',
                              contamination=contamination, max_features=1.0,
                              bootstrap=False)
    features = data[FEATURES.format(uplane).split(",")]
    iforest.fit(features)
    labels = iforest.predict(features)
    bad_bpms = data.iloc[np.where(labels == -1)].copy()
    good_bpms = data.iloc[np.where(labels != -1)].copy()
    all_bpms_scores = iforest.decision_function(features)
    bad_bpms_scores = iforest.decision_function(features.iloc[np.where(labels == -1)])
    return bad_bpms, good_bpms, all_bpms_scores, bad_bpms_scores


def get_data_for_clustering(bpm_tfs_data, plane):
    columns = FEATURES.format(plane.upper()).split(",")
    arc_bpm_mask = lhc.Lhc.get_element_types_mask(bpm_tfs_data.NAME, types=["arc_bpm"])
    ir_bpm_data_for_clustering = bpm_tfs_data.iloc[~arc_bpm_mask].copy()
    arc_bpm_data_for_clustering = bpm_tfs_data.iloc[arc_bpm_mask].copy()
    for col in columns:
        ir_bpm_data_for_clustering.loc[:, col] = _normalize_parameter(ir_bpm_data_for_clustering.loc[:, col])
        arc_bpm_data_for_clustering.loc[:, col] = _normalize_parameter(arc_bpm_data_for_clustering.loc[:, col])
    return arc_bpm_data_for_clustering, ir_bpm_data_for_clustering


def _create_tfs_data(filepaths, plane):
    bpm_data_rows = []
    for filepath in filepaths:
        bpm_tfs_file = tfs_pandas.read_tfs(filepath)
        bpms_tfs_data = bpm_tfs_file[FEATURES_WITH_NAME.format(plane.upper()).split(",")]
        bpm_data_rows.append(bpms_tfs_data)
    return pandas.concat(bpm_data_rows)


def _normalize_parameter(column_data):
    return (column_data - column_data.min()) / (column_data.max() - column_data.min())


def separate_by_plane(files_list):
    files_x = []
    files_y = []
    for file_in in files_list:
        if os.path.basename(file_in).endswith("linx"):
            files_x.append(file_in)
        elif os.path.basename(file_in).endswith("liny"):
            files_y.append(file_in)
        else:
            print("Given file is not a measurement!")
    return files_x, files_y


[docs]def remove_bpms_from_file(paths, bad_bpm_names, plane): """ Writes a backup of the original .lin files (e.g .linx --> .linx.notcleaned) and removes the BPNs identified by iForest as bad. :param paths: original lin files :param bad_bpm_names: list of the names of bad BPMs identified by iForest """ for path in paths: src_dir = os.path.abspath(os.path.join(path, os.pardir)) filename = os.path.basename(path) new_filename = os.path.join(src_dir, filename + ".notcleaned") os.rename(path, new_filename) original_file_tfs = tfs_pandas.read_tfs(new_filename).set_index("NAME", drop=False) original_file_tfs = original_file_tfs.loc[~original_file_tfs.index.isin(bad_bpm_names)] pln_num = "1" if plane == "x" else "2" original_file_tfs.headers["Q{}".format(pln_num)] =\ original_file_tfs["TUNE" + plane.upper()].mean() original_file_tfs.headers["Q{}RMS".format(pln_num)] =\ np.std(original_file_tfs["TUNE" + plane.upper()]) tfs_pandas.write_tfs(path, original_file_tfs, original_file_tfs.headers)
[docs]def revert_forest_cleaning(files): """ Reverts the cleaning. The backup files are renamed back to the original names (e.g .linx.notcleaned --> .linx) :param paths: list of files, where bad bpms identified by iForest are removed """ files_list = files.split(',') for path in files_list: src_dir = os.path.abspath(os.path.join(path, os.pardir)) filename = os.path.basename(path) notcleaned_file = os.path.join(src_dir, filename + ".notcleaned") original_file_tfs = tfs_pandas.read_tfs(notcleaned_file).set_index("NAME", drop=False) os.remove(path) lin_file = os.path.join(src_dir, notcleaned_file.replace(".notcleaned","")) os.rename(notcleaned_file, lin_file) tfs_pandas.write_tfs(lin_file, original_file_tfs)
def plot_scores_threshold(scores, cont, title): threshold = stats.scoreatpercentile(scores,100*ARCS_CONT) scores_bellow_threshold = [] scores_over_threshold = [] for score in scores: if score < threshold: scores_bellow_threshold.append(score) else: scores_over_threshold.append(score) plt.hist(scores_bellow_threshold, bins=100, range=(-0.2,0.2), edgecolor='black', linewidth=2, histtype='bar', color='blue') plt.hist(scores_over_threshold, bins=100, range=(-0.2,0.2), edgecolor='black', linewidth=2, histtype='bar', color='white') plt.axvline(x=threshold, color='r', linestyle='-', linewidth=2, label='Learned Threshold') plt.text(threshold, 0, str(threshold)[:-14], color='r', fontsize=35, verticalalignment='bottom', horizontalalignment='left') plt.title(title, fontdict={'fontsize':35, 'verticalalignment':'baseline'}) plt.xlabel("Anomaly score", fontsize = 35) plt.ylabel("Number of BPMs", fontsize = 35) plt.xticks(fontsize = 35) plt.yticks(fontsize = 35) plt.legend(fontsize = 35) plt.show() def plot_bpms_3d(good_bpms, bad_bpms, plane, title, scores): columns = FEATURES_WITH_NAME.format(plane.upper()).split(",") fig = plt.figure() ax = p3.Axes3D(fig) ax.plot3D(good_bpms.loc[:, columns[1]], good_bpms.loc[:, columns[2]], good_bpms.loc[:, columns[3]], 'o', markerfacecolor="black", markeredgecolor='black', markersize=15, label = "good") ax.plot3D(bad_bpms.loc[:, columns[1]], bad_bpms.loc[:, columns[2]], bad_bpms.loc[:, columns[3]], '^', markerfacecolor="red", markeredgecolor='black', markersize=20, label = "faulty") # for index, score in zip(bad_bpms.index, scores): # ax.text(bad_bpms.loc[index, columns[1]], bad_bpms.loc[index, columns[2]], bad_bpms.loc[index, columns[3]], bad_bpms.loc[index,"NAME"] + " {" + str(score)[:-10] + "}") ax.set_xlabel('Tune', fontsize = 25, linespacing=3.2) ax.set_ylabel('Amplitude', fontsize = 25, linespacing=3.2) ax.set_zlabel('Noise', fontsize = 25, linespacing=3.2) for axis in ('x', 'y', 'z'): ax.tick_params(axis=axis, labelsize=25) plt.legend(fontsize = 25, numpoints = 1) plt.title(title, fontdict={'fontsize':25, 'verticalalignment':'baseline'}) plt.show() def plot_two_dim(good, bad, col1, col2, plane, title): label1 = "Tune" label2 = "Noise" if(col1=="AMP"): label1 = "Amplitude" col1 = col1 + plane if(col2=="AMP"): col2 = col2 + plane label2 = "Amplitude" plt.plot( good.loc[:, col1], good.loc[:, col2], 'o', markerfacecolor="black", markeredgecolor='black', markersize=10, label = "Good", ) plt.plot( bad.loc[:, col1], bad.loc[:, col2], '^', markerfacecolor="red", markeredgecolor='red', markersize=10, label = "Bad", ) plt.xlabel(label1, fontsize = 25) plt.ylabel(label2,fontsize = 25) plt.xticks(fontsize = 25) plt.yticks(fontsize = 25) plt.legend(fontsize = 25, numpoints = 1) plt.title(title) plt.show() def _parse_args(): parser = argparse.ArgumentParser() parser.add_argument( "--files", dest="files", type=str, ) parser.add_argument( "--remove_bpms", dest="remove_bpms", action="store_true", ) parser.add_argument( "--revert", dest="revert", action="store_true", ) parser.add_argument( "--plot", dest="plot", action="store_true", ) options = parser.parse_args() return options.files, options.remove_bpms, options.revert, options.plot if __name__ == '__main__': _files, _remove_bpms, _revert, plot = _parse_args() if(not _revert): get_bad_bpms(_files, _remove_bpms) else: revert_forest_cleaning(_files)