#!/usr/bin/env python3
import sys
import pprint
import numpy as np
from statistics import mean
from sklearn import preprocessing
from sklearn import svm
from sklearn.ensemble import BaggingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from scipy.stats.mstats import gmean

# """ dataset schema """
# capability, workload_type, VM_size, VM_pdr, VM_wss, VM_wse, VM_nwse, VM_mwpp,
# VM_pmu_instr, VM_ptr, VM_cpu_util, VM_net_util, src_cpu_avail, dst_cpu_avail,
# src_mem_avail, dst_mem_avail, qemu_tt, qemu_dt, qemu_td, performance,
# used_cpu_src,used_mem_src

# """ model features (composed features are not included here) """
# capability: original: 0, auto-converge: 1, xbzrle: 2, compress: 3, x-postcopy-ram: 4
# workload_type: idle: 0, synthetic: 1, specweb: 2, oltp: 3, memcached: 4, dacapo: 5, parsec: 6, bzip: 7, mplayer: 8
# VM_size: VM memory size in MB
# VM_pdr: VM page dirty rate in MB
# VM_wss: VM working set size in MB
# VM_wse: entropy of working set (0.0 <= e <= 1.0)
# VM_nwse: entropy of non working set (0.0 <= e <= 1.0)
# VM_mwpp: modified words per page during an interval
# VM_pmu_instr: number of retired instructions per second
# VM_ptr: page transfer rate in MB
# VM_cpu_util: VM process CPU utilization (measured using linux top command and the maximum value is about 100%)
# VM_net_util: utilization of the VM network tap interface (100.0 - ifavail)
# src_cpu_avail: available CPU in % (maximum value is 400 in our environment e.g., 4-CPU)
# dst_cpu_avail: available CPU in % (maximum value is 400 in our environment e.g., 4-CPU)
# src_mem_avail: available memory in MB. (maximum value is 16384 in our environment e.g., 16GB of system memory)
# dst_mem_avail: available memory in MB. (maximum value is 16384 in our environment e.g., 16GB of system memory)

# """ prediction target metrics """
# qemu_tt: total migration time in ms
# qemu_dt: downtime in ms
# qemu_td: total transferred data in bytes
# performance: relative performance during migration
# used_cpu_src: used CPU in percent (theoretical maximum value is 400%)
# used_mem_src: used memory in MB


def build_features(features):
    """
    Append the composed (derived) features to a row of raw profiled values.

    ``features`` is a list indexed as follows:
        0: VM_size, 1: VM_pdr, 2: VM_wss, 3: VM_wse, 4: VM_nwse, 5: VM_mwpp,
        6: VM_pmu_instr, 7: VM_ptr, 8: VM_cpu_util, 9: VM_net_util,
        10: src_cpu_avail, 11: dst_cpu_avail,
        12: src_mem_avail, 13: dst_mem_avail

    Six values are appended to ``features`` IN PLACE, in this order:
    RPTR, VM_nwss, DLTC_benefit, THR_benefit, VM_e_wss, VM_e_nwss.
    The same (mutated) list object is returned.

    Raises ZeroDivisionError when VM_ptr is 0.
    """
    vm_size, dirty_rate, wss = features[0], features[1], features[2]
    wss_entropy, nwss_entropy = features[3], features[4]
    words_per_page = features[5]
    # The page transfer rate is capped at 125.0.
    transfer_rate = min(features[7], 125.0)
    cpu_util = features[8]

    # RPTR: working set scaled by the squared dirty/transfer ratio,
    # never exceeding the working set size itself.
    rptr = min(wss, wss * ((dirty_rate / transfer_rate)**2))

    # Non-working-set size and the entropy-weighted sizes of both sets.
    nwss = vm_size - wss
    entropy_wss = wss * wss_entropy
    entropy_nwss = nwss * nwss_entropy

    # Dirty rate weighted by the CPU utilization as a fraction of 400,
    # with the fraction capped at 1.0.
    thr_benefit = dirty_rate * min((cpu_util / 400.0), 1.0)

    # Modified words per page relative to 4096 / 2; values above 1.0 are
    # mirrored back around 1.0.
    ratio = words_per_page / (4096 / 2)
    if ratio > 1.0:
        ratio = 1.0 - (ratio - 1.0)
    dltc_benefit = wss * ratio

    # append the composed features
    features.extend([
        rptr,
        nwss,
        dltc_benefit,
        thr_benefit,
        entropy_wss,
        entropy_nwss,
    ])
    return features


def load_dataset(dataset_path):
    """
    Load the SoCC live migration dataset in csv format.

    The first line of the file is the header; every following non-blank
    line is one sample of comma-separated numeric values laid out as
    ``capability, workload_type, <raw features...>, TT, DT, TD, PERF, CPU,
    MEM`` (the last six columns are the prediction targets). Blank lines
    are skipped.

    Returns a tuple ``(schema, capabilities, workload_types, X, y, size)``:
        schema:         list of header column names
        capabilities:   capability value of every sample, in file order
        workload_types: workload type value of every sample, in file order
        X:              dict capability -> list of feature rows, each
                        extended by build_features()
        y:              dict capability -> dict metric -> list of targets
        size:           number of samples loaded

    Raises IndexError if the file is empty, ValueError if a field is not
    numeric, and KeyError if a capability is not one of 0.0 .. 4.0.
    """
    metrics = ('TT', 'DT', 'TD', 'PERF', 'CPU', 'MEM')
    capability_ids = (0.0, 1.0, 2.0, 3.0, 4.0)

    capabilities = []
    workload_types = []
    X = {cap: [] for cap in capability_ids}
    y = {cap: {metric: [] for metric in metrics} for cap in capability_ids}

    with open(dataset_path, 'r') as fp:
        lines = fp.readlines()

    schema = lines[0].strip().split(',')
    # Skip blank lines (e.g. a trailing empty line) instead of failing in
    # float('').
    rows = [[float(v) for v in line.strip().split(',')]
            for line in lines[1:] if line.strip()]

    for row in rows:
        cap = row[0]
        capabilities.append(cap)
        workload_types.append(row[1])
        # Raw features sit between the two leading id columns and the six
        # trailing target columns.
        X[cap].append(build_features(row[2:-6]))
        for metric, value in zip(metrics, row[-6:]):
            y[cap][metric].append(value)

    return schema, capabilities, workload_types, X, y, len(rows)


def _replace_zero_errors(errors):
    """
    Return a copy of ``errors`` with every non-positive entry replaced by
    the smallest positive entry (or by max(errors) if none is positive).
    """
    floor = min([max(errors)] + [e for e in errors if e > 0.0])
    return [e if e > 0.0 else floor for e in errors]


def evaluate(metric, pv, tv):
    """
    Compute geometric mean absolute errors and geometric mean relative errors

    metric: name of the target metric; for 'TT' and 'DT' a negative
            prediction is replaced by the smallest true value
    pv:     predicted values
    tv:     true values

    Samples whose true value is 0.0 are ignored. Because a geometric mean
    collapses to zero when any term is zero, exact predictions (zero
    error) are counted with the smallest non-zero error instead.

    Returns the tuple ``(gMAE, gMRE)``.

    Raises ValueError if ``tv`` is empty and statistics.StatisticsError
    (a ValueError subclass) if every true value is 0.0.
    """
    from statistics import StatisticsError

    min_val = min(tv)

    abs_err = []
    rel_err = []
    for predicted, actual in zip(pv, tv):
        if actual == 0.0:
            continue
        if metric in ('TT', 'DT') and predicted < 0.0:
            predicted = min_val
        abs_err.append(abs(predicted - actual))
        rel_err.append(abs(1 - (predicted / actual)))

    if not abs_err:
        # Same exception type the previous implementation raised here.
        raise StatisticsError(
            'evaluate requires at least one non-zero true value')

    gMAE = gmean(_replace_zero_errors(abs_err))
    gMRE = gmean(_replace_zero_errors(rel_err))

    return gMAE, gMRE


def modeling(schema, capabilities, workload_types, X, y, size, model='Bagging'):
    """
    Train the live migration model and print the result in csv format.

    For each of the five capabilities and each target metric, a regressor
    is evaluated with 10-fold cross-validated predictions on standardized
    features and targets; the predictions are transformed back to the
    original target scale and scored with evaluate().

    X:     dict capability (0.0 .. 4.0) -> list of feature rows
    y:     dict capability -> dict metric name -> list of target values
    model: 'Linear' (LinearRegression), 'SVR' (SVR with C=10.0) or
           'Bagging' (bagged ensemble of 64 such SVRs)

    schema, capabilities, workload_types and size are accepted but not
    used by this function.

    Output: a header line followed by one line per (capability, metric)
    with the columns model, capability, target_metric, gMRE, gMAE.

    Raises ValueError if ``model`` is not one of the supported names.
    """
    if model not in ('Linear', 'SVR', 'Bagging'):
        raise ValueError(
            "unknown model %r (expected 'Linear', 'SVR' or 'Bagging')"
            % (model,))

    # The header lists the same five columns that every row prints.
    print('model,capability,target_metric,gMRE,gMAE')
    translate_caps = {0: 'PRE', 1: 'THR', 2: 'DLTC', 3: 'DTC', 4: 'POST'}
    for cap_id, cap_name in translate_caps.items():
        # X and y are keyed by the float capability values.
        cap = float(cap_id)
        X_scaler = preprocessing.StandardScaler()
        X_standard = X_scaler.fit_transform(X[cap])
        for metric in ['TT', 'DT', 'TD', 'PERF', 'CPU', 'MEM']:
            y_scaler = preprocessing.StandardScaler()
            y_standard = y_scaler.fit_transform(
                np.array(y[cap][metric]).reshape(-1, 1))

            if model == 'Linear':
                regr = LinearRegression()
            elif model == 'SVR':
                regr = svm.SVR(C=10.0)
            else:  # 'Bagging'
                regr = BaggingRegressor(
                        svm.SVR(C=10.0), n_estimators=64, max_samples=0.9,
                        max_features=0.8)

            predicted = cross_val_predict(regr, X_standard, y_standard.ravel(),
                                          cv=10, n_jobs=10)
            # The scaler was fitted on a single column, so hand it a column
            # and flatten the result back to 1-D for evaluate().
            predicted = y_scaler.inverse_transform(
                np.asarray(predicted).reshape(-1, 1)).ravel()
            gMAE, gMRE = evaluate(metric, predicted, y[cap][metric])
            print('%s,%s,%s,%.3f,%.3f' % (
                model, cap_name, metric, gMRE, gMAE))


if __name__ == "__main__":
    # Dataset path: first command-line argument if given, otherwise the
    # default relative path.
    path = sys.argv[1] if len(sys.argv) > 1 else 'dataset/2017.socc.dataset.csv'

    # load_dataset() returns exactly the leading positional arguments of
    # modeling(): schema, capabilities, workload_types, X, y, size.
    dataset = load_dataset(path)
    modeling(*dataset, 'Linear')
