Plotting MSE correlation with histograms and grouping by parameter type gives wrong colors
The renderer appears to have trouble coloring the histograms correctly. With the matplotlib backend, the colors are always wrong. I use the following code as setup:
from glob import glob
import hvplot.pandas
import holoviews as hv
from multiprocessing import Pool
import numpy as np
import pandas as pd
from progressbar import progressbar as pb
import xarray as xr
from scripts.Deriv_dask import Deriv_dask
from scripts.latexify import in_params_dic, physical_params, in_params_descr_dic
from scripts.latexify import in_params_grouping
def d_unnamed(df):
    """Return ``df`` without the columns whose label starts with ``Unnamed``.

    NOTE(review): these are presumably the ``Unnamed: N`` columns produced
    when a CSV that was saved with its index is read back -- confirm.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The columns of ``df`` whose label does not start with ``Unnamed``.
    """
    # Cast the labels to str first: with non-string labels the ``.str``
    # accessor yields NaN entries and the boolean negation below fails.
    is_unnamed = df.columns.astype(str).str.startswith("Unnamed")
    return df.loc[:, ~is_unnamed]
def load_and_append(name_list):
    """Read every CSV file in ``name_list`` and stack the frames row-wise.

    Each file is read with ``pd.read_csv`` and passed through ``d_unnamed``
    before being concatenated. Files that cannot be opened or parsed are
    skipped with a warning, so loading stays best-effort.

    Parameters
    ----------
    name_list : iterable of str
        Paths of the CSV files to load.

    Returns
    -------
    pandas.DataFrame or None
        The concatenated data (original row indices are kept), or ``None``
        if no file could be loaded.
    """
    import warnings  # local import: only needed to report skipped files

    frames = []
    for name in name_list:
        try:
            frame = pd.read_csv(name)
        except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError) as err:
            # Report the skipped file instead of silently swallowing every
            # error, which previously also hid genuine programming errors.
            warnings.warn("Skipping '{}': {}".format(name, err), stacklevel=2)
            continue
        frames.append(d_unnamed(frame))
    if not frames:
        return None
    # ``DataFrame.append`` was removed in pandas 2.0; ``pd.concat`` is the
    # supported replacement and also avoids re-copying on every iteration.
    return pd.concat(frames)
def reduce_df(df, error_key, sens_kind="max", error_kind="max"):
    """Reduce sensitivities and errors per (output, parameter, ratio) group.

    Rows are grouped by ``"Output Parameter"``, ``"Perturbed Parameter"`` and
    ``"Ratio Type"``. When both kinds are equal, every remaining column of
    ``df`` is reduced with that aggregation. Otherwise only the
    ``"Sensitivity"`` column (reduced with ``sens_kind``) and the
    ``error_key`` column (reduced with ``error_kind``) are kept and joined.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the three grouping columns, ``"Sensitivity"`` and
        ``error_key``. It is not modified.
    error_key : str
        Name of the error column to reduce.
    sens_kind : {"sum", "max", "mean"}
        Aggregation for the sensitivity. For ``"max"`` the absolute value of
        the sensitivity is taken before reducing.
    error_kind : {"sum", "max", "mean"}
        Aggregation for the error column.

    Returns
    -------
    pandas.DataFrame
        One row per group, with the grouping keys as regular columns.

    Raises
    ------
    ValueError
        If ``sens_kind`` or ``error_kind`` is not a supported aggregation.
    """
    group_keys = ["Output Parameter", "Perturbed Parameter", "Ratio Type"]
    supported = ("sum", "max", "mean")
    for arg_name, value in (("sens_kind", sens_kind), ("error_kind", error_kind)):
        if value not in supported:
            # Previously an unknown kind fell through every branch and ended
            # in an UnboundLocalError at the final merge.
            raise ValueError(
                "{} must be one of {}, got {!r}".format(arg_name, supported, value))

    if sens_kind == "max":
        # The maximum is taken over magnitudes; work on a copy so that the
        # caller's frame keeps its signed sensitivities.
        sens_source = df.copy()
        sens_source["Sensitivity"] = np.abs(sens_source["Sensitivity"])
    else:
        sens_source = df

    if sens_kind == error_kind:
        # Same aggregation for everything: reduce all columns in one pass.
        return sens_source.groupby(group_keys).agg(sens_kind).reset_index()

    sens_df = sens_source[group_keys + ["Sensitivity"]].groupby(group_keys).agg(sens_kind)
    err_df = df[group_keys + [error_key]].groupby(group_keys).agg(error_kind)
    return pd.merge(sens_df, err_df, how="left", left_index=True, right_index=True).reset_index()
def load_and_plot(kind, sens_kind, error_kind, plot_types, out_params, ratio_type):
    """Load the error statistics for ``kind`` and return the reduced table.

    The four ``stats_full/<kind>_adjusted_conv_*.csv`` files are loaded with
    ``load_and_append``, parameters described as unused or untracked in
    ``in_params_descr_dic`` are dropped, and the rest is reduced with
    ``reduce_df``. Despite its name, this function does not plot anything.

    Parameters
    ----------
    kind : {"mse", "maxse", "nozeromse", "sum", "me", "mae"}
        Selects both the input files and the error column.
    sens_kind, error_kind : str
        Forwarded to ``reduce_df``.
    plot_types : bool
        If true, add a ``"Group"`` column that holds, for each perturbed
        parameter, the first key of ``in_params_grouping`` containing it.
    out_params : object
        Unused; kept so that existing callers keep working.
    ratio_type : str or None
        If not ``None``, keep only rows with this ``"Ratio Type"``.

    Returns
    -------
    pandas.DataFrame
        Reduced rows with a non-zero ``"Sensitivity"``, optionally restricted
        to ``ratio_type``.

    Raises
    ------
    ValueError
        If ``kind`` is not one of the supported values.
    FileNotFoundError
        If none of the statistics files could be loaded.
    KeyError
        If ``plot_types`` is true and a remaining parameter belongs to no
        group in ``in_params_grouping``.
    """
    error_keys = {
        "mse": "MSE",
        "maxse": "Max Error",
        "nozeromse": "MSE (no zero)",
        "sum": "Cumulative Squared Error",
        "me": "Mean Error",
        "mae": "Mean Absolute Error",
    }
    if kind not in error_keys:
        # Previously an unknown kind left ``error_key`` unbound and failed
        # later with an UnboundLocalError.
        raise ValueError(
            "kind must be one of {}, got {!r}".format(sorted(error_keys), kind))
    error_key = error_keys[kind]

    all_df = load_and_append([
        "stats_full/" + kind + "_adjusted_conv_" + suffix + ".csv"
        for suffix in ("400_0", "600_0", "600_2", "600_3")
    ])
    if all_df is None:
        raise FileNotFoundError(
            "None of the stats_full/{}_adjusted_conv_*.csv files could be loaded".format(kind))

    in_params = np.unique(all_df["Perturbed Parameter"])
    # Clean up the data. Some parameters might have slipped through that
    # are not used or not tracked with AD.
    del_list = [
        in_p for in_p in in_params
        if ("Not used" == in_params_descr_dic[in_p]
            or "Is not tracked with AD" in in_params_descr_dic[in_p])
    ]
    all_df = all_df[~all_df["Perturbed Parameter"].isin(del_list)]

    reduced_df = reduce_df(all_df, error_key,
                           sens_kind=sens_kind, error_kind=error_kind)

    if plot_types:
        # We need to add a column 'Group' to plot it correctly.
        # The first matching group wins, in iteration order of the grouping.
        param_to_group = {}
        for in_p in in_params:
            for g in in_params_grouping:
                if in_p in in_params_grouping[g]:
                    param_to_group[in_p] = g
                    break
        reduced_df["Group"] = [
            param_to_group[in_p] for in_p in reduced_df["Perturbed Parameter"]
        ]

    # NOTE(review): the instance is never used here. The call is kept in case
    # the constructor has side effects (e.g. selecting the plotting backend)
    # -- confirm and remove if it has none.
    Deriv_dask(
        direc="",
        parquet=False,
        netcdf=True,
        columns=None,
        backend="bokeh",
        file_ending="")

    tmp_df = reduced_df.loc[reduced_df["Sensitivity"] != 0]
    if ratio_type is not None:
        # Filter the already zero-filtered frame. Filtering ``reduced_df``
        # here would discard the non-zero sensitivity filter above.
        tmp_df = tmp_df.loc[tmp_df["Ratio Type"] == ratio_type]
    return tmp_df
# Load the mean MSE / mean sensitivity table for the "adjusted" ratio type and
# move both plotted quantities to a log10 scale of their magnitudes.
error_key = "MSE"
mse_df = load_and_plot('mse', 'mean', 'mean', True, ["QV"], "adjusted")
for column in ("Sensitivity", error_key):
    # Take the absolute value first so that log10 is defined for negative entries.
    mse_df[column] = np.log10(np.abs(mse_df[column]))
Using holoviews like this gives correct colors:
# Deriv_dask instance created with the matplotlib backend; below it is only
# used to look up the color assigned to each parameter group.
mean_traj = Deriv_dask(
    direc="",
    parquet=False,
    netcdf=True,
    columns=None,
    backend="matplotlib",
    file_ending="")

# Both containers must exist before the loop fills them; without these two
# lines the snippet fails with a NameError.
colors = {}
cmap_values = []
for group in np.unique(mse_df["Group"]):
    colors[group] = mean_traj.cmap_types[group]
    cmap_values.append(mean_traj.cmap_types[group])

# One histogram of the (log-scaled) sensitivity per group, colored with the
# list collected above (ordered like np.unique's sorted group names).
by_col = "Group"
mse_plot = mse_df.hvplot.hist(
    y="Sensitivity",
    by=by_col,
    alpha=1,
    legend=True,
    grid=True,
    title="Test colors",
    c=cmap_values).opts(
        aspect=16/10,
        fontscale=2).options(xlabel="")
# Bare expression so that a notebook cell displays the plot.
mse_plot
Saving this plot to disk (and reloading the image) shows the wrong colors:
from IPython.display import Image, display

# Render the plot to a 300 dpi PNG with the matplotlib renderer ...
matplotlib_renderer = hv.Store.renderers['matplotlib']
renderer = matplotlib_renderer.instance(fig='png', dpi=300)
renderer.save(mse_plot, "test")

# ... and show the file that was written to disk.
saved_image = Image("test.png", width=600)
display(saved_image)
Changing the backend from `matplotlib` to `bokeh` gives the correct colors.