#!/usr/bin/env python
# coding: utf-8

# In[1]:

# import external modules
import h5py
import time
import math
import cv2
import os
import os.path
import shutil
import pylatex
import random
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors
import scipy.constants as constants
import seaborn as sn
import pandas as pd
from IPython.core.display import display, HTML
from shutil import copyfile
from os import path
from datetime import datetime # for saving figures and files with the current date/time in the file name
from scipy.interpolate import griddata
from mpl_toolkits.axes_grid1.inset_locator import (inset_axes, InsetPosition, mark_inset) # create a small inset plot inside another plot; axes_grid1 replaces the deprecated axes_grid
from mpl_toolkits.axes_grid1 import make_axes_locatable
from mpl_toolkits.axes_grid1.colorbar import colorbar
from matplotlib.colors import ListedColormap, LinearSegmentedColormap
from matplotlib.patches import Patch
from matplotlib.ticker import (MultipleLocator, FormatStrFormatter, AutoMinorLocator, AutoLocator)
from sklearn.preprocessing import MinMaxScaler, minmax_scale
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score
from scipy.optimize import curve_fit
from silx.io.dictdump import dicttoh5 # save computationally expensive dictionaries as h5 files
from PIL import Image, ImageOps

get_ipython().run_line_magic('matplotlib', 'inline')
# In[2]:

# change the width of the Jupyter notebook
def displaywidth(width):
    display(HTML("<style>.container { width:" + str(width) + "% !important; }</style>"))

displaywidth(100)
# In[2]:

def load_h5(scan, datatype, show_time=False):
    '''This function loads only a few selected h5 files, but the definition should suffice for all the files I might need throughout the work on my bachelor's thesis.'''
    try:
        if datatype == "positions":
            fpath = "/asap3/petra3/gpfs/p06/2018/data/11005475/processed/0016_EMPA_MSH_003c/scan_00" + str(scan) + "/positions.h5"
        else:
            fpath = "/asap3/petra3/gpfs/p06/2018/data/11005475/processed/0016_EMPA_MSH_003c/scan_00" + str(scan) + "/data/" + datatype + ".h5"
        h5_file = h5py.File(fpath, "r")
        return h5_file
    except Exception:
        print("You entered '" + str(scan) + "' for the argument 'scan'. The function only takes an integer as its first argument. Its value must lie between 178 and 192, except for 184, because that scan was aborted.\nThe second argument must be a string naming the datatype, e.g. 'positions' or 'fluo_roi'.")
# In[3]:

def counts2amps(scan, nA=True, solar_cell_channel_only=False, select_channel=None):
    '''This function takes the integer scan number as an argument, looks for the counter data inside of the h5 file,
    converts all the counts into XBIC in (nA) and returns the values as an ndarray.'''
    # load counts and turn them into a 2d array
    if scan in [179,187,188,189,190]: # XBIC measurements without lock-in amplification
        loaded_counts = load_h5(scan, "counter")["/data/solar_cell_0-10V"]
        counts_as_array = np.array(loaded_counts)
        print("This is the direct signal, because the chopper wasn't used for this scan!")
    else:
        loaded_counts = load_h5(scan, "counter")["/data/lock_in_R_0-10V"]
        counts_as_array = np.array(loaded_counts).astype(float) # this integer array must be converted to float, otherwise np.reciprocal would return only zeros
    if solar_cell_channel_only:
        if scan in [178,179,180,181,183,185,188,191,192]:
            loaded_counts = load_h5(scan, "counter")["/data/solar_cell_0-10V"]
            counts_as_array = np.array(loaded_counts)
        if scan in [182,186]:
            loaded_counts = load_h5(scan, "counter")["/data/solar_cell_-10-0V"]
            counts_as_array = np.array(loaded_counts)*(-1)
        if scan in [187,189,190]:
            counts_as_array = np.zeros(40200)
    ##### for plotting all channels ####
    if select_channel == "negative" and solar_cell_channel_only:
        loaded_counts = load_h5(scan, "counter")["/data/solar_cell_-10-0V"]
        counts_as_array = np.array(loaded_counts)*(-1)
    if select_channel == "positive" and solar_cell_channel_only:
        loaded_counts = load_h5(scan, "counter")["/data/solar_cell_0-10V"]
        counts_as_array = np.array(loaded_counts)
    if select_channel == "negative" and not solar_cell_channel_only:
        loaded_counts = load_h5(scan, "counter")["/data/lock-in_-10-0V"]
        counts_as_array = np.array(loaded_counts)
    if select_channel == "positive" and not solar_cell_channel_only:
        loaded_counts = load_h5(scan, "counter")["/data/lock_in_0-10V"]
        counts_as_array = np.array(loaded_counts)
    if select_channel == "R" and not solar_cell_channel_only:
        pass # the lock-in R channel is already loaded above
    ####################################
    # load photon counts
    loaded_photon_counts = load_h5(scan, "counter")["/data/qbpm_micro_sum"]
    photon_counts_as_array = np.array(loaded_photon_counts).astype(float) # this integer array must be converted to float, otherwise np.reciprocal would return only zeros
    # load exposure times and turn them into a 2d array
    loaded_exposure_times = load_h5(scan, "counter")["/data/exposure_time"]
    exposure_times_as_array = np.array(loaded_exposure_times)
    # normalize counts and photons to one second
    c_rate = counts_as_array/exposure_times_as_array
    ph_rate = photon_counts_as_array/exposure_times_as_array
    # normalize counts to the average photon rate and one second
    f_DAQ = c_rate * np.mean(ph_rate)/ph_rate # multiply each LIA count by the ratio of the average photon count to the respective photon count (e.g. twice as many photons as the average should be normalized to half of the XBIC current)
    # calculate the conversion factor
    if scan in [178,179,180,181,182,183,185,186,191,192]: # these scans have a sensitivity of 1 μA/V
        A_PA = 10**6 # "A_PA" = 1/sensitivity: preamplifier amplification factor in V/A; values taken from LogbuchStrahlzeitPetraIII31.10.-6.11.2018.pptx, slide 54f.
    if scan in [187,188,189,190]: # these scans have a sensitivity of 100 mA/V
        A_PA = 10
    A_LIA = 1 # "A_LIA" = 1/scaling = lock-in amplifier amplification factor in V/V; values also taken from the pptx logbook
    V2f = 10**6
    Wff = math.sqrt(2) # the output signal (here a sine signal) of the LIA is the RMS amplitude, but we are interested in the peak-to-peak value (see the jove paper, p. 4)
    H_conv = 2*Wff/(V2f*A_LIA*A_PA)
    # finally multiply the conversion factor with the normalized count of each pixel
    if not nA:
        S_XBIC = f_DAQ * H_conv         # in A
    else:
        S_XBIC = f_DAQ * H_conv * 10**9 # in nA (previously both assignments ran, so the A branch was always overwritten)
    return S_XBIC
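# Worked check (hedged) of the conversion factor for the 1 μA/V scans, using the values above:
# H_conv = 2*Wff/(V2f*A_LIA*A_PA) = 2*sqrt(2)/(1e6 * 1 * 1e6) ≈ 2.83e-12,
# so a photon-normalized rate of 1e6 counts/s maps to ≈ 2.83e-6 A ≈ 2.83e3 nA of XBIC.
print(2*math.sqrt(2)/(10**6 * 1 * 10**6))  # ≈ 2.83e-12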
# In[462]:

def concat_images(image_paths, size, shape=None, return_only=False, savepng=False):
    # Open images and resize them
    width, height = size
    images = map(Image.open, image_paths)
    images = [ImageOps.fit(image, size, Image.ANTIALIAS)
              for image in images]
    # Create a canvas for the final image with the total size
    shape = shape if shape else (1, len(images))
    image_size = (width * shape[1], height * shape[0])
    image = Image.new('RGB', image_size)
    # Paste the images into the final image
    for row in range(shape[0]):
        for col in range(shape[1]):
            offset = width * col, height * row
            idx = row * shape[1] + col
            image.paste(images[idx], offset)
    if return_only:
        return image
    if savepng:
        # save the already assembled image grid with a timestamped file name
        # (previously this branch rebuilt the grid via a recursive call and discarded it)
        now = datetime.now()
        dt_string = now.strftime("%d-%m-%Y_%H_%M_%S")
        image.save(f"image_{dt_string}.png", "PNG")

# Get the list of image paths
folder = "/asap3/petra3/gpfs/p06/2018/data/11005475/scratch_cc/Jan/savefig/plotXBIC_singlecell_new"
image_paths = [os.path.join(folder, f)
               for f in sorted(os.listdir(folder)) if f.endswith('.png')]
image_array = image_paths
concat_images(image_array, (400, 400), shape=(5,9), savepng=True)
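# Note (assumption made explicit): the tiles are pasted row-major, so image_paths must hold at
# least shape[0]*shape[1] = 45 files here; fewer would raise an IndexError at images[idx].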
# In[4]:

def supergrid(yvalues, zvalues, currents, gridres):
    """This function creates a rectangular grid where both axes have the given length. The given datapoints, whose
    locations are characterized by tuples of y- and z-values, are interpolated such that every point in the grid gets
    assigned a value according to the nearest-neighbor method.
    ARGUMENTS:
    yvalues (1D-array): Corresponding y-values for every measurement
    zvalues (1D-array): Corresponding z-values for every measurement
    currents (1D-array): All currents measured at the respective y- and z-coordinates
    gridres (float): Resolution of the grid insofar as it is the length of both axes of the grid in pixels
    RETURNS:
    (yvalues_new, zvalues_new, currents_supergrid) (Tuple of three 2D-Arrays): three grids with the corresponding
    y-values, z-values and currents at every single point in the grid.
    """
    if np.size(yvalues) != np.size(zvalues):
        raise Exception("Arrays 'yvalues' and 'zvalues' must have the same size.")
    if np.size(yvalues) - np.size(currents) == -1:   # This paragraph catches an error that is raised when
        currents = currents[:np.size(currents) - 1]  # dealing with the XRF-scan XBIC data. It appears that
    if np.size(yvalues) - np.size(currents) == 1:    # the y- and z-arrays or the currents are too long by one
        yvalues = yvalues[:np.size(yvalues) - 1]     # element where no measurement was done. This element is
        zvalues = zvalues[:np.size(zvalues) - 1]     # removed from the respective array to align points and data.
    if np.size(yvalues) - np.size(currents) != 0:
        raise Exception("Positions (y, z) and corresponding data points can't be aligned due to their different sizes.")
    yvalues_shift = yvalues - yvalues.min() # shift the y-values to obtain the relative position
    zvalues_shift = zvalues - zvalues.min() # shift the z-values to obtain the relative position
    # create a grid of y- and z-values with respect to the selected grid resolution (gridres)
    yvalues_new, zvalues_new = np.mgrid[0:yvalues_shift.max():gridres * 1j, 0:zvalues_shift.max():gridres * 1j]
    # interpolate the given currents such that they fit the grid of y- and z-values
    currents_supergrid = griddata((yvalues_shift, zvalues_shift), currents, (yvalues_new, zvalues_new), method="nearest")
    return yvalues_new, zvalues_new, currents_supergrid.T
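# Hedged, self-contained sketch of supergrid(): regrid 100 random scattered points
# (hypothetical positions/currents, not beamline data) onto a 200x200 nearest-neighbor grid.
_y, _z = np.random.rand(100), np.random.rand(100)
_I = np.random.rand(100)
_Y, _Z, _C = supergrid(_y, _z, _I, 200)
print(_C.shape)  # (200, 200)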
# In[5]:

def plotXBIC_loop(scan, fig, fontsize, voltage_dic, loop_count, compareB=None, compareB_counter=None, compare=False):
    #fig.suptitle("Reduced LIA count normalized", fontsize=2*fontsize, x=0.7, y=0.92) # optional title for the figure (with this x and y it is centered and has a good distance)
    # load the positions and define y, z and f(y,z) as arrays that can be interpreted by the imshow() function
    positions = load_h5(scan, "positions")
    y_axis_h5 = positions["/data/encoder_fast/data"]
    z_axis_h5 = positions["/data/encoder_slow/data"]
    y = np.array(y_axis_h5)
    z = np.array(z_axis_h5)
    current = counts2amps(scan)
    if compare:
        counts2amps_A = current
        counts2amps_B = counts2amps(compareB[compareB_counter])
        # append NaN: scan 180 has the shape (40199,) and thus cannot be broadcast together with the other scans, which have shapes of (40200,)
        if scan == 180:
            counts2amps_A = np.append(current, np.nan)
        if compareB[compareB_counter] == 180:
            counts2amps_B = np.append(counts2amps(compareB[compareB_counter]), np.nan)
        current = counts2amps_A - counts2amps_B
    y_grid, z_grid, current_array = supergrid(y, z, current, 1000)
    # axes for the new subplot
    ax = fig.add_subplot(4, 4, loop_count)
    # spacing to the other subplots
    plt.subplots_adjust(right=1.3, hspace=0.5, wspace=0.5)
    # y and z labels
    ax.set_xlabel("Position Y [μm]", labelpad=0, fontsize=fontsize)
    ax.set_ylabel("Position Z [μm]", labelpad=0, fontsize=fontsize)
    # title
    bias_voltage = voltage_dic[scan]
    if compare:
        plt.title("Scan #" + str(scan) + " minus Scan #" + str(compareB[compareB_counter]), pad=15, fontsize=round(9/8*fontsize))
    else:
        plt.title("Scan #" + str(scan) + bias_voltage, pad=15, fontsize=round(9/8*fontsize))
    # individual ticks: it's a plot interpolated from 201x201 to 1000x1000 pixels, but this corresponds to 10x10 micrometers on the solar cell
    ax.xaxis.set_ticklabels([0,5,10], fontsize=round(4/5*fontsize))
    ax.xaxis.set_ticks([0,500,999]) # with 1000 instead of 999 the last label won't display
    ax.yaxis.set_ticklabels([0,5,10], fontsize=round(4/5*fontsize))
    ax.yaxis.set_ticks([0,500,999])
    ax.xaxis.set_minor_locator(MultipleLocator(50)) # minor ticks are set to multiples of 50
    ax.yaxis.set_minor_locator(MultipleLocator(50))
    current_array = cv2.GaussianBlur(current_array, (33, 33), 0) # this accounts for the beam profile with a FWHM of 105 x 108 nm
    # plot the data
    im = ax.imshow(current_array, cmap=cm.afmhot, origin='lower')
    # add a colorbar as legend
    ax_divider = make_axes_locatable(ax) # introduce a divider next to the axis already in place
    cax = ax_divider.append_axes("right", size="15%", pad="5%") # append a new axis object "cax" to the divider; width ("size") and distance to the 2D plot ("pad") can be given in % of the width of the x axis
    legend = colorbar(im, cax=cax) # introduce a legend (colorbar) where the new axis has been placed
    cax.set_ylabel(r"$\frac{XBIC}{pixel \cdot photons_{avg}}$" + " (nA)", rotation=-90, labelpad=50, fontsize=5/4*fontsize) # place this label next to the colorbar; labelpad sets the distance from the colorbar
    cax.yaxis.set_major_locator(AutoLocator())
    cax.yaxis.set_minor_locator(AutoMinorLocator())
# In[6]:

def plotXBIC(plot_range=None, singleplots=None, compareA=None, compareB=None, figsize=(30,30), savepng=False, dpi=None):
    '''
    This function plots the XBIC, using data from the previously defined counts2amps() function. Look at the examples to see how to use it:
    plotXBIC(plot_range=14) will do the following:
        It will plot 14 datasets, starting with #178 and ending with #192 (and omitting 184, which is still counted).
    plotXBIC(singleplots=[178,185,192]) will do the following:
        It will plot #178, #185 and #192.
    plotXBIC(compareA=[178,178,185], compareB=[187,189,189]) will do the following:
        It will plot
        Scan 178 minus Scan 187
        Scan 178 minus Scan 189
        Scan 185 minus Scan 189.
    '''
    start = time.time() # measure the time for each plot
    # the best-looking fontsize is about half the width of the figsize
    fontsize = figsize[0]//2
    # individualize the titles for each scan if a bias voltage was applied (information from LogbuchStrahlzeitPetraIII31.10.-6.11.2018.pptx, slide 54f., corresponding to the handwritten logbook)
    voltage_dic = {
        178: "",
        179: "",
        180: "",
        181: "",
        182: " (+ 100 mV)",
        183: " (- 100 mV)",
        185: " (- 200 mV)",
        186: " (+ 200 mV)",
        187: " (+ 500 mV)",
        188: " (- 500 mV)",
        189: " (- 1 V)",
        190: " (+ 600 mV)",
        191: "",
        192: ""
    }
    # create a figure for all plots
    fig = plt.figure(figsize=figsize)
    loop_count = 1 # the subplot number should be independent of omitted scans (e.g. scan #184); otherwise there would be an empty space at that position
    if type(plot_range) is int:
        if plot_range > 6:
            plot_range += 1 # scan #184 is omitted, but still counted by the for-loop
        for scan in range(178, 178 + plot_range):
            if scan == 184:
                continue # scan 184 was aborted
            plotXBIC_loop(scan=scan, fig=fig, fontsize=fontsize, voltage_dic=voltage_dic, loop_count=loop_count)
            loop_count += 1
        plt.show()
    if type(singleplots) is list:
        for scan in singleplots:
            if scan == 184:
                continue # scan 184 was aborted
            plotXBIC_loop(scan=scan, fig=fig, fontsize=fontsize, voltage_dic=voltage_dic, loop_count=loop_count)
            loop_count += 1
        plt.show()
    if type(compareA) is list:
        compareB_counter = 0
        for scan in compareA:
            plotXBIC_loop(scan=scan, fig=fig, fontsize=fontsize, voltage_dic=voltage_dic, loop_count=loop_count, compare=True, compareB=compareB, compareB_counter=compareB_counter)
            loop_count += 1
            compareB_counter += 1
        plt.show()
    end = time.time()
    if type(compareA) is list:
        print("Scan " + str(scan) + " minus Scan " + str(compareB[-1]) + " took " + str(round(end-start,2)) + " seconds to plot.") # compareB[-1]: the counter has already been incremented past the last index
    else:
        print("Scan " + str(scan) + " took " + str(round(end-start,2)) + " seconds to plot.")
    if savepng:
        now = datetime.now()
        dt_string = now.strftime("%d-%m-%Y_%H_%M_%S")
        if dpi is None:
            fig.savefig("savefig/plotXBIC_" + dt_string + ".png", dpi=fig.dpi, bbox_inches="tight")
        else:
            fig.savefig("savefig/plotXBIC_" + dt_string + ".png", dpi=dpi, bbox_inches="tight")
# In[7]:

def ImageRegistration(scan1, scan2, cell=None, printdetails=False):
    positions = load_h5(scan1, "positions")
    y_axis_h5 = positions["/data/encoder_fast/data"]
    z_axis_h5 = positions["/data/encoder_slow/data"]
    y = np.array(y_axis_h5)
    z = np.array(z_axis_h5)
    current1 = counts2amps(scan1)
    current2 = counts2amps(scan2)
    y_grid1, z_grid1, current1_array_original = supergrid(y, z, current1, 1000)
    y_grid2, z_grid2, current2_array_original = supergrid(y, z, current2, 1000)
    # convert the current arrays to a grayscale image format (values between 0 and 255):
    # divide by the maximum, rescale by 1/(1 - min) and shift so the values span [0, 1], then map to 8 bit
    current1_array_8bitgrayscale = current1_array_original/current1_array_original.max()
    current1_array_8bitgrayscale *= 1/(1 - current1_array_8bitgrayscale.min()) # parentheses matter: 1/1 - min would rescale wrongly
    current1_array_8bitgrayscale += 1 - current1_array_8bitgrayscale.max()
    current1_array_8bitgrayscale[current1_array_8bitgrayscale < 0] = 0
    current1_array_8bitgrayscale = np.array(current1_array_8bitgrayscale*255, dtype=np.uint8)
    current2_array_8bitgrayscale = current2_array_original/current2_array_original.max()
    current2_array_8bitgrayscale *= 1/(1 - current2_array_8bitgrayscale.min())
    current2_array_8bitgrayscale += 1 - current2_array_8bitgrayscale.max()
    current2_array_8bitgrayscale[current2_array_8bitgrayscale < 0] = 0
    current2_array_8bitgrayscale = np.array(current2_array_8bitgrayscale*255, dtype=np.uint8)
    shape = np.shape(current1_array_8bitgrayscale)
    warp_mode = cv2.MOTION_TRANSLATION # we only need translation in the y and z directions (no rotation etc.)
    warp_matrix = np.eye(2, 3, dtype=np.float32) # the matrix that the results of the ECC algorithm are stored in
    number_of_iterations = 5000 # specify the number of iterations
    termination_eps = 1e-10 # specify the threshold of the increment in the correlation coefficient between two iterations
    # specify the overall criteria
    criteria = (cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_COUNT, number_of_iterations, termination_eps)
    (cc, warp_matrix) = cv2.findTransformECC(current1_array_8bitgrayscale, current2_array_8bitgrayscale, warp_matrix, warp_mode, criteria, None, gaussFiltSize=33)
    if printdetails:
        scan2_aligned = cv2.warpAffine(current2_array_8bitgrayscale, warp_matrix, (shape[1], shape[0]), flags=cv2.INTER_LINEAR + cv2.WARP_INVERSE_MAP, borderValue=float("NaN")).astype(np.float32)
        scan2_aligned[scan2_aligned == 0] = float("NaN")
        # show where the values of the registered image lie (XBIC as a function of the 40200 datapoints)
        scan2_aligned_ravel = scan2_aligned.ravel()
        x = np.arange(np.shape(scan2_aligned_ravel)[0])
        plt.figure(figsize=(50,5))
        plt.scatter(x, scan2_aligned_ravel)
        plt.show()
        fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 15), constrained_layout=True)
        im1 = ax1.imshow(current1_array_8bitgrayscale, origin="lower", cmap=cm.afmhot)
        ax1.set_title("Scan #" + str(scan1))
        ax1_divider = make_axes_locatable(ax1) # introduce a divider next to the axis already in place
        cax1 = ax1_divider.append_axes("right", size="15%", pad="5%")
        legend = colorbar(im1, cax=cax1)
        cax1.yaxis.set_major_locator(AutoLocator())
        cax1.yaxis.set_minor_locator(AutoMinorLocator())
        im2 = ax2.imshow(current2_array_8bitgrayscale, origin="lower", cmap=cm.afmhot)
        ax2.set_title("Scan #" + str(scan2))
        ax2_divider = make_axes_locatable(ax2)
        cax2 = ax2_divider.append_axes("right", size="15%", pad="5%")
        legend = colorbar(im2, cax=cax2)
        cax2.yaxis.set_major_locator(AutoLocator())
        cax2.yaxis.set_minor_locator(AutoMinorLocator())
        im3 = ax3.imshow(scan2_aligned, origin="lower", cmap="inferno")
        ax3.set_title("Scan #" + str(scan2) + " aligned")
        ax3.set_ylim(bottom=0)
        ax3_divider = make_axes_locatable(ax3)
        cax3 = ax3_divider.append_axes("right", size="15%", pad="5%")
        legend = colorbar(im3, cax=cax3)
        cax3.yaxis.set_major_locator(AutoLocator())
        cax3.yaxis.set_minor_locator(AutoMinorLocator())
        plt.tight_layout(w_pad=7)
    return warp_matrix
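# Hedged sketch of how the returned translation-only warp matrix is applied elsewhere:
# tx, ty are the pixel shifts found by ECC (hypothetical values here), and
# WARP_INVERSE_MAP applies the inverse mapping, as in the plotting functions below.
# M = np.array([[1, 0, tx], [0, 1, ty]], dtype=np.float32)
# aligned = cv2.warpAffine(img, M, (img.shape[1], img.shape[0]),
#                          flags=cv2.INTER_LINEAR + cv2.WARP_INVERSE_MAP, borderValue=float("NaN"))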
# In[8]:

# plotXBIC_loop2 differs from plotXBIC_loop in that it translates the current array according to the image registration (axis ticks correspondingly, and AutoMinorLocator instead of MultipleLocator)
def plotXBIC_loop2(scan, fig, fontsize, voltage_dic, loop_count, grid, singleplot, figsize, crop=True, image_registration=True, gaussian_blur=True, select_channel=None, solar_cell_channel_only=False, plot_any=None, compareB=None, compareB_counter=None, compare=False):
    # fig.suptitle("Reduced LIA count normalized", fontsize=2*fontsize, x=0.7, y=0.92) # optional title for the figure (with this x and y it is centered and has a good distance)
    # load the positions and define y, z and f(y,z) as arrays that can be interpreted by the imshow() function
    positions = load_h5(scan, "positions")
    y_axis_h5 = positions["/data/encoder_fast/data"]
    z_axis_h5 = positions["/data/encoder_slow/data"]
    y = np.array(y_axis_h5)
    z = np.array(z_axis_h5)
    scan_order = [185,183,181,182,186]
    if crop and scan in scan_order and image_registration and gaussian_blur:
        if not solar_cell_channel_only:
            current_array = load_cache_currents()["pixelplotting_array"][scan_order.index(scan)]
        else:
            current_array = load_cache_currents()["pixelplotting_array_direct"][scan_order.index(scan)]
    elif scan not in scan_order and crop:
        print("The scan is not amongst the voltage-series scans, so it cannot be displayed with a crop.")
        return # without cached data there is nothing to plot for this scan
    else:
        if solar_cell_channel_only:
            current = counts2amps(scan, solar_cell_channel_only=True, select_channel=select_channel)
        else:
            current = counts2amps(scan, solar_cell_channel_only=False, select_channel=select_channel)
        if compare:
            counts2amps_A = current
            counts2amps_B = counts2amps(compareB[compareB_counter])
            # append NaN: scan 180 has the shape (40199,) and thus cannot be broadcast together with the other scans, which have shapes of (40200,)
            if scan == 180:
                counts2amps_A = np.append(current, np.nan)
            if compareB[compareB_counter] == 180:
                counts2amps_B = np.append(counts2amps(compareB[compareB_counter]), np.nan)
            current = counts2amps_A - counts2amps_B
        if plot_any is not None:
            current_array = plot_any
        if plot_any is None:
            y_grid, z_grid, current_array = supergrid(y, z, current, 1000)
            # set the 1 px border to NaN
            current_array[0,:] = np.nan
            current_array[999,:] = np.nan
            current_array[:,0] = np.nan
            current_array[:,999] = np.nan
            if gaussian_blur:
                current_array = cv2.GaussianBlur(current_array, (33, 33), 0) # this accounts for the beam profile with a FWHM of 105 x 108 nm
            if scan not in [178,181,187,188,189,190] and image_registration: # scans 178 and 181 are the reference scans for the warp matrices, so they don't need to be transformed, and the image registration would fail for scans #187 to #190
                warp_matrix = np.array(ImageRegistrationDic[str(scan)])
                #current_array = np.pad(current_array, pad_width=200, mode='constant', constant_values=np.nan) # for a NaN border... (not working as of now...)
                current_array = cv2.warpAffine(current_array, warp_matrix, (1000, 1000), flags=cv2.INTER_LINEAR + cv2.WARP_INVERSE_MAP, borderValue=float("NaN")).astype(np.float32)
            #current_array = np.random.rand(1000,1000)
    if not singleplot:
        # axes for the new subplot
        ax = fig.add_subplot(4, 4, loop_count)
    else:
        fig, ax = plt.subplots(figsize=figsize)
        fontsize *= 1.4
    # spacing to the other subplots
    plt.subplots_adjust(right=1.3, hspace=0.5, wspace=0.5)
    # y and z labels
    ax.set_xlabel("Position Y (μm)", labelpad=0, fontsize=fontsize, fontweight="bold")
    ax.set_ylabel("Position Z (μm)", labelpad=0, fontsize=fontsize, fontweight="bold")
    ax.tick_params(length=2/5*fontsize)
    # title
    bias_voltage = voltage_dic[scan]
    if compare:
        plt.title("Scan #" + str(scan) + " minus Scan #" + str(compareB[compareB_counter]), pad=15, fontsize=round(9/8*fontsize), fontweight="bold")
    elif plot_any is None:
        plt.title("Scan #" + str(scan) + bias_voltage, pad=15, fontsize=round(9/8*fontsize), fontweight="bold")
    # ticks and tick labels would need to be shifted according to the image registration
    if scan in [179,180,182,183,184,185,186,191,192]:
        IRS_y = ImageRegistrationDic[str(scan)][0][2]
        IRS_z = ImageRegistrationDic[str(scan)][1][2]
    else:
        IRS_y = 0
        IRS_z = 0
    # individual ticks: it's a plot interpolated from 201x201 to 1000x1000 pixels, but this corresponds to 10x10 micrometers on the solar cell
    ax.xaxis.set_ticklabels([0,5,10], fontsize=round(4/5*fontsize))
    #ax.xaxis.set_ticks([0-IRS_y,500-IRS_y,999-IRS_y]) # with 1000 instead of 999 the last label won't display
    ax.xaxis.set_ticks([0,500,999])
    ax.yaxis.set_ticklabels([0,5,10], fontsize=round(4/5*fontsize))
    #ax.yaxis.set_ticks([0-IRS_z,500-IRS_z,999-IRS_z])
    ax.yaxis.set_ticks([0,500,999])
    ax.xaxis.set_minor_locator(AutoMinorLocator(5)) # 1 minor tick for each micrometer
    ax.yaxis.set_minor_locator(AutoMinorLocator(5))
    # plot the data
    im = ax.imshow(current_array, cmap=cm.afmhot, origin='lower')
    # add a colorbar as legend
    ax_divider = make_axes_locatable(ax) # introduce a divider next to the axis already in place
    cax = ax_divider.append_axes("right", size="5%", pad="5%") # width ("size") and distance to the 2D plot ("pad") in % of the width of the x axis
    legend = colorbar(im, cax=cax) # introduce a legend (colorbar) where the new axis has been placed
    legend.ax.tick_params(length=fontsize, labelsize=7/5*fontsize) # manipulate the major tick length + label size
    legend.ax.tick_params(which="minor", length=3/5*fontsize) # manipulate the minor tick length
    #cax.set_ylabel(r"$\frac{XBIC}{pixel \cdot photons_{avg}}$" + " in (nA)", rotation=-90, labelpad=50, fontsize=5/4*fontsize)
    if plot_any is None:
        if not solar_cell_channel_only:
            cax.set_ylabel("XBIC (nA)", rotation=-90, labelpad=50, fontsize=5/4*fontsize, fontweight="bold") # place this label next to the colorbar; labelpad sets the distance from the colorbar
        else:
            cax.set_ylabel("Current (nA)", rotation=-90, labelpad=50, fontsize=5/4*fontsize, fontweight="bold")
    cax.yaxis.set_major_locator(AutoLocator())
    cax.yaxis.set_minor_locator(AutoMinorLocator())
    if grid:
        ax.grid(which="both")
# In[9]:

def plotXBIC2(plot_range=None, singleplots=None, compareA=None, compareB=None, grid=True, crop=True, solar_cell_channel_only=False, select_channel=None, image_registration=True, gaussian_blur=True, plot_any=None, figsize=(30,30), savepng=False, dpi=None):
    '''
    This function plots the XBIC, using data from the previously defined counts2amps() function. Look at the examples to see how to use it:
    plotXBIC2(plot_range=14) will do the following:
        It will plot 14 datasets, starting with #178 and ending with #192 (and omitting 184, which is still counted).
    plotXBIC2(singleplots=[178,185,192]) will do the following:
        It will plot #178, #185 and #192.
    plotXBIC2(compareA=[178,178,185], compareB=[187,189,189]) will do the following:
        It will plot
        Scan 178 minus Scan 187
        Scan 178 minus Scan 189
        Scan 185 minus Scan 189.
    '''
    start = time.time() # measure the time for each plot
    # the best-looking fontsize is about half the width of the figsize
    fontsize = figsize[0]//2
    # individualize the titles for each scan if a bias voltage was applied (information from LogbuchStrahlzeitPetraIII31.10.-6.11.2018.pptx, slide 54f., corresponding to the handwritten logbook)
    voltage_dic = {
        178: "",
        179: "",
        180: "",
        181: " (no bias)",
        182: " (+ 100 mV)",
        183: " (- 100 mV)",
        185: " (- 200 mV)",
        186: " (+ 200 mV)",
        187: " (+ 500 mV)",
        188: " (- 500 mV)",
        189: " (- 1 V)",
        190: " (+ 600 mV)",
        191: "",
        192: ""
    }
    # create a figure for all plots
    fig = plt.figure(figsize=figsize)
    loop_count = 1 # the subplot number should be independent of omitted scans (e.g. scan #184); otherwise there would be an empty space at that position
    if type(plot_range) is int:
        singleplot = False
        if plot_range > 6:
            plot_range += 1 # scan #184 is omitted, but still counted by the for-loop
        for scan in range(178, 178 + plot_range):
            if scan == 184:
                continue # scan 184 was aborted
            plotXBIC_loop2(scan=scan, fig=fig, grid=grid, crop=crop, solar_cell_channel_only=solar_cell_channel_only, select_channel=select_channel, image_registration=image_registration, gaussian_blur=gaussian_blur, fontsize=fontsize, voltage_dic=voltage_dic, loop_count=loop_count, singleplot=singleplot, figsize=figsize)
            loop_count += 1
        if not savepng:
            plt.show()
        end = time.time()
        print("Plotting scan 178 to scan " + str(178 + plot_range) + " took " + str(round(end-start,2)) + " seconds.")
    if type(singleplots) is list:
        singleplot = True
        figsize = (10,10)
        for scan in singleplots:
            if scan == 184:
                continue # scan 184 was aborted
            plotXBIC_loop2(scan=scan, fig=fig, grid=grid, crop=crop, solar_cell_channel_only=solar_cell_channel_only, select_channel=select_channel, image_registration=image_registration, gaussian_blur=gaussian_blur, fontsize=fontsize, voltage_dic=voltage_dic, loop_count=loop_count, singleplot=singleplot, figsize=figsize)
            loop_count += 1
            if not savepng:
                plt.show()
            end = time.time()
            print("Scan " + str(scan) + " took " + str(round(end-start,2)) + " seconds to plot.")
    if type(compareA) is list:
        singleplot = False
        compareB_counter = 0
        for scan in compareA:
            plotXBIC_loop2(scan=scan, fig=fig, crop=crop, solar_cell_channel_only=solar_cell_channel_only, select_channel=select_channel, image_registration=image_registration, gaussian_blur=gaussian_blur, grid=grid, fontsize=fontsize, voltage_dic=voltage_dic, loop_count=loop_count, compare=True, compareB=compareB, compareB_counter=compareB_counter, singleplot=singleplot, figsize=figsize)
            loop_count += 1
            compareB_counter += 1
            if not savepng:
                plt.show()
            end = time.time()
            print("Scan " + str(scan) + " minus Scan " + str(compareB[compareB_counter-1]) + " took " + str(round(end-start,2)) + " seconds to plot.") # -1: the counter has already been incremented
    if plot_any is not None:
        singleplot = True
        figsize = (10,10)
        plotXBIC_loop2(scan=178, fig=fig, grid=grid, crop=False, solar_cell_channel_only=solar_cell_channel_only, select_channel=select_channel, image_registration=image_registration, gaussian_blur=gaussian_blur, plot_any=plot_any, fontsize=fontsize, voltage_dic=voltage_dic, loop_count=loop_count, singleplot=singleplot, figsize=figsize)
    if savepng:
        fig = plt.gcf()
        now = datetime.now()
        dt_string = now.strftime("%d-%m-%Y_%H_%M_%S")
        if dpi is None:
            fig.savefig("savefig/plotXBIC_" + dt_string + ".png", dpi=fig.dpi, bbox_inches="tight")
        else:
            fig.savefig("savefig/plotXBIC_" + dt_string + ".png", dpi=dpi, bbox_inches="tight")
# In[10]:

def plotShockley(I_0=0.0058*1e-9, I_Ph=215.0494*1e-9, I_02=None, I_Ph2=None, compare_with_data=True, discrete=True, mean_curve=False, example=False, normalize_curves=False, dpi=None, savepng=False):
    figsize = (15,15)
    fontsize = figsize[0]//2
    fig = plt.figure(figsize=figsize)
    ax = fig.add_subplot(111)
    ax.set_xlabel("Bias voltage (mV)", labelpad=10, fontsize=21/5*fontsize, fontweight="bold")
    ax.set_ylabel("XBIC (nA)", labelpad=10, fontsize=21/5*fontsize, fontweight="bold")
    ax.tick_params(labelsize=15/5*fontsize, length=7/5*fontsize)
    ax.xaxis.set_ticks([-0.2,-0.1,0,0.1,0.2])
    ax.xaxis.set_ticklabels([-200,-100,0,100,200])
    ax.yaxis.set_minor_locator(MultipleLocator(5))
    ax.tick_params(which='minor', length=4/5*fontsize)
    ax.tick_params(labelsize=3*fontsize, length=1.5*fontsize, right=True)
    plt.grid(axis="y", which="minor", lw=0.25)
    plt.grid(axis="y")
    lw = 3
    ms = 12
    if not discrete:
        V = np.arange(-0.2, 0.2, 0.00001)
        style = "-"
    else:
        V = np.array([-0.2,-0.1,0,0.1,0.2])
        style = "-o"
    I1 = shockley_function(V, I_0=I_0, I_Ph=I_Ph)*1e9
    if normalize_curves:
        I1 = I1/abs(shockley_function(V=0, I_0=I_0, I_Ph=I_Ph)*1e9)
    ax.plot(V, -I1, style, linewidth=1.3*lw, markersize=ms, label="Good diode (calculated)", color="red")
    if I_02 is not None and I_Ph2 is not None: # check I_Ph2 here, not I_Ph, otherwise I_Ph2=None would crash below
        I2 = shockley_function(V, I_0=I_02, I_Ph=I_Ph2)*1e9
        if normalize_curves:
            I2 = I2/abs(shockley_function(V=0, I_0=I_02, I_Ph=I_Ph2)*1e9)
        ax.plot(V, -I2, style, linewidth=1.3*lw, markersize=ms, label="Bad diode (calculated)", color="blue")
    ax.legend(fontsize=6*fontsize)
    if compare_with_data:
        pixelplotting_array = np.array(load_cache_currents()["pixelplotting_array"])
        V = [-0.2,-0.1,0,0.1,0.2]
        if not example:
            mean_arrays = []
            kmeansDic = load_cache_kmeans(kmeans_input=7)
            for comparing_curve in [1,2]:
                k_curves_values = pixelplotting_array[:, np.array(kmeansDic[str(2)]["cluster_numbers"]) == comparing_curve].T # instead of transposing, one could also specify another axis for averaging!
                mean_array = np.empty(0)
                for scan_index in range(5):
                    mean_array = np.append(mean_array, np.nanmean(k_curves_values[:,scan_index]))
                mean_arrays.append(mean_array)
            if normalize_curves:
                mean_arrays[0] = mean_arrays[0]/mean_arrays[0][2]
                mean_arrays[1] = mean_arrays[1]/mean_arrays[1][2]
            ax.plot(V, mean_arrays[0], "-o", linewidth=lw, markersize=ms, label="Good diode (measured mean)", color="pink")
            ax.plot(V, mean_arrays[1], "-o", linewidth=lw, markersize=ms, label="Bad diode (measured mean)", color="cornflowerblue")
        if example:
            y_good = pixelplotting_array[:,600,650]
            y_bad = pixelplotting_array[:,400,400]
            if normalize_curves:
                y_good = y_good/y_good[2]
                y_bad = y_bad/y_bad[2]
            ax.plot(V, y_good, "-o", linewidth=lw, markersize=ms, label="Good diode (measured)", color="pink")
            ax.plot(V, y_bad, "-o", linewidth=lw, markersize=ms, label="Bad diode (measured)", color="cornflowerblue")
        ax.legend(fontsize=4.4*fontsize)
    if mean_curve:
        I_mean = np.array([217.43692, 217.10754, 214.23949449431512, 210.2738, 201.62088])
        if normalize_curves:
            I_mean = I_mean/I_mean[2]
        ax.plot(V, I_mean, "-o", linewidth=lw, markersize=ms, label="Mean IV curve", color="k")
        ax.legend(fontsize=4*fontsize)
    if example:
        ax.legend(fontsize=3.5*fontsize)
    if savepng:
        now = datetime.now()
        dt_string = now.strftime("%d-%m-%Y_%H_%M_%S")
        if dpi is None:
            fig.savefig("savefig/ShockleyCurve_" + dt_string + ".png", dpi=fig.dpi, bbox_inches="tight")
        else:
            fig.savefig("savefig/ShockleyCurve_" + dt_string + ".png", dpi=dpi, bbox_inches="tight")

popt = plotIV(return_fitting=True)[0]
I_0 = popt[0]
I_Ph = popt[1]
plotShockley(I_0=I_0, I_Ph=I_Ph, I_02=2*I_0, I_Ph2=I_Ph, discrete=False, mean_curve=False, example=False, normalize_curves=True, savepng=True)
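# Note: plotIV(return_fitting=True) returns (popt, pcov) from curve_fit with popt = [I_0, I_Ph] in ampere,
# so the "bad diode" above is sketched as the same photocurrent with twice the saturation current.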
# In[358]:

plotClusterCurves(kmeans_input=7)
plotClusterCurves(kmeans_input=7, normalize_curves=True)
# In[11]:

def plotIV(coordinate=None, sample_size=100, set_region=(0,999,0,999), showPosition=False, inset=False, title=True, figsize=(30,30), savepng=False, dpi=None, savepdf=False, annotations=False, data_processing=1, return_only=False, return_fitting=False):
    if inset:
        showPosition = True
    start = time.time()
    # voltages
    voltages = [-1000, -500, -200, -100, 0, 100, 200, 500, 600]
    # scans (ordered by voltages)
    scans_ordered = [189, 188, 185, 183, 181, 182, 186, 187, 190]
    # convert the lists to arrays
    voltages, scans_ordered = np.array(voltages), np.array(scans_ordered)
    current_array_list = []
    current_mean = []
    # plot either with or without negative values [EDIT: deleted from the counts2amps function!] and select whether the solar cell channel should be used for the current
    options_dic = {
        1: (False, False, "lock-in R, without negative values"),
        2: (False, True, "lock-in R, with negative values"),
        3: (True, False, "solar cell channel, without negative values"),
        4: (True, True, "solar cell channel, with negative values")
    }
    if not return_only and not return_fitting:
        print(options_dic[data_processing][2])
    A = 2  # plot only from scans_ordered[A] to ...
    B = -2 # ... scans_ordered[B]
    counter = 0
    for scan in scans_ordered[A:B]:
        positions = load_h5(scan, "positions")
        y_axis_h5 = positions["/data/encoder_fast/data"]
        z_axis_h5 = positions["/data/encoder_slow/data"]
        y_axis = np.array(y_axis_h5)
        z_axis = np.array(z_axis_h5)
        current = counts2amps(scan, solar_cell_channel_only=options_dic[data_processing][0])
        y_grid, z_grid, current_array = supergrid(y_axis, z_axis, current, 1000)
        # set the 1 px border to NaN
        current_array[0,:] = np.nan
        current_array[999,:] = np.nan
        current_array[:,0] = np.nan
        current_array[:,999] = np.nan
        current_array = cv2.GaussianBlur(current_array, (33, 33), 0) # this accounts for the beam profile with a FWHM of 105 x 108 nm
        if scan != 181:
            warp_matrix = np.array(ImageRegistrationDic[str(scan)])
            current_array = cv2.warpAffine(current_array, warp_matrix, (1000, 1000), flags=cv2.INTER_LINEAR + cv2.WARP_INVERSE_MAP, borderValue=float("NaN")).astype(np.float32)
        current_array_list.append(current_array)
        counter += 1
    pixelplotting_array = np.stack(current_array_list)
    # have the NaN values at the same position for all scans
    counter = 0
    for grid in current_array_list:
        grid[np.isnan(pixelplotting_array).any(axis=0)] = np.nan
        current_mean.append(np.nanmean(grid))
        if not return_only and not return_fitting:
            print(f"Mean current for scan {scans_ordered[A:B][counter]} is {round(current_mean[counter],4)} nA. Its min/max current is {round(np.nanmin(grid),4)}/{round(np.nanmax(grid),4)} nA.") # the values are only shown to a few decimal places
        counter += 1
    pixelplotting_array = np.stack(current_array_list)
    if return_only:
        return pixelplotting_array, current_mean
    # fitting
    popt, pcov = curve_fit(lambda V, I_0, I_Ph: shockley_function(V, I_0, I_Ph)*(-1e9), voltages[A:B]/1000, current_mean, p0=[5e-12, current_mean[2]*1e-9], bounds=([1e-13, (current_mean[2]-3)*1e-9], [1e-11, (current_mean[2]+3)*1e-9]))
    if return_fitting:
        return popt, pcov
    fontsize = 1.5*figsize[0]//2
    fig = plt.figure(figsize=figsize)
    ax1 = fig.add_subplot(221)
    # x and y labels
    ax1.set_xlabel("Bias voltage (mV)", labelpad=10, fontsize=7/5*fontsize, fontweight="bold")
    ax1.set_ylabel("XBIC (nA)", labelpad=10, fontsize=7/5*fontsize, fontweight="bold")
    if title:
        plt.title("IV curve(s) for single pixel(s)", pad=15, fontsize=round(7/5*fontsize), fontweight="bold")
    ax1.tick_params(labelsize=7/5*fontsize, length=fontsize, right=True)
    ax1.set_xticks(voltages[A:B])
    if data_processing == 3: # otherwise the ticks would exceed Locator.MAXTICKS
        major_interval = 1000
    else:
        major_interval = 10
    minor_interval = major_interval/5
    ax1.yaxis.set_major_locator(MultipleLocator(major_interval))
    ax1.yaxis.set_minor_locator(MultipleLocator(minor_interval))
    ax1.tick_params(which='minor', length=4/5*fontsize, right=True)
    plt.grid(axis="y", which="minor", lw=0.25)
    plt.grid(axis="y")
    # allows plotting only a specific location on the y-z current array (else plot a sample of IV curves):
    if coordinate is not None:
        ax1.plot(voltages[A:B], pixelplotting_array[:,coordinate[0],coordinate[1]], "-o", markersize=8, linewidth=1.2, color="blue", label="IV curve at given (Y,Z)")
        slope, intercept = kmeans_clustering(kmeans_input=8, return_only=True)
        slope = np.reshape(slope, (1000,1000))
        intercept = np.reshape(intercept, (1000,1000))
        ax1.plot(voltages[A:B], slope[coordinate[0],coordinate[1]]*voltages[A:B]+intercept[coordinate[0],coordinate[1]], label="Fitted line")
        ax1.legend(fontsize=7/5*fontsize)
        if showPosition:
            if inset:
                # create a set of inset axes: these should fill the bounding box allocated to them
                ax1_2 = plt.axes([0,0,1,1])
                # manually set the position and relative size of the inset axes within ax1
                ip = InsetPosition(ax1, [0.2,0.2,0.3,0.3])
                ax1_2.set_axes_locator(ip)
            else:
                # create a new subplot
                ax1_2 = fig.add_subplot(223)
            # ticks and tick labels would need to be shifted according to the image registration
            scan = scans_ordered[B-1] # show the scan of the last IV-curve point as an example
            IRS_y = ImageRegistrationDic[str(scan)][0][2] # IRS = Image Registration Shift (in y or z)
            IRS_z = ImageRegistrationDic[str(scan)][1][2]
            # individual ticks: it's a plot interpolated from 201x201 to 1000x1000 pixels, but this corresponds to 10x10 micrometers on the solar cell
            ax1_2.xaxis.set_ticklabels([0,5,10], fontsize=round(9/5*fontsize))
            #ax1_2.xaxis.set_ticks([0-IRS_y,500-IRS_y,999-IRS_y]) # with 1000 instead of 999 the last label won't display
            ax1_2.xaxis.set_ticks([0,500,999])
            ax1_2.yaxis.set_ticklabels([0,5,10], fontsize=round(9/5*fontsize))
            ax1_2.yaxis.set_ticks([0-IRS_z,500-IRS_z,999-IRS_z])
            ax1_2.xaxis.set_minor_locator(AutoMinorLocator(5)) # 1 minor tick for each micrometer
            ax1_2.yaxis.set_minor_locator(AutoMinorLocator(5))
            ax1_2.imshow(current_array, cmap=cm.afmhot, origin='lower')
            ax1_2.plot(coordinate[0], coordinate[1], "bo", markersize=8)
            # y and z labels
            ax1_2.set_xlabel("Position Y [μm]", labelpad=10, fontsize=fontsize)
            ax1_2.set_ylabel("Position Z [μm]", labelpad=10, fontsize=fontsize)
        print("Number of NaN values on shown IV curve:", str(len(np.where(np.isnan(pixelplotting_array[:,coordinate[0],coordinate[1]]))[0])))
    if coordinate is None:
        countnumberlines = 0
        countNaN = 0
        # define the colormap for the curves
        cmap = cm.gist_rainbow
        norm = colors.Normalize(vmin=1, vmax=sample_size)
        for z in np.linspace(set_region[0], set_region[1], int(sample_size**0.5), dtype=int):
            for y in np.linspace(set_region[2], set_region[3], int(sample_size**0.5), dtype=int):
                #ax1.plot(voltages[A:B], pixelplotting_array[:,y,z], "-o", markersize=8, linewidth=1.2, color=cmap(norm(countnumberlines)))
                ax1.plot(voltages[A:B], pixelplotting_array[:,y,z], "-", linewidth=0.5, color="blue")
                if not np.isnan(np.sum(pixelplotting_array[:,y,z])):
                    countnumberlines += 1
                countNaN += len(np.where(np.isnan(pixelplotting_array[:,y,z]))[0])
        ax1.plot(voltages[A:B], current_mean, "-o", markersize=12, linewidth=4, color="blue")
        if showPosition:
            if inset:
                # create a set of inset axes: these should fill the bounding box allocated to them
                ax1_2 = plt.axes([0,0,1,1])
                # manually set the position and relative size of the inset axes within ax1
                ip = InsetPosition(ax1, [0.2,0.05,figsize[0]/120,figsize[0]/120])
                ax1_2.set_axes_locator(ip)
            else:
                # create a new subplot
                ax1_2 = fig.add_subplot(223)
            # ticks and tick labels would need to be shifted according to the image registration
            scan = scans_ordered[B-1] # show the scan of the last IV-curve point as an example
            IRS_y = ImageRegistrationDic[str(scan)][0][2] # IRS = Image Registration Shift (in y or z)
            IRS_z = ImageRegistrationDic[str(scan)][1][2]
            # individual ticks: it's a plot interpolated from 201x201 to 1000x1000 pixels, but this corresponds to 10x10 micrometers on the solar cell
            ax1_2.xaxis.set_ticklabels([0,5,10], fontsize=round(fontsize))
            #ax1_2.xaxis.set_ticks([0-IRS_y,500-IRS_y,999-IRS_y]) # with 1000 instead of 999 the last label won't display
            ax1_2.xaxis.set_ticks([0,500,999])
            ax1_2.yaxis.set_ticklabels([0,5,10], fontsize=round(fontsize))
            #ax1_2.yaxis.set_ticks([0-IRS_z,500-IRS_z,999-IRS_z])
            ax1_2.yaxis.set_ticks([0,500,999])
            ax1_2.xaxis.set_minor_locator(AutoMinorLocator(5)) # 1 minor tick for each micrometer
            ax1_2.yaxis.set_minor_locator(AutoMinorLocator(5))
            ax1_2.imshow(current_array, cmap=cm.afmhot, origin='lower')
            #ax1_2.plot(y,z, "bo", markersize=8, color=cmap(norm(countnumberlines)))
            ax1_2.plot(y, z, "bo", markersize=8, color="k") # mark the last sampled position
            # y and z labels
            ax1_2.set_xlabel("Position Y [μm]", labelpad=10, fontsize=fontsize)
            ax1_2.set_ylabel("Position Z [μm]", labelpad=10, fontsize=fontsize)
        print("Number of IV curves shown in plot:", str(countnumberlines))
        print("Number of NaN values on shown IV curves:", str(countNaN))
    ax2 = fig.add_subplot(222)
    ax2.plot(voltages[A:B], current_mean, "-o", markersize=8, linewidth=1.2, color="blue", label="Mean XBIC for all (y,z) positions (nA)")
    ax2.plot(voltages[A:B], shockley_function(voltages[A:B]/1000, popt[0], popt[1])*(-1e9), "-o", markersize=6, linewidth=0.6, color="red", label="Fitted IV curve")
    ax2.legend(fontsize=7/5*fontsize)
    print("-----\nActually measured:\nShort-circuit current:", str(round(current_mean[2],4)), "nA\n-----\nResult of fitting:\nSaturation current I_0:", str(round(popt[0]*1e9,4)), "nA\nPhotocurrent I_Ph:", str(round(popt[1]*1e9,4)),
          "nA\nOpen-circuit voltage V_OC:", str(round(shockley_function_VOC(popt[0],popt[1])*1e3,4)), "mV\n-----")
    if ax1.get_ylim()[0] < 202 and ax1.get_ylim()[1] > 218: # this condition makes sure that the min and max mean current values are still shown
        ax2.set_ylim(ax1.get_ylim()) # make sure that the current interval is the same so that the shapes of the curves can be compared
    ax2.set_xlabel("Bias voltage (mV)", labelpad=10, fontsize=7/5*fontsize, fontweight="bold")
    ax2.set_ylabel("XBIC (nA)", labelpad=10, fontsize=7/5*fontsize, fontweight="bold")
    plt.title("Mean IV curve", pad=15, fontsize=round(7/5*fontsize), fontweight="bold")
    ax2.tick_params(labelsize=7/5*fontsize, length=fontsize, right=True)
    ax2.set_xticks(voltages[A:B])
    ax2.yaxis.set_major_locator(MultipleLocator(major_interval))
    ax2.yaxis.set_minor_locator(MultipleLocator(minor_interval))
    ax2.tick_params(which='minor', length=4/5*fontsize, right=True)
    plt.grid(axis="y", which="minor", lw=0.25)
    plt.grid(axis="y")
    if annotations:
        for i in range(len(current_mean)):
            ax2.annotate("Scan " + str(scans_ordered[i+A]), xy=(voltages[i+A], current_mean[i]), xytext=(voltages[i+A], current_mean[i]+1.3), arrowprops=dict(facecolor='black', shrink=0.12, width=2, headwidth=8))
    '''# distance of the tick labels from the ticks
    ax1.tick_params(axis='both', which='major', pad=25)
    ax1_2.tick_params(axis='both', which='major', pad=25)
    ax2.tick_params(axis='both', which='major', pad=25)'''
    plt.subplots_adjust(wspace=0.35) # adjust the width between the subplots
    plt.show()
    end = time.time()
    print("Plotting took {} seconds.".format(str(round(end-start,2))))
    if savepng or savepdf:
        now = datetime.now()
        dt_string = now.strftime("%d-%m-%Y_%H_%M_%S") # needed by both the png and the pdf branch
        if coordinate is not None:
            coord_str = f"{coordinate[0]}-{coordinate[1]}_"
        else:
            coord_str = ""
    if savepng:
        if dpi is None:
            fig.savefig("savefig/IV_curve_" + coord_str + dt_string + ".png", dpi=fig.dpi, bbox_inches="tight")
        else:
            fig.savefig("savefig/IV_curve_" + coord_str + dt_string + ".png", dpi=dpi, bbox_inches="tight")
    if savepdf:
        fig.savefig("savefig/IV_curve_" + dt_string + ".pdf", format="pdf")
# In[12]:

def shockley_function(V, I_0, I_Ph): # arguments and output in SI units
    return I_0*(np.exp(constants.e*V/(constants.k*300))-1) - I_Ph

# In[13]:

def shockley_function_VOC(I_0, I_Ph): # arguments and output in SI units
    return constants.k*300*np.log(I_Ph/I_0 + 1)/constants.e
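# Quick plausibility check (sketch) with the default diode parameters of plotShockley():
# V_OC = (kT/e) * ln(I_Ph/I_0 + 1) ≈ 0.0259 V * ln(215.0494/0.0058 + 1) ≈ 0.27 V at T = 300 K.
print(shockley_function_VOC(0.0058e-9, 215.0494e-9))  # ≈ 0.27 (V)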
# In[445]:

plotClusterCurves(kmeans_input=8, normalize_curves=True, title="")
# In[14]:

def plotClusterCurves(k=2, k_curves=1, kmeans_input=1, NaN=False, sample_size=100, data_processing=1, compare=True, normalize_curves=False, unnecessary_reference=False, suptitle="", title=True, dpi=None, savepng=False, savepng_str=None):
    start = time.time()
    #pixelplotting_array = plotIV(data_processing=data_processing, return_only=True)[1]
    pixelplotting_array = np.array(load_cache_currents()["pixelplotting_array"])
    kmeansDic = load_cache_kmeans(kmeans_input=kmeans_input, data_processing=data_processing)
    #kmeansDic = {"2": {"cluster_numbers": plotMedianCorrelation(return_only=True)[0]}}
    scan_order = [185,183,181,182,186]
    voltages = [-1000, -500, -200, -100, 0, 100, 200, 500, 600]
    A = 2  # plot only from scans_ordered[A] to ...
    B = -2 # ... scans_ordered[B]
    if unnecessary_reference:
        figsize = (20,10)
    else:
        figsize = (10*1.5, 10*1.5)
    fontsize = figsize[0]//2
    fig = plt.figure(figsize=figsize)
    if unnecessary_reference:
        ax1 = fig.add_subplot(121)
    else:
        ax1 = fig.add_subplot(111)
    if unnecessary_reference:
        # use variables from the plotIV function
        #current_array = plotIV(data_processing=data_processing, return_only=True)[0][-1] # select the scan of the last IV-curve point
        current_array = np.array(load_cache_currents()["pixelplotting_array"][scan_order.index(186)])
        '''
        # ticks and tick labels would need to be shifted according to the image registration
        scan = 186 # show the scan of the last IV-curve point as an example <-- here it is scan 186, at B = -2 in the plotIV() function
        IRS_y = ImageRegistrationDic[str(scan)][0][2] # IRS = Image Registration Shift (in y or z)
        IRS_z = ImageRegistrationDic[str(scan)][1][2]
        # individual ticks: it's a plot interpolated from 201x201 to 1000x1000 pixels, but this corresponds to 10x10 micrometers on the solar cell
        ax1.xaxis.set_ticklabels([0,5,10], fontsize=round(4/5*fontsize))
        ax1.xaxis.set_ticks([0-IRS_y,500-IRS_y,999-IRS_y]) # with 1000 instead of 999 the last label won't display
        ax1.yaxis.set_ticklabels([0,5,10], fontsize=round(4/5*fontsize))
        ax1.yaxis.set_ticks([0-IRS_z,500-IRS_z,999-IRS_z])
        ax1.xaxis.set_minor_locator(AutoMinorLocator(5)) # 1 minor tick for each micrometer
        ax1.yaxis.set_minor_locator(AutoMinorLocator(5))
        '''
        ax1.imshow(current_array, cmap=cm.afmhot, origin='lower')
        # y and z labels
        ax1.set_xlabel("Position Y [μm]", labelpad=0, fontsize=fontsize)
        ax1.set_ylabel("Position Z [μm]", labelpad=0, fontsize=fontsize)
    if NaN: # this option was only used because something was wrong with the plotClusterCurves "if NaN" condition itself, adding weird NaN values to the current grid
        NaN_where = np.where(np.isnan(pixelplotting_array))
        NaN_where = np.flip(NaN_where, 0) # np.flip is necessary, because np.isnan reads from bottom to top
        ax1.scatter(NaN_where[0], NaN_where[1])
        ax1.set_title("Position of NaN values (in blue)", pad=15, fontsize=round(9/8*fontsize))
    if k is not None and unnecessary_reference:
        for z in np.linspace(0, 999, int(sample_size**0.5), dtype=int):
            for y in np.linspace(0, 999, int(sample_size**0.5), dtype=int):
                if np.array(kmeansDic[str(k)]["cluster_numbers"])[y][z] == k_curves:
                    ax1.scatter(y, z, color="red")
                else:
                    ax1.scatter(y, z, color="black")
        ax1.set_title("Position of the displayed curve", pad=15, fontsize=round(9/8*fontsize))
    if k_curves is not None: # plot the curves belonging to the k-th cluster
        if unnecessary_reference:
            ax2 = fig.add_subplot(122)
        else:
            ax2 = fig.add_subplot(111)
        ax2.set_xlabel("Bias voltage (mV)", labelpad=10, fontsize=21/5*fontsize, fontweight="bold")
        ax2.set_ylabel("XBIC (nA)", labelpad=10, fontsize=21/5*fontsize, fontweight="bold")
        ax2.tick_params(labelsize=15/5*fontsize, length=7/5*fontsize)
        ax2.xaxis.set_ticks([-200,-100,0,100,200])
        ax2.yaxis.set_minor_locator(MultipleLocator(5))
        ax2.tick_params(which='minor', length=4/5*fontsize)
        plt.grid(axis="y", which="minor", lw=0.25)
        plt.grid(axis="y")
        k_curves_values = pixelplotting_array[:, np.array(kmeansDic[str(k)]["cluster_numbers"]) == k_curves].T # instead of transposing, one could also specify another axis for averaging!
        mean_array = np.empty(0)
        for scan_index in range(5):
            mean_array = np.append(mean_array, np.nanmean(k_curves_values[:,scan_index]))
        print(f"Mean currents for curve #{k_curves} (with k = {k}):", str(np.around(mean_array,2)))
        countnumberlines_list = []
        countnumberlines = 0
        for z in np.linspace(0, 999, int(sample_size**0.5), dtype=int):
            for y in np.linspace(0, 999, int(sample_size**0.5), dtype=int):
                if not math.isnan(np.array(kmeansDic[str(k)]["cluster_numbers"])[y][z]):
                    if int(np.array(kmeansDic[str(k)]["cluster_numbers"])[y][z]) == k_curves:
                        if normalize_curves:
                            ax2.plot(voltages[A:B], pixelplotting_array[:,y,z]/pixelplotting_array[2,y,z], linewidth=0.5, color="pink")
                        else:
                            ax2.plot(voltages[A:B], pixelplotting_array[:,y,z], linewidth=0.5, color="pink")
                        countnumberlines += 1
        countnumberlines_list.append(countnumberlines)
        if normalize_curves:
            ax2.plot(voltages[A:B], mean_array/mean_array[2], "-o", markeredgewidth=3, linewidth=2.0, color="red", label=f"Group {k_curves}")
        else:
            ax2.plot(voltages[A:B], mean_array, "-o", markeredgewidth=3, linewidth=2.0, color="red", label=f"Group {k_curves}")
        plt.legend()
        if compare:
            colors_list = ["blue", "green", "brown", "yellow"] # renamed from 'colors' to avoid shadowing matplotlib.colors
            color_counter = 0
            for comparing_curve in range(1, k+1):
                if comparing_curve == k_curves: # don't plot the red curve again for the k_curve that is already selected
                    continue
                countnumberlines = 0
                for z in np.linspace(0, 999, int(sample_size**0.5), dtype=int):
                    for y in np.linspace(0, 999, int(sample_size**0.5), dtype=int):
                        if not math.isnan(np.array(kmeansDic[str(k)]["cluster_numbers"])[y][z]):
                            if int(np.array(kmeansDic[str(k)]["cluster_numbers"])[y][z]) == comparing_curve:
                                if normalize_curves:
                                    ax2.plot(voltages[A:B], pixelplotting_array[:,y,z]/pixelplotting_array[2,y,z], linewidth=0.5, color="cornflowerblue")
                                else:
                                    ax2.plot(voltages[A:B], pixelplotting_array[:,y,z], linewidth=0.5, color="cornflowerblue")
                                countnumberlines += 1
                countnumberlines_list.append(countnumberlines)
                k_curves_values = pixelplotting_array[:, np.array(kmeansDic[str(k)]["cluster_numbers"]) == comparing_curve].T # instead of transposing, one could also specify another axis for averaging!
                mean_array = np.empty(0)
                for scan_index in range(5):
                    mean_array = np.append(mean_array, np.nanmean(k_curves_values[:,scan_index]))
                if normalize_curves:
                    ax2.plot(voltages[A:B], mean_array/mean_array[2], "-o", markeredgewidth=3, linewidth=2.0, color=colors_list[color_counter], label=f"Group {comparing_curve}")
                else:
                    ax2.plot(voltages[A:B], mean_array, "-o", markeredgewidth=3, linewidth=2.0, color=colors_list[color_counter], label=f"Group {comparing_curve}")
                color_counter += 1
                ax2.legend(fontsize=4*fontsize, prop={'size': 4*fontsize})
                print("For comparison, the mean curve for group", str(comparing_curve), "has been plotted:", str(np.around(mean_array,2)))
        if k == 2:
            s_word = "group"
        else:
            s_word = "groups"
        if unnecessary_reference:
            title_fontsize = round(9/8*fontsize)
        else:
            title_fontsize = round(18/8*fontsize)
        if title:
            ax2.set_title(f"IV curves for group {k_curves} (with k = {k}), compared to the IV curves of the other {s_word}", pad=15, fontsize=title_fontsize, fontweight="bold")
        countnumberlines_str = f"Number of IV curves shown in plot: {countnumberlines_list[0]} (group {k_curves})"
        if compare:
            comparing_curve_counter = 1
            for comparing_curve in range(1, k+1):
                if comparing_curve == k_curves:
                    comparing_curve_counter += 1
                    continue
                countnumberlines_str += f", {countnumberlines_list[comparing_curve_counter-1]} (group {comparing_curve_counter})"
                comparing_curve_counter += 1 # advance the counter (the bare statement here previously had no effect)
        print(countnumberlines_str)
    plt.suptitle(suptitle)
    plt.show()
    end = time.time()
    print("Plotting took {} seconds.".format(str(round(end-start,2))))
    if savepng:
        now = datetime.now()
        dt_string = now.strftime("%d-%m-%Y_%H_%M_%S")
        if dpi is None:
            if savepng_str is None:
                fig.savefig("savefig/different_inputs_for_clustering/ClusterCurves_" + dt_string + ".png", dpi=fig.dpi, bbox_inches="tight")
            else:
                fig.savefig("savefig/different_inputs_for_clustering/ClusterCurves_" + savepng_str + ".png", dpi=fig.dpi, bbox_inches="tight")
        else:
            if savepng_str is None:
                fig.savefig("savefig/different_inputs_for_clustering/ClusterCurves_" + dt_string + ".png", dpi=dpi, bbox_inches="tight")
            else:
                fig.savefig("savefig/different_inputs_for_clustering/ClusterCurves_" + savepng_str + ".png", dpi=dpi, bbox_inches="tight")
- # In[15]:
- def plotClusterPosition(k=2, kmeans_input=1, data_processing=1, title="", dpi=None, suptitle="", no_suptitle=False, grid=True, plot_any=None, savepng=False, savepng_str=None):
- start = time.time()
- kmeansDic = load_cache_kmeans(kmeans_input = kmeans_input, data_processing=data_processing)
- figsize = (15,15)
- fontsize = figsize[0]
- #fig = plt.figure(figsize=figsize)
- #ax = fig.add_subplot(111)
- fig, ax = plt.subplots(figsize=figsize)
- ax.set_aspect("equal")
- cmap = colors.ListedColormap(["yellow", "black"])
- bounds=[0.5,1.5,2.5]
- norm = colors.BoundaryNorm(bounds, cmap.N)
- if plot_any is not None:
- cmap = colors.ListedColormap(["white", "black"])
- groups = plot_any
- else:
- groups = np.array(kmeansDic[str(k)]["cluster_numbers"])
- im = plt.pcolormesh(groups, cmap=cmap, norm=norm) # pass the BoundaryNorm so the mesh colours match the colorbar
- divider = make_axes_locatable(ax)
- cax = divider.append_axes("right", size="5%", pad=0.25)
- cbar = plt.colorbar(im, cax, cmap=cmap, norm=norm, boundaries=bounds, ticks=[1, 2])
- cbar.ax.tick_params(length=7/5*fontsize, labelsize=2*fontsize)
- cax.set_ylabel("Cluster Groups", rotation = -90, labelpad = 50 , fontsize=3*fontsize, fontweight="bold")
- #im = ax.imshow(np.array(kmeansDic[str(k)]["cluster_numbers"]), interpolation='nearest', cmap=cm.afmhot, origin='lower')
- ax.set_xlabel("Position Y (μm)", labelpad = 10, fontsize=15/5*fontsize, fontweight="bold")
- ax.set_ylabel("Position Z (μm)", labelpad = 10, fontsize=15/5*fontsize, fontweight="bold")
- ax.set_title(title, pad = 15, fontsize=15/5*fontsize, fontweight="bold")
- ax.tick_params(length=7/5*fontsize)
- ax.tick_params(which='minor', length=4/5*fontsize)
- # ticks and ticklabels need to be shifted according to Image Registration
- IRS_y_list = []
- IRS_z_list = []
- for scan in [185,183,182,186]:
- IRS_y_list.append(ImageRegistrationDic[str(scan)][0][2])
- IRS_z_list.append(ImageRegistrationDic[str(scan)][1][2])
- IRS_y = max(IRS_y_list) # IRS = Image Registration Shift (in y or z)
- IRS_z = min(IRS_z_list)
- # individual ticks: the plot is interpolated from 201x201 to 1000x1000 pixels, which corresponds to 10x10 micrometers on the solar cell
- ax.xaxis.set_ticklabels([0,5,10],fontsize=round(12/5*fontsize))
- #ax.xaxis.set_ticks([0-IRS_y,500-IRS_y,999-IRS_y]) # if 1000 instead of 999 it won't display
- ax.xaxis.set_ticks([0,500,999])
- ax.yaxis.set_ticklabels([0,5,10],fontsize=round(12/5*fontsize))
- #ax.yaxis.set_ticks([0-IRS_z,500-IRS_z,999-IRS_z])
- ax.yaxis.set_ticks([0,500,999])
- ax.xaxis.set_minor_locator(AutoMinorLocator(5)) # 1 minor tick for each micrometer
- ax.yaxis.set_minor_locator(AutoMinorLocator(5))
- if not no_suptitle:
- MinMeanMax_str = ""
- for group_index in range(1,k+1):
- Min = int(round(kmeansDic[str(k)]["threshold"][group_index-1][0]))
- Mean = int(round(kmeansDic[str(k)]["threshold"][group_index-1][1]))
- Max = int(round(kmeansDic[str(k)]["threshold"][group_index-1][2]))
- MinMeanMax_str += f"\nMin/Mean/Max current for group {group_index} is {Min}/{Mean}/{Max} nA."
- plt.suptitle(suptitle+MinMeanMax_str)
- '''
- # add a colorbar as legend
- ax_divider = make_axes_locatable(ax) # Introduce a divider next to the axis already in place
- cax = ax_divider.append_axes("right", size = "15%", pad = "5%") # Append a new axis object with name "cax" to the divider. Width ("size") and distance to the 2D-Plot ("pad") can be given in % of the width of the x axis.
- legend = colorbar(im, cax = cax) # Introduce a legend (colorbar) where the new axis has been placed
- cax.set_ylabel("Number of cluster", rotation = -90, labelpad = 50 , style = "italic", fontsize=5/4*fontsize) # Place this label next to the colorbar; labelpad denotes the distance of the colorbar
- cax.yaxis.set_major_locator(AutoLocator())
- cax.yaxis.set_minor_locator(AutoMinorLocator())
- '''
- if grid:
- ax.grid(which="both")
- plt.show()
- end = time.time()
- print("Plotting took {} seconds.".format(str(round(end-start,2))))
- if savepng:
- now = datetime.now()
- dt_string = now.strftime("%d-%m-%Y_%H_%M_%S")
- fname_part = dt_string if savepng_str is None else savepng_str
- fig.savefig("savefig/different_inputs_for_clustering/ClusterPosition_" + fname_part + ".png", dpi=(fig.dpi if dpi is None else dpi), bbox_inches="tight")
- # In[385]:
- plotXBIC2(singleplots=[181])
- # In[380]:
- arr = kmeans_clustering(kmeans_input=8, return_only=True)[0]
- arr = np.reshape(arr, (1000,1000))
- plotXBIC2(plot_any=arr)
- # In[16]:
- def kmeans_clustering(data_processing=1, k=2, random_state=22, kmeans_input=1, normalize_samples=False, sample_size=50000, silhouette=True, return_only=False):
- '''This function applies k-means clustering to varying input features. The basis is the pixelplotting_array, which contains the five 1000x1000 current grids recorded at five different bias voltages. The argument data_processing selects the signal path; the default is the R value of the lock-in amplifier.
- The output is a 1000x1000 grid of cluster numbers (running from 1 to k), the different scores for finding the optimal k, and the threshold array holding the min/mean/max current of each group.'''
- #pixelplotting_array = plotIV(data_processing=data_processing, return_only=True)[1]
- if data_processing == 1:
- pixelplotting_array = np.array(load_cache_currents()["pixelplotting_array"])
- if data_processing == 3:
- pixelplotting_array = np.array(load_cache_currents()["pixelplotting_array_direct"])
- pixelvectors_original = pixelplotting_array.reshape(5, 1000000).T # each 5D vector represents 1 pixel with its 5 currents at bias voltages from -200 to +200 mV
- # 5 absolute values
- if kmeans_input == 1:
- pixelvectors = pixelvectors_original
- # 4 absolute values (from -200 to +100)
- if kmeans_input == 1.1:
- pixelvectors = pixelvectors_original.T[:4].T
- # 4 slopes
- if kmeans_input == 2:
- pixelvectors = pixelvectors_original
- pixelvectors = np.array([
- pixelvectors.T[0]-pixelvectors.T[1],
- pixelvectors.T[1]-pixelvectors.T[2],
- pixelvectors.T[2]-pixelvectors.T[3],
- pixelvectors.T[3]-pixelvectors.T[4]
- ]).T
- # 4 slopes + 1 absolute value (average)
- if kmeans_input == 3:
- pixelvectors = pixelvectors_original
- pixelvectors = np.array([
- pixelvectors.T[0]-pixelvectors.T[1],
- pixelvectors.T[1]-pixelvectors.T[2],
- pixelvectors.T[2]-pixelvectors.T[3],
- pixelvectors.T[3]-pixelvectors.T[4],
- np.nanmean(pixelvectors,axis=1)
- ]).T
- # 4 slopes + 1 absolute value (-200 mV)
- if kmeans_input == 3.1:
- pixelvectors = pixelvectors_original
- pixelvectors = np.array([
- pixelvectors.T[0]-pixelvectors.T[1],
- pixelvectors.T[1]-pixelvectors.T[2],
- pixelvectors.T[2]-pixelvectors.T[3],
- pixelvectors.T[3]-pixelvectors.T[4],
- pixelvectors.T[0]
- ]).T
- # 4 slopes + 1 absolute value (-100 mV)
- if kmeans_input == 3.2:
- pixelvectors = pixelvectors_original
- pixelvectors = np.array([
- pixelvectors.T[0]-pixelvectors.T[1],
- pixelvectors.T[1]-pixelvectors.T[2],
- pixelvectors.T[2]-pixelvectors.T[3],
- pixelvectors.T[3]-pixelvectors.T[4],
- pixelvectors.T[1]
- ]).T
- # 4 slopes + 1 absolute value (0 mV)
- if kmeans_input == 3.3:
- pixelvectors = pixelvectors_original
- pixelvectors = np.array([
- pixelvectors.T[0]-pixelvectors.T[1],
- pixelvectors.T[1]-pixelvectors.T[2],
- pixelvectors.T[2]-pixelvectors.T[3],
- pixelvectors.T[3]-pixelvectors.T[4],
- pixelvectors.T[2]
- ]).T
- # 4 slopes + 1 absolute value (+100 mV)
- if kmeans_input == 3.4:
- pixelvectors = pixelvectors_original
- pixelvectors = np.array([
- pixelvectors.T[0]-pixelvectors.T[1],
- pixelvectors.T[1]-pixelvectors.T[2],
- pixelvectors.T[2]-pixelvectors.T[3],
- pixelvectors.T[3]-pixelvectors.T[4],
- pixelvectors.T[3]
- ]).T
- # 4 slopes + 1 absolute value (+200 mV)
- if kmeans_input == 3.5:
- pixelvectors = pixelvectors_original
- pixelvectors = np.array([
- pixelvectors.T[0]-pixelvectors.T[1],
- pixelvectors.T[1]-pixelvectors.T[2],
- pixelvectors.T[2]-pixelvectors.T[3],
- pixelvectors.T[3]-pixelvectors.T[4],
- pixelvectors.T[4]
- ]).T
- # 2 slopes (-200 to +200, +100 to +200)
- if kmeans_input == 4:
- pixelvectors = pixelvectors_original
- pixelvectors = np.array([
- pixelvectors.T[0]-pixelvectors.T[4],
- pixelvectors.T[3]-pixelvectors.T[4]
- ]).T
- # only I(V=0)
- if kmeans_input == 5:
- pixelvectors = np.empty((1000000,5))
- pixelvectors[np.isnan(pixelvectors_original).any(axis=1)] = np.nan
- pixelvectors[~np.isnan(pixelvectors_original).any(axis=1)] = pixelvectors_original[np.newaxis,:].T[2][~np.isnan(pixelvectors_original).any(axis=1)]
- pixelvectors = np.array([
- pixelvectors.T[2]
- ]).T
- # only 1 slope (-200 to +200)
- if kmeans_input == 6:
- pixelvectors = pixelvectors_original
- pixelvectors = np.array([
- pixelvectors.T[0]-pixelvectors.T[4]
- ]).T
- # only 1 slope (+100 to +200)
- if kmeans_input == 7:
- pixelvectors = pixelvectors_original
- pixelvectors = np.array([
- pixelvectors.T[3]-pixelvectors.T[4]
- ]).T
- # only 1 slope (-200 to -100)
- if kmeans_input == 7.1:
- pixelvectors = pixelvectors_original
- pixelvectors = np.array([
- pixelvectors.T[0]-pixelvectors.T[1]
- ]).T
- # only 1 slope (-100 to 0)
- if kmeans_input == 7.2:
- pixelvectors = pixelvectors_original
- pixelvectors = np.array([
- pixelvectors.T[1]-pixelvectors.T[2]
- ]).T
- # only 1 slope (0 to +100)
- if kmeans_input == 7.3:
- pixelvectors = pixelvectors_original
- pixelvectors = np.array([
- pixelvectors.T[2]-pixelvectors.T[3]
- ]).T
- # linear regression for all five points
- if kmeans_input == 8:
- pixelvectors = pixelvectors_original
- slope, intercept = np.polyfit([-200,-100,0,100,200], pixelvectors[~np.isnan(pixelvectors).any(axis=1)].T,deg=1)
- pixelvectors_new = np.empty((1000000,1))
- pixelvectors_new[~np.isnan(pixelvectors).any(axis=1)] = slope[np.newaxis,:].T
- pixelvectors_new[np.isnan(pixelvectors).any(axis=1)] = np.nan
- pixelvectors = pixelvectors_new
- if return_only: # also return the intercept
- intercept_new = np.empty((1000000,1))
- intercept_new[~np.isnan(pixelvectors).any(axis=1)] = intercept[np.newaxis,:].T
- intercept_new[np.isnan(pixelvectors).any(axis=1)] = np.nan
- intercept = intercept_new
- # linear regression for all five points + intercept
- if kmeans_input == 8.1:
- pixelvectors = pixelvectors_original
- slope, intercept = np.polyfit([-200,-100,0,100,200], pixelvectors[~np.isnan(pixelvectors).any(axis=1)].T,deg=1)
- slope = slope[np.newaxis,:] # add 1 dimension
- intercept = intercept[np.newaxis,:] # add 1 dimension
- pixelvectors_new = np.empty((1000000,2))
- pixelvectors_new[~np.isnan(pixelvectors).any(axis=1)] = np.concatenate((slope,intercept),axis=0).T
- pixelvectors_new[np.isnan(pixelvectors).any(axis=1)] = np.nan
- pixelvectors = pixelvectors_new
- # linear regression for -100 mV to +100 mV
- if kmeans_input == 9:
- pixelvectors = pixelvectors_original
- slope, intercept = np.polyfit([-100,0,100], pixelvectors[~np.isnan(pixelvectors).any(axis=1)][:, 1:4].T,deg=1)
- pixelvectors_new = np.empty((1000000,1))
- pixelvectors_new[~np.isnan(pixelvectors).any(axis=1)] = slope[np.newaxis,:].T
- pixelvectors_new[np.isnan(pixelvectors).any(axis=1)] = np.nan
- pixelvectors = pixelvectors_new
- if return_only: # also return the intercept
- intercept_new = np.empty((1000000,1))
- intercept_new[~np.isnan(pixelvectors).any(axis=1)] = intercept[np.newaxis,:].T
- intercept_new[np.isnan(pixelvectors).any(axis=1)] = np.nan
- intercept = intercept_new
- # linear regression for -200 mV to +100 mV
- if kmeans_input == 10:
- pixelvectors = pixelvectors_original
- slope, intercept = np.polyfit([-200,-100,0,100], pixelvectors[~np.isnan(pixelvectors).any(axis=1)][:, 0:4].T,deg=1)
- pixelvectors_new = np.empty((1000000,1))
- pixelvectors_new[~np.isnan(pixelvectors).any(axis=1)] = slope[np.newaxis,:].T
- pixelvectors_new[np.isnan(pixelvectors).any(axis=1)] = np.nan
- pixelvectors = pixelvectors_new
- if return_only: # also return the intercept
- intercept_new = np.empty((1000000,1))
- intercept_new[~np.isnan(pixelvectors).any(axis=1)] = intercept[np.newaxis,:].T
- intercept_new[np.isnan(pixelvectors).any(axis=1)] = np.nan
- intercept = intercept_new
- if return_only:
- if kmeans_input in [8,8.1,9,10]:
- return pixelvectors, intercept
- else:
- return pixelvectors
- data = pixelvectors[~np.isnan(pixelvectors).any(axis=1)] # delete all vectors which contain NaN values
- if not normalize_samples: # maximum of ALL values is 1 and minimum of ALL values is 0
- scaler = MinMaxScaler()
- scaler.fit(data)
- data_transformed = scaler.transform(data)
- else: # maximum of each curve is 1 and minimum of each curve is 0
- if kmeans_input in [5,6,7]:
- raise ValueError("For kmeans_input = 5, 6 or 7 the samples cannot be normalized, as they will all be set to 0.01, leading k-means to put all samples in group 1.")
- data_transformed = data/np.linalg.norm(data, ord=2, axis=1, keepdims=True)
- km = KMeans(n_clusters = k)
- groups = km.fit_predict(data_transformed) + 1 # Adding 1 calls the first group "group 1" instead of "group 0". Also, it avoids confusion with the zeros_array
- number_sqdist = km.inertia_
- if silhouette: # the silhouette score slows down the function considerably, so it is optional
- silhouette = silhouette_score(data_transformed, groups, sample_size=sample_size)
- else:
- silhouette = float("NaN")
- calinski_harabasz = calinski_harabasz_score(data_transformed, groups)
- no_data = np.isnan(pixelvectors).any(axis=1) # returns a 1D array with True if a vector contains a NaN value, else False
- no_data_count = no_data.sum() # counts all the vectors with NaNs
- data_count = 1000000-no_data_count
- zeros_array = np.zeros(1000000)
- zeros_array[~no_data] = groups # fills in all the group numbers where there are no NaNs in a vector
- zeros_array[no_data] = np.nan # where there are NaNs in a vector: assign NaN as "group number"
- cluster_array = zeros_array.reshape(1000,1000)
- threshold = np.zeros(3*k).reshape(k,3)
- for k_index in range(1,k+1):
- if kmeans_input == 5:
- values = pixelplotting_array[2, cluster_array == k_index]
- else:
- values = pixelplotting_array[:, cluster_array == k_index]
- threshold_min = np.nanmin(values.T)
- threshold_avg = np.nanmean(values.T)
- threshold_max = np.nanmax(values.T)
- threshold[k_index-1] = np.array([threshold_min, threshold_avg, threshold_max])
- return cluster_array, number_sqdist, silhouette, calinski_harabasz, threshold
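- # In[ ]:
- # Hedged sanity check (toy vectors, not the cached data): the 4-slope feature built for
- # kmeans_input == 2 is just the negative first difference along the voltage axis, so the
- # stacked subtractions above can be verified against np.diff.
- toy = np.array([[5.0, 4.0, 3.0, 2.0, 1.0],
-                 [1.0, 2.0, 4.0, 8.0, 16.0]])
- stacked = np.array([toy.T[0]-toy.T[1], toy.T[1]-toy.T[2],
-                     toy.T[2]-toy.T[3], toy.T[3]-toy.T[4]]).T
- assert np.allclose(stacked, -np.diff(toy, axis=1))
- print(stacked)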
- # In[17]:
- def find_k(method="all", dpi=None, savepng=False):
- figsize=(12,12)
- fontsize = figsize[0]
- fig = plt.figure(figsize=figsize)
- ax1 = fig.add_subplot(111)
- x = []
- y = []
- if method == "elbow": # elbow method
- for k in kmeansDic.keys():
- x.append(int(k))
- y.append(np.array(kmeansDic[str(k)]["ssqd"]))
- plt.scatter(x,y)
- string1 = "Elbow score"
- string2 = "Sum of squared distances to respective centroid"
- string3 = "elbow" # only for printing PNGs
- if method == "silhouette": # silhouette score := (b-a) divided by max(a,b) ; where a is the intra-cluster distance and b is the nearest-cluster distance
- for k in kmeansDic.keys():
- x.append(int(k))
- y.append(np.array(kmeansDic[str(k)]["silhouette"]))
- plt.scatter(x,y)
- string1 = "Silhouette score"
- string2 = "(b-a)/max(a,b)"
- string3 = "silhouette" # only for printing PNGs
- if method == "calinski_harabasz": # Calinski-Harabasz score := within-cluster disperson divided by between-cluster dispersion
- for k in kmeansDic.keys():
- x.append(int(k))
- y.append(np.array(kmeansDic[str(k)]["calinski_harabasz"]))
- plt.scatter(x,y)
- string1 = "Calinski-Harabasz score"
- string2 = "ratio between within-cluster and between-cluster disperson"
- string3 = "calinski_harabasz" # only for printing PNGs
- if method == "all": # compare all scores next to each other
- try:
- for k in kmeansDic.keys():
- x.append(int(k))
- y.append(np.array(kmeansDic[str(k)]["ssqd"]))
- y_normalized = []
- for entry in y:
- y_normalized.append(entry/np.max(y))
- print("elbow:", str([round(i,2) for i in y_normalized]))
- print("elbow (original):", str([round(float(i),2) for i in y]))
- elbow_scatter = ax1.scatter(x,y_normalized, c="red")
- except:
- print("No value for elbow in this kmeans dictionary.") # not actually needed, because there are NaN values from the kmeans_clustering() function
- try:
- x.clear() # don't take over
- y.clear() # values from ssqd!
- for k in kmeansDic.keys():
- x.append(int(k))
- y.append(np.array(kmeansDic[str(k)]["silhouette"]))
- y_normalized = []
- for entry in y:
- y_normalized.append(entry/np.max(y))
- print("silhouette:", str([round(i,2) for i in y_normalized]))
- print("silhouette (original):", str([round(float(i),2) for i in y]))
- silhouette_scatter = ax1.scatter(x,y_normalized, c="green")
- except:
- print("No value for silhouette in this kmeans dictionary.")
- try:
- x.clear() # don't take over
- y.clear() # values from silhouette!
- for k in kmeansDic.keys():
- x.append(int(k))
- y.append(np.array(kmeansDic[str(k)]["calinski_harabasz"]))
- y_normalized = []
- for entry in y:
- y_normalized.append(entry/np.max(y))
- print("calinski_harabasz:", str([round(i,2) for i in y_normalized]))
- print("calinski_harabasz (original):", str([round(float(i),2) for i in y]))
- calinski_harabasz_scatter = ax1.scatter(x,y_normalized, c="blue")
- except:
- print("No value for calinski-harabasz in this kmeans dictionary.")
- string1 = "All scores in comparison"
- string2 = "Normalized scores"
- plt.legend((elbow_scatter, silhouette_scatter, calinski_harabasz_scatter), ("Elbow Score", "Silhouette score", "Calinski-Harabasz score"), scatterpoints=1, loc='lower left', ncol=3, fontsize=fontsize)
- ax1.set_xlabel("Chosen k for clustering", labelpad = 10, fontsize=9/5*fontsize, fontweight="bold")
- ax1.set_ylabel(string2, labelpad = 10, fontsize=9/5*fontsize, fontweight="bold")
- plt.title(string1, pad = 15, fontsize=round(9/5*fontsize), fontweight="bold")
- ax1.tick_params(labelsize=9/5*fontsize, length=fontsize)
- ax1.set_xticks(x)
- plt.show()
- if savepng:
- now = datetime.now()
- dt_string = now.strftime("%d-%m-%Y_%H_%M_%S")
- fig.savefig("savefig/find_k_" + string3 + dt_string + ".png", dpi=(fig.dpi if dpi is None else dpi), bbox_inches="tight")
- # In[18]:
- def bar_chart(input1=1, input2=2,input2_random=False):
- figsize = (20,10)
- fontsize = figsize[0]
- fig, ax = plt.subplots(figsize=figsize)
- groups1 = np.array(load_cache_kmeans(kmeans_input = input1)["2"]["cluster_numbers"])
- if input2_random:
- groups2 = np.random.randint(2,size=(1000,1000))+1
- else:
- groups2 = np.array(load_cache_kmeans(kmeans_input = input2)["2"]["cluster_numbers"])
- OneOne = 0
- OneTwo = 0
- TwoOne = 0
- TwoTwo = 0
- for z in np.arange(0,1000,1):
- for y in np.arange(0,1000,1):
- if groups1[y][z] == 1 and groups2[y][z] == 1:
- OneOne += 1
- if groups1[y][z] == 1 and groups2[y][z] == 2:
- OneTwo += 1
- if groups1[y][z] == 2 and groups2[y][z] == 1:
- TwoOne += 1
- if groups1[y][z] == 2 and groups2[y][z] == 2:
- TwoTwo += 1
- labels = ["1 & 1", "2 & 2", "1 & 2", "2 & 1"]
- comparisons = (OneOne,TwoTwo,OneTwo,TwoOne)
- index = np.arange(4)
- bar_width = 0.9
- plt.bar(index, comparisons, bar_width, color="green")
- plt.xticks(index, labels) # labels get centered
- for p in ax.patches: # add height label above bins
- width = p.get_width()
- height = p.get_height()
- x, y = p.get_xy()
- ax.annotate(height, (x + width/2, y + height*1.02), ha='center', fontsize=fontsize)
- ax.set_xlabel("Compared groups", labelpad = 10, fontsize=9/5*fontsize, fontweight="bold")
- ax.set_ylabel("Counts", labelpad = 10, fontsize=9/5*fontsize, fontweight="bold")
- ax.tick_params(labelsize=fontsize, length=4/5*fontsize)
- plt.title("Bar Chart", pad = 15, fontsize=round(9/5*fontsize), fontweight="bold")
- plt.show()
- #bar_chart(input1=1, input2=4)
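- # In[ ]:
- # Hedged sketch: the four pairwise counts in bar_chart can also be obtained without the
- # million-iteration Python loop, using vectorised boolean masks (same semantics, assuming
- # both group arrays contain only the values 1 and 2; random stand-ins are used here).
- g1_toy = np.random.randint(1, 3, size=(1000, 1000))
- g2_toy = np.random.randint(1, 3, size=(1000, 1000))
- for a in (1, 2):
-     for b in (1, 2):
-         print(f"{a} & {b}:", np.sum((g1_toy == a) & (g2_toy == b)))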
- # In[47]:
- corr_coefficient(kmeans_inputs=[1,2])
- # In[453]:
- #arr1 = np.array(load_cache_currents()["pixelplotting_array"][0])
- arr1 = np.array(load_cache_kmeans(kmeans_input = 7, data_processing=1)["2"]["cluster_numbers"])
- arr2 = np.array(load_cache_kmeans(kmeans_input = 1, data_processing=1)["2"]["cluster_numbers"])
- #arr2 = plotMedianCorrelation(return_only=True)[1]
- corr_coefficient(compare_any=(arr1,arr2))
- # In[446]:
- #arr1 = np.array(load_cache_currents()["pixelplotting_array"][0])
- arr1 = np.array(load_cache_kmeans(kmeans_input = 1, data_processing=1)["2"]["cluster_numbers"])
- arr2 = plotMedianCorrelation(return_only=True)[1]
- corr_coefficient(compare_any=(arr1,arr2))
- # In[19]:
- def corr_coefficient(currents=None, kmeans_inputs=None, compare_any=None, random=False):
- """Given two grids of same shape, this function determines the correlation coefficient between both grids.
- ARGUMENTS:
- grid1 (2D-Array): Grid of data, e. g. of shape 1000 x 1000
- grid2 (2D-Array): Grid of data, e. g. of shape 1000 x 1000
- RETURNS:
- correlation (float): correlation coefficient. 1 means positively correlated, -1 means negatively correlated
- """
- if isinstance(currents, (list, tuple)):
- scan_order = [185,183,181,182,186]
- grid1 = np.array(load_cache_currents()["pixelplotting_array"][scan_order.index(currents[0])])
- if random:
- grid2 = np.random.uniform(np.nanmin(grid1),np.nanmax(grid1),size=(np.shape(grid1)))
- else:
- grid2 = np.array(load_cache_currents()["pixelplotting_array"][scan_order.index(currents[1])])
- if isinstance(kmeans_inputs, (list, tuple)):
- grid1 = np.array(load_cache_kmeans(kmeans_input = kmeans_inputs[0])["2"]["cluster_numbers"])
- if random:
- grid2 = np.random.randint(2,size=(np.shape(grid1)))+1
- else:
- grid2 = np.array(load_cache_kmeans(kmeans_input = kmeans_inputs[1])["2"]["cluster_numbers"])
- if isinstance(compare_any, (list, tuple)):
- grid1 = compare_any[0]
- grid2 = compare_any[1]
- grid1 = grid1.flatten() # flatten both arrays to simplify calculations
- grid2 = grid2.flatten()
- no_nan_ind1 = ~ np.isnan(grid1) # delete all NaN-values from both grids
- grid1 = grid1[no_nan_ind1]
- grid2 = grid2[no_nan_ind1]
- no_nan_ind2 = ~ np.isnan(grid2)
- grid1 = grid1[no_nan_ind2]
- grid2 = grid2[no_nan_ind2]
- if np.size(grid1) != np.size(grid2):
- raise Exception("Both given arrays must have the same size and shape.")
- mean1 = np.mean(grid1) # compute the mean values
- mean2 = np.mean(grid2)
- stdev1 = np.std(grid1) # compute the (population) standard deviations
- stdev2 = np.std(grid2)
- # Pearson r = mean of the products of the standardized values; the 1/N factor matches the ddof=0 standard deviations above
- correlation = np.mean(((grid1 - mean1) / stdev1) * ((grid2 - mean2) / stdev2))
- return correlation
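- # In[ ]:
- # Hedged consistency check on random data: after the NaN filtering, corr_coefficient should
- # agree with NumPy's own Pearson estimate (np.corrcoef) up to floating-point error.
- rng = np.random.default_rng(22)
- a_toy = rng.normal(size=(100, 100))
- b_toy = 0.5*a_toy + rng.normal(size=(100, 100))
- print(corr_coefficient(compare_any=(a_toy, b_toy)))
- print(np.corrcoef(a_toy.ravel(), b_toy.ravel())[0, 1])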
- # In[608]:
- scan_order = [185,183,181,182,186]
- current_array1 = np.array(load_cache_currents()["pixelplotting_array_direct"][scan_order.index(185)]).ravel()[np.newaxis,:].T
- current_array1 = current_array1 - np.nanmin(current_array1)
- current_array2 = np.array(load_cache_currents()["pixelplotting_array_direct"][scan_order.index(183)]).ravel()[np.newaxis,:].T
- current_array2 = current_array2 - np.nanmin(current_array2)
- current_array3 = np.array(load_cache_currents()["pixelplotting_array_direct"][scan_order.index(181)]).ravel()[np.newaxis,:].T
- current_array3 = current_array3 - np.nanmin(current_array3)
- current_array4 = np.array(load_cache_currents()["pixelplotting_array_direct"][scan_order.index(182)]).ravel()[np.newaxis,:].T
- current_array4 = current_array4 - np.nanmin(current_array4)
- current_array5 = np.array(load_cache_currents()["pixelplotting_array_direct"][scan_order.index(186)]).ravel()[np.newaxis,:].T
- current_array5 = current_array5 - np.nanmin(current_array5)
- numpy_data = np.concatenate((current_array1,current_array2,current_array3,current_array4,current_array5),axis=1)
- print(np.shape(current_array1))
- print(current_array5[~np.isnan(current_array1)])
- for i in range(1,20):
- print(numpy_data[15032*i])
- # In[40]:
- pixelplotting_array = np.array(load_cache_currents()["pixelplotting_array"])[4].ravel()
- print(len(pixelplotting_array[~np.isnan(pixelplotting_array)]))
- # In[53]:
- violin(currents=True, data_processing=3, savepng=True)
- # In[61]:
- plotXBIC2(singleplots=[181])
- # In[96]:
- scan_order = [185,183,181,182,186]
- current_array1 = np.array(load_cache_currents()["pixelplotting_array"][scan_order.index(185)])
- current_array2 = np.array(load_cache_currents()["pixelplotting_array"][scan_order.index(183)])
- current_array3 = np.array(load_cache_currents()["pixelplotting_array"][scan_order.index(181)])
- current_array4 = np.array(load_cache_currents()["pixelplotting_array"][scan_order.index(182)])
- current_array5 = np.array(load_cache_currents()["pixelplotting_array"][scan_order.index(186)])
- x = [-200,-100,0,100,200]
- y = [np.nanstd(current_array1),np.nanstd(current_array2),np.nanstd(current_array3),np.nanstd(current_array4),np.nanstd(current_array5)]
- plt.plot(x,y)
- # In[97]:
- current_array1 = np.array(load_cache_currents()["pixelplotting_array_direct"][scan_order.index(185)])
- current_array2 = np.array(load_cache_currents()["pixelplotting_array_direct"][scan_order.index(183)])
- current_array3 = np.array(load_cache_currents()["pixelplotting_array_direct"][scan_order.index(181)])
- current_array4 = np.array(load_cache_currents()["pixelplotting_array_direct"][scan_order.index(182)])
- current_array5 = np.array(load_cache_currents()["pixelplotting_array_direct"][scan_order.index(186)])
- x = [-200,-100,0,100,200]
- y = [np.nanstd(current_array1),np.nanstd(current_array2),np.nanstd(current_array3),np.nanstd(current_array4),np.nanstd(current_array5)]
- plt.plot(x,y)
- # In[98]:
- violin(currents=True, data_processing=3, title=False, savepng=True)
- # In[20]:
- def violin(kmeans_input=7,currents=False, title=True, data_processing=1, dpi=None, savepng=False):
- start = time.time()
- figsize = (30*0.75,15*0.75)
- fontsize = figsize[0]
- fig, ax = plt.subplots(figsize=figsize)
- scan_order = [185,183,181,182,186]
- xlabels = ["-200","-100","0","+100","+200"]
- if currents:
- if data_processing == 1:
- current_array1 = np.array(load_cache_currents()["pixelplotting_array"][scan_order.index(185)]).ravel()[np.newaxis,:].T
- current_array2 = np.array(load_cache_currents()["pixelplotting_array"][scan_order.index(183)]).ravel()[np.newaxis,:].T
- current_array3 = np.array(load_cache_currents()["pixelplotting_array"][scan_order.index(181)]).ravel()[np.newaxis,:].T
- current_array4 = np.array(load_cache_currents()["pixelplotting_array"][scan_order.index(182)]).ravel()[np.newaxis,:].T
- current_array5 = np.array(load_cache_currents()["pixelplotting_array"][scan_order.index(186)]).ravel()[np.newaxis,:].T
- if data_processing == 3:
- current_array1 = np.array(load_cache_currents()["pixelplotting_array_direct"][scan_order.index(185)]).ravel()[np.newaxis,:].T
- current_array1 = current_array1 - np.nanmin(current_array1)
- current_array2 = np.array(load_cache_currents()["pixelplotting_array_direct"][scan_order.index(183)]).ravel()[np.newaxis,:].T
- current_array2 = current_array2 - np.nanmin(current_array2)
- current_array3 = np.array(load_cache_currents()["pixelplotting_array_direct"][scan_order.index(181)]).ravel()[np.newaxis,:].T
- current_array3 = current_array3 - np.nanmin(current_array3)
- current_array4 = np.array(load_cache_currents()["pixelplotting_array_direct"][scan_order.index(182)]).ravel()[np.newaxis,:].T
- current_array4 = current_array4 - np.nanmin(current_array4)
- current_array5 = np.array(load_cache_currents()["pixelplotting_array_direct"][scan_order.index(186)]).ravel()[np.newaxis,:].T
- current_array5 = current_array5 - np.nanmin(current_array5)
- numpy_data = np.concatenate((current_array1,current_array2,current_array3,current_array4,current_array5),axis=1)
- df = pd.DataFrame(data=numpy_data, columns=xlabels)
- ax = sn.violinplot(data=df, scale="count",inner="box")
- else:
- numpy_dataDic = {}
- for scan_index in scan_order:
- current_array_scan = np.array(load_cache_currents()["pixelplotting_array"][scan_order.index(scan_index)])
- data = current_array_scan.ravel()[np.newaxis,:].T
- scan = np.repeat(scan_index,len(data))[np.newaxis,:].T
- group = np.array(load_cache_kmeans(kmeans_input = kmeans_input)["2"]["cluster_numbers"]).ravel()[np.newaxis,:].T
- numpy_dataDic[scan_index] = np.concatenate((data,scan,group),axis=1)
- #plt.axvline(x=scan_index-181.5, color="grey", lw=1)
- numpy_data = np.concatenate((numpy_dataDic[185],numpy_dataDic[183],numpy_dataDic[181],numpy_dataDic[182],numpy_dataDic[186]))
- df = pd.DataFrame(data=numpy_data, columns=["XBIC","Bias voltage (mV)","Group"])
- df = df.replace(1.0,"Group 1")
- df = df.replace(2.0,"Group 2")
- ax = sn.violinplot(x = "Bias voltage (mV)", y = "XBIC", hue = "Group", data = df, order=[185,183,181,182,186], palette = "viridis", split=True, linewidth = 0, scale = "count", inner = "box", ax = ax)
- ax.set_xlabel("Bias voltage (mV)", labelpad = 10, fontsize=9/5*fontsize, fontweight="bold")
- if data_processing == 1:
- ax.set_ylabel("XBIC", labelpad = 10, fontsize=9/5*fontsize, fontweight="bold")
- if data_processing == 3:
- ax.set_ylabel("Current - Current$\mathregular{_{min}}$", labelpad = 10, fontsize=9/5*fontsize, fontweight="bold")
- ax.tick_params(labelsize=fontsize, length=4/5*fontsize)
- ax.xaxis.set_ticks([0,1,2,3,4])
- ax.xaxis.set_ticklabels(xlabels, fontsize=fontsize)
- # add minor ticks for XBIC
- if data_processing == 1:
- ax.yaxis.set_minor_locator(MultipleLocator(10))
- if data_processing == 3:
- ax.yaxis.set_minor_locator(MultipleLocator(500))
- ax.tick_params(which='minor', length=2/5*fontsize)
- plt.grid(axis="y", which="minor",lw=0.25)
- plt.grid(axis="y")
- if title:
- plt.title("Current distribution", pad = 15, fontsize=round(9/5*fontsize), fontweight="bold")
- # add vertical lines
- for x in range(5):
- plt.axvline(x=x+0.5, color="grey", lw=1)
- # meta = {
- # "colour": {"a": "red", "b": "blue"},
- # "label": {"a": "this a", "b": "this b"},
- # }
- # handles = [
- # Patch(
- # facecolor=meta["colour"][x],
- # label=meta["label"][x],
- # )
- # for x in meta["colour"].keys()
- # ]
- # fig.legend(
- # handles=handles,
- # ncol=2,
- # loc="upper right",
- # bbox_to_anchor=(0.5, 0.5),
- # fontsize=10,
- # handlelength=1,
- # handleheight=1,
- # frameon=False,
- # )
- ax.legend(prop={'size': fontsize})
- #ax.get_legend().remove()
- plt.show()
- end = time.time()
- print(f"Plotting of the violin plots took {str(round(end-start,2))} seconds.")
- if savepng:
- now = datetime.now()
- dt_string = now.strftime("%d-%m-%Y_%H_%M_%S")
- fig.savefig("savefig/violin_" + dt_string + ".png", dpi=(fig.dpi if dpi is None else dpi), bbox_inches="tight")
- # In[434]:
- plotMedianCorrelation()
- # In[133]:
- for i in [2,3,4]:
- currents = np.asarray(load_cache_currents()["pixelplotting_array"]) # Shape: (5, 1000, 1000)
- medians = np.nanmedian(currents, axis=(1, 2), keepdims=True) # Shape: (5, 1, 1)
- nanmaxs = np.nanmax(currents, axis=(1, 2), keepdims=True)
- nanmins = np.nanmin(currents, axis=(1, 2), keepdims=True)
- results = np.full_like(currents, np.nan) # Shape: (5, 1000, 1000)
- results[currents > medians] = 1.0
- results[currents < medians] = 2.0
- #results[currents > medians and currents < (nanmaxs - 10)] = 1.0
- #results[currents < medians and currents < (nanmins + 10)] = 2.0
- counter_good_keeper = 0
- counter_bad_keeper = 0
- counter_not_keeper = 0
- mask = np.empty((1000,1000),dtype=bool)
- counter = 0
- not_NaN_counter = 0
- for z in np.linspace(0,999,1000,dtype=int):
- for y in np.linspace(0,999,1000,dtype=int):
- yz_list = []
- for scan in range(5): # do not reuse the outer loop variable i, which selects the scan plotted below
- yz_list.append(results[scan][y,z])
- if all(x==yz_list[0] for x in yz_list) and yz_list[0] == 1.0:
- counter_good_keeper += 1
- mask[y,z] = True
- if all(x==yz_list[0] for x in yz_list) and yz_list[0] == 2.0:
- counter_bad_keeper += 1
- mask[y,z] = True
- if not all(x==yz_list[0] for x in yz_list):
- counter_not_keeper += 1
- mask[y,z] = False
- if not np.isnan(np.sum(yz_list)):
- not_NaN_counter += 1
- counter += 1
- if counter in [10000,100000,400000,900000,999999]:
- print(counter)
- print("------")
- print(counter)
- print(not_NaN_counter)
- print(counter_not_keeper)
- print(counter_good_keeper)
- print(counter_bad_keeper)
- not_keepers = np.empty((1000,1000))
- not_keepers[mask] = 1.0
- not_keepers[~mask] = 2.0
- pixelplotting_array = np.array(load_cache_currents()["pixelplotting_array"])
- not_keepers[np.isnan(pixelplotting_array).any(axis=0)] = np.nan
- plotClusterPosition(plot_any=results[i], title="" ,no_suptitle=True, savepng=True)
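- # In[ ]:
- # Hedged vectorised sketch of the keeper logic above: a pixel is a "keeper" if it falls on
- # the same side of the median in all five scans, which np.all can test along the scan axis
- # in a single call (toy array here instead of the cached currents).
- toy_results = np.random.choice([1.0, 2.0], size=(5, 10, 10))
- keeper_mask = np.all(toy_results == toy_results[0], axis=0) # True where all scans agree
- print(keeper_mask.sum(), "of", keeper_mask.size, "pixels keep their group in every scan")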
- # In[161]:
- def max_interval(precision=0.1, interval_size=20, scan_no=0, return_only=False):
- currents = np.array(load_cache_currents()["pixelplotting_array"])[scan_no].ravel()
- c_min = np.nanmin(currents)
- c_max = np.nanmax(currents)
- counts = np.empty((1,2))
- counter = 0
- for A in np.arange(c_min,c_max,precision):
- count = np.size(np.where(np.logical_and(currents>A,currents<A+interval_size)))
- if counter == 0:
- counts[0,:] = np.array([[A, count]])
- else:
- counts = np.concatenate((counts, np.array([[A, count]])), axis=0)
- counter +=1
- max_counts = counts[counts[:,1] == np.max(counts[:,1])] # rows whose count equals the maximum count
- if return_only:
- return max_counts
- print(max_counts)
- # In[165]:
- max_interval()
- # In[164]:
- max_size = 100
- x = np.arange(1,max_size)
- y = []
- y2= []
- y3= []
- y4= []
- y5= []
- for i in range(1,max_size):
- max_counts = max_interval(precision=1,interval_size=i, scan_no=0, return_only=True)[0,0]
- y.append(max_counts)
- for i in range(1,max_size):
- max_counts = max_interval(precision=1,interval_size=i, scan_no=1, return_only=True)[0,0]
- y2.append(max_counts)
- for i in range(1,max_size):
- max_counts = max_interval(precision=1,interval_size=i, scan_no=2, return_only=True)[0,0]
- y3.append(max_counts)
- for i in range(1,max_size):
- max_counts = max_interval(precision=1,interval_size=i, scan_no=3, return_only=True)[0,0]
- y4.append(max_counts)
- for i in range(1,max_size):
- max_counts = max_interval(precision=1,interval_size=i, scan_no=4, return_only=True)[0,0]
- y5.append(max_counts)
- plt.plot(x,y)
- plt.plot(x,y2)
- plt.plot(x,y3)
- plt.plot(x,y4)
- plt.plot(x,y5)
- # In[21]:
- def plotMedianCorrelation(return_only=False, dpi=None, savepng=False):
- currents = np.asarray(load_cache_currents()["pixelplotting_array"]) # Shape: (5, 1000, 1000)
- medians = np.nanmedian(currents, axis=(1, 2), keepdims=True) # Shape: (5, 1, 1)
- results = np.full_like(currents, np.nan) # Shape: (5, 1000, 1000)
- results[currents > medians] = 1.0
- results[currents < medians] = 2.0
- results[currents == medians] = np.nan
- if return_only:
- return results
- figsize=(15,15)
- fontsize = figsize[0]//2
- fig = plt.figure(figsize=figsize)
- ax1 = fig.add_subplot(111)
- x = [1,2,3,4]
- correlations = []
- for i in range(4):
- arr1 = results[i]
- arr2 = results[i+1]
- correlation = (corr_coefficient(compare_any=[arr1,arr2]))
- correlations.append(correlation)
- print(correlations)
- ax1.plot(x,correlations,"-o",markersize=15, linewidth=1.2)
- # y and z labels
- ax1.set_xlabel("Compared scans", labelpad = 15, fontsize=3*fontsize, fontweight="bold")
- ax1.set_ylabel("Correlation coefficient", labelpad = 15, fontsize=3*fontsize, fontweight="bold")
- ax1.set_title("Median group correlations", pad = 15, fontsize=round(3*fontsize), fontweight="bold")
- ax1.tick_params(labelsize=15/5*fontsize, length=7/5*fontsize)
- ax1.xaxis.set_ticks([1,2,3,4])
- ax1.xaxis.set_ticklabels((["-200 to -100mV","-100 to 0mV","0 to +100mV","+100 to +200mV"]))
- ax1.yaxis.set_minor_locator(MultipleLocator(5))
- ax1.tick_params(which='minor', length=4/5*fontsize)
- plt.grid(axis="y", which="minor",lw=0.25)
- plt.grid(axis="y")
- if savepng:
- now = datetime.now()
- dt_string = now.strftime("%d-%m-%Y_%H_%M_%S")
- fig.savefig("savefig/MedianCorrelation_" + dt_string + ".png", dpi=(fig.dpi if dpi is None else dpi), bbox_inches="tight")
- # In[21]:
- for i in [7,7.1,7.2,7.3]:
- plotClusterCurves(kmeans_input=i)
- # In[496]:
- for i in [7,7.1,7.2,7.3]:
- plotClusterPosition(kmeans_input=i)
- # In[431]:
- for i in range(1,10):
- if i in [8,8.1,9]:
- kmeansDic = load_cache_kmeans(kmeans_input=i, k_highest=10, silhouette_setting=False)
- else:
- kmeansDic = load_cache_kmeans(kmeans_input=i, k_highest=10, silhouette_setting=True)
- find_k()
- # In[614]:
- for i in [1,2,3.1,4,5,6,7,8,9]:
- plotClusterPosition(data_processing=3, kmeans_input=i)
- #plotClusterPosition(kmeans_input=i)
- # In[82]:
- SAVE = False
- # code for saving all pictures:
- # save all figures for different kmeans inputs
- for k_index in [2]:
- #for kmeans_input_index in [1,2,3.1,4,5,6,7]:
- for kmeans_input_index in [1,2,3.1,4,5,6,7,8,9]:
- for normalize_samples_index in [False]:
- for normalize_curves_index in [False]:
- # load the respective cached dictionary for the definition of "kmeansDic" throughout this project
- silhouette_setting = False # True should be selected for complete data. If False is selected, silhouette is just assigned NaN and omitted, saving a lot of computation time.
- sample_size = 300000 # this number refers to the sample size in the silhouette_score() function which is computationally very expensive
- k_lowest = 2
- k_highest = 3
- kmeans_input = kmeans_input_index
- data_processing = 1
- normalize_samples = normalize_samples_index
- try:
- if silhouette_setting:
- silhouette_str = f"_{sample_size}-samplesize"
- else:
- silhouette_str = ""
- except:
- continue
- if not normalize_samples:
- normalize_samples_str = "feature normalized"
- else:
- normalize_samples_str = "feature & sample normalized"
- suptitle = f"k = {k_index}, kmeans-input = {kmeans_input}, " + normalize_samples_str
- if normalize_curves_index:
- savepng_str = f"{k_index}-k_{kmeans_input_index}-kmeans-input_{normalize_samples_index}-normalize-samples_True-normalize-curves"
- else:
- savepng_str = f"{k_index}-k_{kmeans_input_index}-kmeans-input_{normalize_samples_index}-normalize-samples"
- plotClusterPosition(k=k_index, kmeans_input=kmeans_input_index, title="", no_suptitle=True, suptitle="", dpi=200, savepng=SAVE, savepng_str=savepng_str)
- #plotClusterCurves(k=k_index,compare=True,k_curves=1, kmeans_input=kmeans_input_index,normalize_curves=normalize_curves_index, suptitle="", title=False, dpi=200, savepng=SAVE, savepng_str=savepng_str)
- # <font size="3"><b>Run the cells at the bottom to save variables in working memory to speed up calculations that would otherwise include computationally expensive lines of code
- # In[195]:
- # To speed up all calculations remarkably: create a dictionary with all the current arrays.
- # Run each time you start working in this notebook to keep the values in the global namespace.
- start = time.time()
- IV_data = plotIV(return_only=True) # call plotIV once instead of once per dictionary entry
- CurrentsDic = {
- "pixelplotting_array": IV_data[0],
- "currents_mean": IV_data[1],
- "pixelplotting_array_direct": plotIV(data_processing=3, return_only=True)[0]
- }
- end = time.time()
- print("Loading all current arrays took", round((end-start), 2), "seconds.")
- ### SAVE CACHE ###
- # with these compression settings (copied from http://www.silx.org/doc/silx/0.2.0/modules/io/dictdump.html), the file size is much smaller, but loading takes much longer, so it is better not to use them!
- SAVE = False
- if SAVE:
- compression = False
- os.chdir("/asap3/petra3/gpfs/p06/2018/data/11005475/scratch_cc/Jan") # just make sure that we are talking about the correct location in the file system
- if compression:
- create_ds_args = {'compression': "gzip",
- 'shuffle': True,
- 'fletcher32': True}
- else:
- create_ds_args = None
- saveh5 = input("Do you want to save the dictionary as a h5 file that other functions will refer to ? (y/n)\n")
- if saveh5.lower() in ["y", "yes"]:
- if not compression:
- fname = "CurrentsDic.h5"
- else:
- fname = "CurrentsDic_COMPRESSED.h5"
- dicttoh5(CurrentsDic, "Cache/"+ fname, create_dataset_args=create_ds_args)
- print("The nested dictionary was saved as \"" + fname + "\" in the Cache folder.")
- if saveh5.lower() in ["n", "no"]:
- now = datetime.now()
- dt_string = now.strftime("%d-%m-%Y_%H_%M_%S")
- fname = "CurrentsDic_no-save_" + dt_string + ".h5"
- dicttoh5(CurrentsDic, "Cache/" + fname)
- print("The nested dictionary was saved as \"" + fname + "\" in the Cache folder.")
- # In[24]:
- # To speed up all calculations remarkably: Create a dictionary with all the warp matrices:
- # Run each time you start working in this notebook to keep the values in the global namespace.
- start = time.time()
- ImageRegistrationDic = {
- "179": ImageRegistration(178,179),
- "180": ImageRegistration(178,180),
- "182": ImageRegistration(181,182),
- "183": ImageRegistration(181,183),
- "185": ImageRegistration(181,185),
- "186": ImageRegistration(181,186),
- "191": ImageRegistration(181,191),
- "192": ImageRegistration(181,192)
- }
- end = time.time()
- print("Calculating all warp matrices took", round((end-start), 2), "seconds.")
- ### SAVE CACHE ###
- # with these compression settings (copied from http://www.silx.org/doc/silx/0.2.0/modules/io/dictdump.html), the file size is much smaller, but loading takes much longer, so it is better not to use them!
- SAVE = False
- if SAVE:
- compression = False
- os.chdir("/asap3/petra3/gpfs/p06/2018/data/11005475/scratch_cc/Jan") # just make sure that we are talking about the correct location in the file system
- if compression:
- create_ds_args = {'compression': "gzip",
- 'shuffle': True,
- 'fletcher32': True}
- else:
- create_ds_args = None
- saveh5 = input("Do you want to save the dictionary as a h5 file that other functions will refer to ? (y/n)\n")
- if saveh5.lower() in ["y", "yes"]:
- if not compression:
- fname = "ImageRegistrationDic.h5"
- else:
- fname = "ImageRegistrationDic_COMPRESSED.h5"
- dicttoh5(ImageRegistrationDic, "Cache/"+ fname, create_dataset_args=create_ds_args)
- print("The nested dictionary was saved as \"" + fname + "\" in the Cache folder.")
- if saveh5.lower() in ["n", "no"]:
- now = datetime.now()
- dt_string = now.strftime("%d-%m-%Y_%H_%M_%S")
- fname = "ImageRegistrationDic_no-save_" + dt_string + ".h5"
- dicttoh5(ImageRegistrationDic, "Cache/" + fname)
- print("The nested dictionary was saved as \"" + fname + "\" in the Cache folder.")
- # In[588]:
- for i in [1,2,3.1,4,5,6,7,8,9]:
- # Create a nested dictionary for all cluster arrays, sum of squared distances, silhouette and calinski_harabasz scores
- start = time.time()
- kmeansDic = {}
- silhouette_setting = False # True should be selected for complete data. If False is selected, silhouette is just assigned NaN and omitted, saving a lot of computation time.
- sample_size = 300000 # this number refers to the sample size in the silhouette_score() function which is computationally very expensive
- k_lowest = 2
- k_highest = 3
- kmeans_input = i
- data_processing = 3
- normalize_samples = False
- for k in range(k_lowest, k_highest+1):
- start_k = time.time()
- cluster_numbers, ssqd, silhouette, calinski_harabasz, threshold = kmeans_clustering(data_processing=data_processing, kmeans_input=kmeans_input, normalize_samples=normalize_samples, k=k, sample_size=sample_size, silhouette=silhouette_setting)
- kmeansDic[str(k)] = {"cluster_numbers": cluster_numbers, "ssqd": ssqd, "silhouette": silhouette, "calinski_harabasz": calinski_harabasz, "threshold": threshold}
- end = time.time()
- print("Finished entry in nested dictionary for k = " + str(k) + ". That took " + str(round((end-start_k), 2)) + " seconds.")
- end = time.time()
- print("Calculating all cluster arrays took", round((end-start), 2), "seconds.")
- ### SAVE CACHE ###
- # with these compression settings (copied from http://www.silx.org/doc/silx/0.2.0/modules/io/dictdump.html), the file size is much smaller, but loading takes much longer, so it is better not to use them!
- SAVE = False
- if SAVE:
- compression = False
- os.chdir("/asap3/petra3/gpfs/p06/2018/data/11005475/scratch_cc/Jan") # just make sure that we are talking about the correct location in the file system
- if compression:
- create_ds_args = {'compression': "gzip",
- 'shuffle': True,
- 'fletcher32': True}
- else:
- create_ds_args = None
- saveh5 = "y"
- if saveh5.lower() in ["y", "yes"]:
- if silhouette_setting:
- silhouette_str = f"_{sample_size}-samplesize"
- else:
- silhouette_str = ""
- if not compression:
- fname = f"kmeansDic({k_lowest},{k_highest})_{kmeans_input}-kmeans-input_{normalize_samples!s}-normalize-samples_{data_processing}-data-processing_{silhouette_setting!s}-silhouette{silhouette_str}.h5"
- else:
- fname = f"kmeansDic({k_lowest},{k_highest})_{kmeans_input}-kmeans-input_{normalize_samples!s}-normalize-samples_{data_processing}-data-processing_{silhouette_setting!s}-silhouette{silhouette_str}_COMPRESSED.h5"
- dicttoh5(kmeansDic, "Cache/"+ fname, create_dataset_args=create_ds_args)
- print("The nested dictionary was saved as \"" + fname + "\" in the Cache folder.")
- if saveh5.lower() in ["n", "no"]:
- now = datetime.now()
- dt_string = now.strftime("%d-%m-%Y_%H_%M_%S")
- fname = "kmeansDic_no-save_" + dt_string + ".h5"
- dicttoh5(kmeansDic, "Cache/" + fname)
- print("The nested dictionary was saved as \"" + fname + "\" in the Cache folder.")
- # In[435]:
- # Create a nested dictionary for all cluster arrays, sum of squared distances, silhouette and calinski_harabasz scores
- start = time.time()
- kmeansDic = {}
- silhouette_setting = False # True should be selected for complete data. If False is selected, silhouette is just assigned NaN and omitted, saving a lot of computation time.
- sample_size = 300000 # this number refers to the sample size in the silhouette_score() function which is computationally very expensive
- k_lowest = 2
- k_highest = 3
- kmeans_input = 1.1
- data_processing = 1
- normalize_samples = False
- for k in range(k_lowest, k_highest+1):
- start_k = time.time()
- cluster_numbers, ssqd, silhouette, calinski_harabasz, threshold = kmeans_clustering(data_processing=data_processing, kmeans_input=kmeans_input, normalize_samples=normalize_samples, k=k, sample_size=sample_size, silhouette=silhouette_setting)
- kmeansDic[str(k)] = {"cluster_numbers": cluster_numbers, "ssqd": ssqd, "silhouette": silhouette, "calinski_harabasz": calinski_harabasz, "threshold": threshold}
- end = time.time()
- print("Finished entry in nested dictionary for k = " + str(k) + ". That took " + str(round((end-start_k), 2)) + " seconds.")
- end = time.time()
- print("Calculating all cluster arrays took", round((end-start), 2), "seconds.")
- ### SAVE CACHE ###
- # with these compression settings (copied from http://www.silx.org/doc/silx/0.2.0/modules/io/dictdump.html), the file size is much smaller, but loading takes much longer, so it is better not to use them!
- SAVE = False
- if SAVE:
- compression = False
- os.chdir("/asap3/petra3/gpfs/p06/2018/data/11005475/scratch_cc/Jan") # just make sure that we are talking about the correct location in the file system
- if compression:
- create_ds_args = {'compression': "gzip",
- 'shuffle': True,
- 'fletcher32': True}
- else:
- create_ds_args = None
- saveh5 = input("Do you want to save the dictionary as a h5 file that other functions will refer to ? (y/n)\n")
- if saveh5.lower() in ["y", "yes"]:
- if silhouette_setting:
- silhouette_str = f"_{sample_size}-samplesize"
- else:
- silhouette_str = ""
- if not compression:
- fname = f"kmeansDic({k_lowest},{k_highest})_{kmeans_input}-kmeans-input_{normalize_samples!s}-normalize-samples_{data_processing}-data-processing_{silhouette_setting!s}-silhouette{silhouette_str}.h5"
- else:
- fname = f"kmeansDic({k_lowest},{k_highest})_{kmeans_input}-kmeans-input_{normalize_samples!s}-normalize-samples_{data_processing}-data-processing_{silhouette_setting!s}-silhouette{silhouette_str}_COMPRESSED.h5"
- dicttoh5(kmeansDic, "Cache/"+ fname, create_dataset_args=create_ds_args)
- print("The nested dictionary was saved as \"" + fname + "\" in the Cache folder.")
- if saveh5.lower() in ["n", "no"]:
- now = datetime.now()
- dt_string = now.strftime("%d-%m-%Y_%H_%M_%S")
- fname = "kmeansDic_no-save_" + dt_string + ".h5"
- dicttoh5(kmeansDic, "Cache/" + fname)
- print("The nested dictionary was saved as \"" + fname + "\" in the Cache folder.")
- # In[22]:
- def load_cache_currents():
- os.chdir("/asap3/petra3/gpfs/p06/2018/data/11005475/scratch_cc/Jan")
- fpath = "/asap3/petra3/gpfs/p06/2018/data/11005475/scratch_cc/Jan/Cache/CurrentsDic.h5"
- CurrentsDic = h5py.File(fpath, "r")
- return CurrentsDic
- CurrentsDic = load_cache_currents()
- # In[23]:
- # load the cached dictionary for the definition of "ImageRegistrationDic" throughout this project
- def load_cache_IR():
- os.chdir("/asap3/petra3/gpfs/p06/2018/data/11005475/scratch_cc/Jan")
- fpath = "/asap3/petra3/gpfs/p06/2018/data/11005475/scratch_cc/Jan/Cache/ImageRegistrationDic.h5"
- ImageRegistrationDic = h5py.File(fpath, "r")
- return ImageRegistrationDic
- ImageRegistrationDic = load_cache_IR()
- # In[24]:
- # load the respective cached dictionary for the definition of "kmeansDic" throughout this project
- def load_cache_kmeans(silhouette_setting = False, sample_size = 300000, k_lowest = 2, k_highest = 3, kmeans_input = 1, data_processing = 1, normalize_samples = False):
- os.chdir("/asap3/petra3/gpfs/p06/2018/data/11005475/scratch_cc/Jan")
- if silhouette_setting:
- silhouette_str = f"_{sample_size}-samplesize"
- else:
- silhouette_str = ""
- fpath = "/asap3/petra3/gpfs/p06/2018/data/11005475/scratch_cc/Jan/Cache/"+ f"kmeansDic({k_lowest},{k_highest})_{kmeans_input}-kmeans-input_{normalize_samples!s}-normalize-samples_{data_processing}-data-processing_{silhouette_setting!s}-silhouette{silhouette_str}.h5"
- kmeansDic = h5py.File(fpath, "r")
- return kmeansDic
- kmeansDic = load_cache_kmeans()
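- # In[ ]:
- # Hedged usage note: the h5py.File handles returned by the three loaders above are lazy and
- # dict-like; a dataset only becomes a numpy array once wrapped in np.array(), which is the
- # pattern used throughout this notebook.
- print(list(kmeansDic.keys())) # e.g. ['2', '3'] for k_lowest = 2, k_highest = 3
- cluster_map = np.array(kmeansDic["2"]["cluster_numbers"]) # materialise one dataset
- print(cluster_map.shape)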