data processing, don't worry, it's all good

2024-03-13 14:13:51 +01:00
parent 66c6e6c931
commit 87e1af2ea2
6 changed files with 477 additions and 145 deletions

data/data_exploration.ipynb (Normal file, 364 lines)

File diff suppressed because one or more lines are too long


@@ -1,3 +1,5 @@
+import os
+import numpy as np
 import pandas as pd
 from utils.df_utils import *
 
@@ -39,7 +41,7 @@ def get_raw_measure_with_metadata(file: pd.Series) -> tuple[pd.DataFrame, list[p
     return metadata, sliced_experiments
 
 
-def load_data() -> tuple[pd.DataFrame, list[pd.DataFrame]]:
+def load_raw_data() -> tuple[pd.DataFrame, list[pd.DataFrame]]:
     """
     Load all the available data, slice it into individual measures and give the corresponding metadata
     :return: dataframe containing the metadata with one row per sliced measure
@@ -52,3 +54,18 @@ def load_data() -> tuple[pd.DataFrame, list[pd.DataFrame]]:
         metadata = pd.concat([metadata, temp_metadata])
         sliced_experiments.extend(temp_sliced_experiments)
     return metadata, sliced_experiments
+
+
+def data_to_single_df(data: list[pd.DataFrame]) -> pd.DataFrame:
+    """
+    Converts a list of dataframes into a long dataframe. Loses lots of info, to use with care!
+    :param data: list of dataframes of same length
+    :return:
+    """
+    return pd.DataFrame(map(lambda x: np.append(x.index.to_numpy(), x["#Intensity"].to_numpy()), data))
+
+
+def load_data(name: str, path: os.path = os.path.join("data")) -> tuple[pd.DataFrame, pd.DataFrame]:
+    metadata = pd.read_csv(os.path.join(path, name, "metadata.csv"))
+    experiments = pd.read_csv(os.path.join(path, name, "experiments.csv"))
+    return metadata, experiments
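
A note on the new wide layout: data_to_single_df packs each sliced measure into a single row, with the frame's index (presumably the wavenumber axis) in the first half of the columns and the intensities in the second half. That is the layout the len(...)//2 slicing in the preprocessing helpers below relies on. A minimal sketch with toy frames (the values here are made up):

import numpy as np
import pandas as pd

# Two toy "measures" of equal length, standing in for sliced experiments.
m1 = pd.DataFrame({"#Intensity": [1.0, 2.0]}, index=[100.0, 101.0])
m2 = pd.DataFrame({"#Intensity": [3.0, 4.0]}, index=[100.0, 101.0])

# Same expression as data_to_single_df: index values first, intensities after,
# so the first wide row reads [100.0, 101.0, 1.0, 2.0].
wide = pd.DataFrame(map(lambda x: np.append(x.index.to_numpy(), x["#Intensity"].to_numpy()), [m1, m2]))
print(wide)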

File diff suppressed because one or more lines are too long


@@ -1,88 +1,53 @@
 from pybaselines import Baseline
 import numpy as np
 from math import factorial
+from sklearn.preprocessing import StandardScaler
+from scipy.signal import savgol_filter
 
 
-def calculate_baseline(measure):
-    baseline_fitter = Baseline(x_data=measure.index)
-    bkg_2, params_2 = baseline_fitter.iasls(measure["#Intensity"], lam=10, p=1e-2)
+def calculate_baseline(measure, lam=10, p=1e-2):
+    baseline_fitter = Baseline(x_data=measure[:len(measure)//2])
+    bkg_2, params_2 = baseline_fitter.iasls(measure[len(measure)//2:], lam=lam, p=p)
     return bkg_2
 
 
-def adjust_baseline(measure, scale = False):
-    baseline = calculate_baseline(measure)
-    measure["#Intensity"] -= baseline
-    if scale:
-        measure["#Intensity"] /= baseline.max() - baseline.min()
-    return measure
+def adjust_baseline(measure, lam=10, p=1e-2):
+    baseline = calculate_baseline(measure, lam=lam, p=p)
+    return measure[len(measure)//2:] - baseline
 
 
-def savitzky_golay(y, window_size, order, deriv=0, rate=1):
-    r"""Smooth (and optionally differentiate) data with a Savitzky-Golay filter.
-    The Savitzky-Golay filter removes high frequency noise from data.
-    It has the advantage of preserving the original shape and
-    features of the signal better than other types of filtering
-    approaches, such as moving averages techniques.
-    Parameters
-    ----------
-    y : array_like, shape (N,)
-        the values of the time history of the signal.
-    window_size : int
-        the length of the window. Must be an odd integer number.
-    order : int
-        the order of the polynomial used in the filtering.
-        Must be less then `window_size` - 1.
-    deriv: int
-        the order of the derivative to compute (default = 0 means only smoothing)
-    Returns
-    -------
-    ys : ndarray, shape (N)
-        the smoothed signal (or it's n-th derivative).
-    Notes
-    -----
-    The Savitzky-Golay is a type of low-pass filter, particularly
-    suited for smoothing noisy data. The main idea behind this
-    approach is to make for each point a least-square fit with a
-    polynomial of high order over a odd-sized window centered at
-    the point.
-    Examples
-    --------
-    t = np.linspace(-4, 4, 500)
-    y = np.exp( -t**2 ) + np.random.normal(0, 0.05, t.shape)
-    ysg = savitzky_golay(y, window_size=31, order=4)
-    import matplotlib.pyplot as plt
-    plt.plot(t, y, label='Noisy signal')
-    plt.plot(t, np.exp(-t**2), 'k', lw=1.5, label='Original signal')
-    plt.plot(t, ysg, 'r', label='Filtered signal')
-    plt.legend()
-    plt.show()
-    References
-    ----------
-    .. [1] A. Savitzky, M. J. E. Golay, Smoothing and Differentiation of
-       Data by Simplified Least Squares Procedures. Analytical
-       Chemistry, 1964, 36 (8), pp 1627-1639.
-    .. [2] Numerical Recipes 3rd Edition: The Art of Scientific Computing
-       W.H. Press, S.A. Teukolsky, W.T. Vetterling, B.P. Flannery
-       Cambridge University Press ISBN-13: 9780521880688
-    """
-    try:
-        window_size = np.abs(int(window_size))
-        order = np.abs(int(order))
-    except ValueError:
-        raise ValueError("window_size and order have to be of type int")
-    if window_size % 2 != 1 or window_size < 1:
-        raise TypeError("window_size size must be a positive odd number")
-    if window_size < order + 2:
-        raise TypeError("window_size is too small for the polynomials order")
-    order_range = range(order+1)
-    half_window = (window_size -1) // 2
-    # precompute coefficients
-    b = np.mat([[k**i for i in order_range] for k in range(-half_window, half_window+1)])
-    m = np.linalg.pinv(b).A[deriv] * rate**deriv * factorial(deriv)
-    # pad the signal at the extremes with
-    # values taken from the signal itself
-    firstvals = y[0] - np.abs( y[1:half_window+1][::-1] - y[0] )
-    lastvals = y[-1] + np.abs(y[-half_window-1:-1][::-1] - y[-1])
-    y = np.concatenate((firstvals, y, lastvals))
-    return np.convolve(m[::-1], y.T[0], mode='valid')
+def adjust_all_baselines(measures, lam=10, p=1e-2):
+    result = measures.copy(deep=True)
+    for index, row in result.iterrows():
+        result.iloc[index, len(row)//2:] = adjust_baseline(row, lam=lam, p=p)
+    return result
+
+
+def scale_experiments(experiments):
+    result = experiments.copy(deep=True)
+    trans = StandardScaler()
+    scaled = trans.fit_transform(experiments.transpose()[len(experiments.columns)//2:]).T
+    result.iloc[:, len(result.columns)//2:] = scaled
+    return result
+
+
+def apply_smoothing(experiment, window_length=7, polyorder=3):
+    return savgol_filter(experiment[len(experiment)//2:], window_length, polyorder)
+
+
+def smooth_experiments(experiments, window_length=7, polyorder=3):
+    result = experiments.copy(deep=True)
+    for index, row in result.iterrows():
+        result.iloc[index, len(row) // 2:] = apply_smoothing(row, window_length=window_length, polyorder=polyorder)
+    return result
+
+
+def process_experiments(experiments, scale_features=True):
+    baselined_experiments = adjust_all_baselines(experiments)
+    scaled_experiments = scale_experiments(baselined_experiments)
+    smoothed_experiments = smooth_experiments(scaled_experiments)
+    if scale_features:
+        trans = StandardScaler()
+        return trans.fit_transform(smoothed_experiments)
+    else:
+        return smoothed_experiments
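
For orientation, here is a rough, self-contained sketch of the three steps process_experiments chains (iasls baseline removal, per-measure standardization, Savitzky-Golay smoothing), re-implemented inline on synthetic spectra in the wide layout described above. The peak shape, drift, and noise level are invented for illustration; the lam, p, window_length, and polyorder values are the defaults from the diff:

import numpy as np
import pandas as pd
from pybaselines import Baseline
from scipy.signal import savgol_filter
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
x = np.linspace(100, 3000, 200)
rows = []
for _ in range(5):
    peak = np.exp(-((x - 1500) / 80) ** 2)  # single synthetic band
    drift = 1e-4 * x                        # slowly varying baseline
    noise = rng.normal(0, 0.02, x.size)
    rows.append(np.append(x, peak + drift + noise))
experiments = pd.DataFrame(rows)
half = len(experiments.columns) // 2

# Step 1: subtract an iasls baseline from each measure's intensity half.
for i in range(len(experiments)):
    row = experiments.iloc[i]
    bkg, _ = Baseline(x_data=row[:half]).iasls(row[half:], lam=10, p=1e-2)
    experiments.iloc[i, half:] = (row[half:] - bkg).to_numpy()

# Step 2: standardize each measure over its own intensities
# (transposed so StandardScaler sees one measure per column).
experiments.iloc[:, half:] = StandardScaler().fit_transform(experiments.iloc[:, half:].T).T

# Step 3: Savitzky-Golay smoothing, window 7, polynomial order 3.
experiments.iloc[:, half:] = savgol_filter(experiments.iloc[:, half:], 7, 3, axis=1)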


@@ -14,16 +14,16 @@ CHA0SampleSpectraLiquid3_50x_dried_drop_alu_100percent_1800gr_20sec_confocalhigh
 CHA0SampleSpectraLiquid3_50x_dried_drop_alu_100percent_1800gr_20sec_confocalhigh_2accu_round2.txt,CHA0,3.0,liquid,50,alu,100,1800,20,True,2
 CHA0SampleSpectraSolid1_50x_dried_drop_alu_100percent_1800gr_20sec_confocalhigh_2accu_round2 (2).txt,CHA0,1.0,solid,50,alu,100,1800,20,True,2
 CHA0SampleSpectraSolid1_50x_dried_drop_alu_100percent_1800gr_20sec_confocalhigh_2accu_round2.txt,CHA0,1.0,solid,50,alu,100,1800,20,True,2
-CHA0SampleSpectraSolid_50x_dried_drop_alu_100percent_1800gr_20sec_confocalhigh_2accu (2).txt,CHA0,,soli,50,alu,100,1800,20,True,2
-CHA0SampleSpectraSolid_50x_dried_drop_alu_100percent_1800gr_20sec_confocalhigh_2accu(1).txt,CHA0,,soli,50,alu,100,1800,20,True,2
-CHA0SampleSpectraSolid_50x_dried_drop_alu_100percent_1800gr_20sec_confocalhigh_2accu.txt,CHA0,,soli,50,alu,100,1800,20,True,2
+CHA0SampleSpectraSolid_50x_dried_drop_alu_100percent_1800gr_20sec_confocalhigh_2accu (2).txt,CHA0,,solid,50,alu,100,1800,20,True,2
+CHA0SampleSpectraSolid_50x_dried_drop_alu_100percent_1800gr_20sec_confocalhigh_2accu(1).txt,CHA0,,solid,50,alu,100,1800,20,True,2
+CHA0SampleSpectraSolid_50x_dried_drop_alu_100percent_1800gr_20sec_confocalhigh_2accu.txt,CHA0,,solid,50,alu,100,1800,20,True,2
 CHA0SampleSpectra_50x_dried_drop_alu_100percent_1800gr_20sec_confocalhigh_2accu.txt,CHA0,,,50,alu,100,1800,20,True,2
 F113SampleSpectraLiquid3_50x_dried_drop_alu_100percent_1800gr_20sec_confocalhigh_2accu (2).txt,F113,3.0,liquid,50,alu,100,1800,20,True,2
 F113SampleSpectraLiquid3_50x_dried_drop_alu_100percent_1800gr_20sec_confocalhigh_2accu.txt,F113,3.0,liquid,50,alu,100,1800,20,True,2
 F113SampleSpectraLiquid3_50x_dried_drop_alu_100percent_1800gr_20sec_confocalhigh_2accu_round2_otherdrop1 (2).txt,F113,3.0,liquid,50,alu,100,1800,20,True,2
 F113SampleSpectraLiquid3_50x_dried_drop_alu_100percent_1800gr_20sec_confocalhigh_2accu_round2_otherdrop1.txt,F113,3.0,liquid,50,alu,100,1800,20,True,2
-F113SampleSpectraLiquid_50x_dried3_drop_alu_100percent_1800gr_20sec_confocalhigh_2accu_round2_otherdrop.txt,F113,,liqui,50,alu,100,1800,20,True,2
-F113SampleSpectraLiquid_50x_dried_drop_alu_100percent_1800gr_20sec_confocalhigh_2accu.txt,F113,,liqui,50,alu,100,1800,20,True,2
+F113SampleSpectraLiquid_50x_dried3_drop_alu_100percent_1800gr_20sec_confocalhigh_2accu_round2_otherdrop.txt,F113,,liquid,50,alu,100,1800,20,True,2
+F113SampleSpectraLiquid_50x_dried_drop_alu_100percent_1800gr_20sec_confocalhigh_2accu.txt,F113,,liquid,50,alu,100,1800,20,True,2
 F113SampleSpectraSolid2_50x_dried_drop_alu_100percent_1800gr_20sec_confocalhigh_2accu.txt,F113,2.0,solid,50,alu,100,1800,20,True,2
 F113SampleSpectraSolid2_50x_dried_drop_alu_100percent_1800gr_20sec_confocalhigh_2accu_round2 (2).txt,F113,2.0,solid,50,alu,100,1800,20,True,2
 F113SampleSpectraSolid2_50x_dried_drop_alu_100percent_1800gr_20sec_confocalhigh_2accu_round2.txt,F113,2.0,solid,50,alu,100,1800,20,True,2


@@ -6,7 +6,8 @@ df = pd.DataFrame([i for i in os.listdir() if i.endswith(".txt")], columns=["fil
 df["strain"] = df.apply(lambda x: x.file[:4].upper(), axis=1)
 df["replica"] = df.apply(lambda x: x.file.split("_")[0][-1], axis=1)
 df["replica"] = df.apply(lambda x: int(x.replica) if x.replica.isnumeric() else None, axis=1)
-df["phase"] = df.apply(lambda x: x.file.split("_")[0].replace("SampleSpectra", "")[4:-1 if x.replica is not None else None].casefold(), axis=1)
+df["phase"] = df.apply(lambda x: x.file.split("_")[0].replace("SampleSpectra", "")[4:].casefold(), axis=1)
+df["phase"] = df.apply(lambda x: x.phase if pd.isnull(x.replica) else x.phase[:-1], axis=1)
 df["objective"] = df.apply(lambda x: int(x.file.split("_")[1].replace("x", "")), axis=1)
 df["substrate"] = df.apply(lambda x: x.file.split("_")[4].casefold(), axis=1)
 df["laser_power[%]"] = df.apply(lambda x: int(x.file.split("_")[5].replace("percent", "")), axis=1)