data processing, don't worry, it's all good
364
data/data_exploration.ipynb
Normal file
File diff suppressed because one or more lines are too long
@@ -1,3 +1,5 @@
import os
import numpy as np
import pandas as pd
from utils.df_utils import *

@@ -39,7 +41,7 @@ def get_raw_measure_with_metadata(file: pd.Series) -> tuple[pd.DataFrame, list[p
    return metadata, sliced_experiments


def load_data() -> tuple[pd.DataFrame, list[pd.DataFrame]]:
def load_raw_data() -> tuple[pd.DataFrame, list[pd.DataFrame]]:
    """
    Load all the available data, slice it into individual measures and give the corresponding metadata
    :return: dataframe containing the metadata with one row per sliced measure
@@ -52,3 +54,18 @@ def load_data() -> tuple[pd.DataFrame, list[pd.DataFrame]]:
        metadata = pd.concat([metadata, temp_metadata])
        sliced_experiments.extend(temp_sliced_experiments)
    return metadata, sliced_experiments


def data_to_single_df(data: list[pd.DataFrame]) -> pd.DataFrame:
    """
    Converts a list of dataframes into a long dataframe. Loses lots of info, use with care!
    :param data: list of dataframes of the same length
    :return:
    """
    # each output row is the measure's index values followed by its #Intensity values
    return pd.DataFrame(map(lambda x: np.append(x.index.to_numpy(), x["#Intensity"].to_numpy()), data))


def load_data(name: str, path: str = os.path.join("data")) -> tuple[pd.DataFrame, pd.DataFrame]:
    metadata = pd.read_csv(os.path.join(path, name, "metadata.csv"))
    experiments = pd.read_csv(os.path.join(path, name, "experiments.csv"))
    return metadata, experiments

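A minimal usage sketch of the loader functions above (not part of this commit); the dataset name "example_run" and pre-saved metadata.csv / experiments.csv under data/example_run/ are assumptions:

metadata, sliced_experiments = load_raw_data()        # slice every raw file into individual measures
experiments = data_to_single_df(sliced_experiments)   # one row per measure: index values, then #Intensity values
metadata, experiments = load_data("example_run")      # reload the saved CSVs from data/example_run/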
@@ -1,88 +1,53 @@
from pybaselines import Baseline
import numpy as np
from math import factorial
from sklearn.preprocessing import StandardScaler
from scipy.signal import savgol_filter


def calculate_baseline(measure):
    baseline_fitter = Baseline(x_data=measure.index)
    bkg_2, params_2 = baseline_fitter.iasls(measure["#Intensity"], lam=10, p=1e-2)
def calculate_baseline(measure, lam=10, p=1e-2):
    # convention: the first half of a measure row holds the x values, the second half the intensities
    baseline_fitter = Baseline(x_data=measure[:len(measure)//2])
    bkg_2, params_2 = baseline_fitter.iasls(measure[len(measure)//2:], lam=lam, p=p)
    return bkg_2


def adjust_baseline(measure, scale=False):
    baseline = calculate_baseline(measure)
    measure["#Intensity"] -= baseline
    if scale:
        measure["#Intensity"] /= baseline.max() - baseline.min()
    return measure
def adjust_baseline(measure, lam=10, p=1e-2):
    baseline = calculate_baseline(measure, lam=lam, p=p)
    return measure[len(measure)//2:] - baseline


def savitzky_golay(y, window_size, order, deriv=0, rate=1):
    r"""Smooth (and optionally differentiate) data with a Savitzky-Golay filter.
    The Savitzky-Golay filter removes high frequency noise from data.
    It has the advantage of preserving the original shape and
    features of the signal better than other types of filtering
    approaches, such as moving average techniques.
    Parameters
    ----------
    y : array_like, shape (N,)
        the values of the time history of the signal.
    window_size : int
        the length of the window. Must be an odd integer number.
    order : int
        the order of the polynomial used in the filtering.
        Must be less than `window_size` - 1.
    deriv: int
        the order of the derivative to compute (default = 0 means only smoothing)
    Returns
    -------
    ys : ndarray, shape (N)
        the smoothed signal (or its n-th derivative).
    Notes
    -----
    The Savitzky-Golay is a type of low-pass filter, particularly
    suited for smoothing noisy data. The main idea behind this
    approach is to make for each point a least-square fit with a
    polynomial of high order over an odd-sized window centered at
    the point.
    Examples
    --------
    t = np.linspace(-4, 4, 500)
    y = np.exp( -t**2 ) + np.random.normal(0, 0.05, t.shape)
    ysg = savitzky_golay(y, window_size=31, order=4)
    import matplotlib.pyplot as plt
    plt.plot(t, y, label='Noisy signal')
    plt.plot(t, np.exp(-t**2), 'k', lw=1.5, label='Original signal')
    plt.plot(t, ysg, 'r', label='Filtered signal')
    plt.legend()
    plt.show()
    References
    ----------
    .. [1] A. Savitzky, M. J. E. Golay, Smoothing and Differentiation of
       Data by Simplified Least Squares Procedures. Analytical
       Chemistry, 1964, 36 (8), pp 1627-1639.
    .. [2] Numerical Recipes 3rd Edition: The Art of Scientific Computing
       W.H. Press, S.A. Teukolsky, W.T. Vetterling, B.P. Flannery
       Cambridge University Press ISBN-13: 9780521880688
    """
def adjust_all_baselines(measures, lam=10, p=1e-2):
    result = measures.copy(deep=True)
    # note: `index` is used positionally with .iloc, which assumes a default RangeIndex
    for index, row in result.iterrows():
        result.iloc[index, len(row)//2:] = adjust_baseline(row, lam=lam, p=p)
    return result

    try:
        window_size = np.abs(int(window_size))
        order = np.abs(int(order))
    except ValueError:
        raise ValueError("window_size and order have to be of type int")
    if window_size % 2 != 1 or window_size < 1:
        raise TypeError("window_size size must be a positive odd number")
    if window_size < order + 2:
        raise TypeError("window_size is too small for the polynomials order")
    order_range = range(order+1)
    half_window = (window_size -1) // 2
    # precompute coefficients
    b = np.mat([[k**i for i in order_range] for k in range(-half_window, half_window+1)])
    m = np.linalg.pinv(b).A[deriv] * rate**deriv * factorial(deriv)
    # pad the signal at the extremes with
    # values taken from the signal itself
    firstvals = y[0] - np.abs( y[1:half_window+1][::-1] - y[0] )
    lastvals = y[-1] + np.abs(y[-half_window-1:-1][::-1] - y[-1])
    y = np.concatenate((firstvals, y, lastvals))
    return np.convolve(m[::-1], y.T[0], mode='valid')

def scale_experiments(experiments):
    result = experiments.copy(deep=True)
    trans = StandardScaler()
    # standardise the intensity half of each experiment (the scaler works column-wise, hence the transposes)
    scaled = trans.fit_transform(experiments.transpose()[len(experiments.columns)//2:]).T
    result.iloc[:, len(result.columns)//2:] = scaled
    return result


def apply_smoothing(experiment, window_length=7, polyorder=3):
    return savgol_filter(experiment[len(experiment)//2:], window_length, polyorder)


def smooth_experiments(experiments, window_length=7, polyorder=3):
    result = experiments.copy(deep=True)
    for index, row in result.iterrows():
        result.iloc[index, len(row) // 2:] = apply_smoothing(row, window_length=window_length, polyorder=polyorder)
    return result


def process_experiments(experiments, scale_features=True):
    baselined_experiments = adjust_all_baselines(experiments)
    scaled_experiments = scale_experiments(baselined_experiments)
    smoothed_experiments = smooth_experiments(scaled_experiments)
    if scale_features:
        trans = StandardScaler()
        return trans.fit_transform(smoothed_experiments)
    else:
        return smoothed_experiments

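A minimal usage sketch of the preprocessing pipeline above (not part of this commit); it assumes `experiments` is the wide dataframe whose rows hold x values in the first half of the columns and intensities in the second half:

processed = process_experiments(experiments, scale_features=True)
# step-by-step equivalent of the first three stages, using the same defaults as above:
baselined = adjust_all_baselines(experiments, lam=10, p=1e-2)
scaled = scale_experiments(baselined)
smoothed = smooth_experiments(scaled, window_length=7, polyorder=3)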