Better data processing
classifiers/random_forest.ipynb (new file, 37 lines)
@@ -0,0 +1,37 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "initial_id",
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    ""
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
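The committed notebook contains a single empty code cell, so its intended contents can only be guessed at. As orientation, here is a minimal sketch of the kind of cell the file name suggests, assuming scikit-learn is available and using the helpers added elsewhere in this commit (load_data, process_experiments, categorize_metadata). The dataset name "example" and the assumption that load_data returns (metadata, experiments) are placeholders, not part of the commit:

# Hypothetical notebook cell: a random-forest baseline over the processed spectra.
# Assumes scikit-learn; load_data, process_experiments and categorize_metadata
# come from this commit. The dataset name is a placeholder.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from data_loading import load_data
from data_processing import process_experiments, categorize_metadata

metadata, experiments = load_data("example")    # placeholder dataset name
features = process_experiments(experiments)     # baseline -> scale -> smooth
truth, encoded = categorize_metadata(metadata)  # strain codes + one-hot metadata

X_train, X_test, y_train, y_test = train_test_split(
    features, truth.values.ravel(), test_size=0.2, random_state=0)
clf = RandomForestClassifier(n_estimators=200, random_state=0)
clf.fit(X_train, y_train)
print("held-out accuracy:", clf.score(X_test, y_test))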
@@ -1,3 +1,2 @@
-import pandas as pd
 from data_loading import *
 from data_processing import *
File diff suppressed because one or more lines are too long
@@ -1,7 +1,6 @@
-import os
-import numpy as np
 import pandas as pd
-from utils.df_utils import *
+import os
+from utils.df_utils import slice_df
 
 
 def load_raw_metadata() -> pd.DataFrame:
@@ -56,15 +55,6 @@ def load_raw_data() -> tuple[pd.DataFrame, list[pd.DataFrame]]:
     return metadata, sliced_experiments
 
 
-def data_to_single_df(data: list[pd.DataFrame]) -> pd.DataFrame:
-    """
-    Converts a list of dataframes into a long dataframe. Loses lots of info, to use with care!
-    :param data: list of dataframes of same length
-    :return:
-    """
-    return pd.DataFrame(map(lambda x: np.append(x.index.to_numpy(), x["#Intensity"].to_numpy()), data))
-
-
 def load_data(name: str, path: os.path = os.path.join("data")) -> tuple[pd.DataFrame, pd.DataFrame]:
     metadata = pd.read_csv(os.path.join(path, name, "metadata.csv"))
     experiments = pd.read_csv(os.path.join(path, name, "experiments.csv"))
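For context, a small sketch of how this loader might be called. The dataset name is a placeholder; the directory layout (data/<name>/metadata.csv and data/<name>/experiments.csv) follows the paths in the code above:

# Hypothetical call; assumes a data/example/ directory containing
# metadata.csv and experiments.csv, as read by load_data above.
from data_loading import load_data

metadata, experiments = load_data("example")
print(metadata.shape, experiments.shape)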
File diff suppressed because one or more lines are too long
@@ -1,53 +1,88 @@
 from pybaselines import Baseline
-import numpy as np
-from math import factorial
 from sklearn.preprocessing import StandardScaler
 from scipy.signal import savgol_filter
+import pandas as pd
 
 
-def calculate_baseline(measure, lam=10, p=1e-2):
-    baseline_fitter = Baseline(x_data=measure[:len(measure)//2])
-    bkg_2, params_2 = baseline_fitter.iasls(measure[len(measure)//2:], lam=lam, p=p)
+def interpolate_spectrum(exp, max_wave=1927, min_wave=181):
+    """
+    Interpolate spectra at integer wavelengths in specified wavelength range.
+    :param exp: one measure
+    :param max_wave: maximal wavelength
+    :param min_wave: minimal wavelength
+    :return: the interpolated experiment
+    """
+    result = pd.DataFrame(columns=["#Intensity"])
+    for i in range(min_wave, max_wave, 1):
+        for k in range(len(exp.index)-1):
+            if exp.index[k] == i:
+                result.loc[i] = exp["#Intensity"].iloc[k]
+                break
+            if exp.index[k] > i > exp.index[k + 1]:
+                result.loc[i] = exp["#Intensity"].iloc[k] - (
+                    exp["#Intensity"].iloc[k] - exp["#Intensity"].iloc[k + 1]) / (
+                    exp.index[k] - exp.index[k + 1]) * (exp.index[k] - i)
+                break
+        else:
+            result.loc[i] = 0
+    return result
+
+
+def interpolate_experiments(sliced_experiments, max_wave=1927, min_wave=181):
+    result = []
+    for i, exp in enumerate(sliced_experiments):
+        result.append(interpolate_spectrum(exp, max_wave, min_wave))
+        print(i+1, "/", len(sliced_experiments))
+    return result
+
+
+def calculate_baseline(x_data, measure, lam=10, p=1e-2):
+    baseline_fitter = Baseline(x_data=x_data)
+    bkg_2, params_2 = baseline_fitter.iasls(measure, lam=lam, p=p)
     return bkg_2
 
 
-def adjust_baseline(measure, lam=10, p=1e-2):
-    baseline = calculate_baseline(measure, lam=lam, p=p)
-    return measure[len(measure)//2:] - baseline
+def adjust_baseline(x_data, measure, lam=10, p=1e-2):
+    baseline = calculate_baseline(x_data, measure, lam=lam, p=p)
+    return measure - baseline
 
 
 def adjust_all_baselines(measures, lam=10, p=1e-2):
     result = measures.copy(deep=True)
     for index, row in result.iterrows():
-        result.iloc[index, len(row)//2:] = adjust_baseline(row, lam=lam, p=p)
+        result.iloc[index] = adjust_baseline(measures.columns.astype(int), row, lam=lam, p=p)
     return result
 
 
 def scale_experiments(experiments):
     result = experiments.copy(deep=True)
     trans = StandardScaler()
-    scaled = trans.fit_transform(experiments.transpose()[len(experiments.columns)//2:]).T
-    result.iloc[:, len(result.columns)//2:] = scaled
+    scaled = trans.fit_transform(experiments.transpose()).T
+    result.iloc[:, :] = scaled
     return result
 
 
 def apply_smoothing(experiment, window_length=7, polyorder=3):
-    return savgol_filter(experiment[len(experiment)//2:], window_length, polyorder)
+    return savgol_filter(experiment, window_length, polyorder)
 
 
 def smooth_experiments(experiments, window_length=7, polyorder=3):
     result = experiments.copy(deep=True)
     for index, row in result.iterrows():
-        result.iloc[index, len(row) // 2:] = apply_smoothing(row, window_length=window_length, polyorder=polyorder)
+        result.iloc[index, :] = apply_smoothing(row, window_length=window_length, polyorder=polyorder)
     return result
 
 
-def process_experiments(experiments, scale_features=True):
-    baselined_experiments = adjust_all_baselines(experiments)
-    scaled_experiments = scale_experiments(baselined_experiments)
-    smoothed_experiments = smooth_experiments(scaled_experiments)
-    if scale_features:
-        trans = StandardScaler()
-        return trans.fit_transform(smoothed_experiments)
-    else:
-        return smoothed_experiments
+def categorize_metadata(metadata: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
+    truth = pd.DataFrame(pd.Categorical(metadata["strain"]).codes, index=metadata.index)
+    encoded = pd.get_dummies(data=metadata, columns=["phase", "substrate", "confocalhigh"], dtype=int)
+    encoded.drop(columns=["replica", "strain"], inplace=True)
+    return truth, encoded
+
+
+def process_experiments(experiments: pd.DataFrame, baseline_lam=10, baseline_p=1e-2,
+                        smooth_window_length=7, smooth_polyorder=3) -> pd.DataFrame:
+    experiments = adjust_all_baselines(experiments, lam=baseline_lam, p=baseline_p)
+    experiments = scale_experiments(experiments)
+    experiments = smooth_experiments(experiments, window_length=smooth_window_length, polyorder=smooth_polyorder)
+    return experiments
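The new interpolate_spectrum scans every data point for every integer wavelength, which is quadratic per spectrum. A hedged sketch of a vectorized equivalent, assuming, as the comparison exp.index[k] > i > exp.index[k + 1] above implies, that each spectrum is indexed by wavelength in descending order with a "#Intensity" column; this helper is an illustration, not part of the commit:

# Hypothetical vectorized alternative to interpolate_spectrum; an assumption,
# not part of the commit. Expects exp indexed by wavelength in descending
# order with a "#Intensity" column, as above.
import numpy as np
import pandas as pd

def interpolate_spectrum_fast(exp, max_wave=1927, min_wave=181):
    grid = np.arange(min_wave, max_wave)           # integer wavelengths
    x = exp.index.to_numpy()[::-1]                 # ascending order for np.interp
    y = exp["#Intensity"].to_numpy()[::-1]
    vals = np.interp(grid, x, y, left=0, right=0)  # 0 outside the data, like the loop
    return pd.DataFrame({"#Intensity": vals}, index=grid)

Likewise, a minimal self-contained demo of the iasls baseline step, using only the pybaselines calls that appear above (Baseline(x_data=...).iasls returns the fitted baseline and a parameter dictionary); the synthetic signal here is made up:

# Hypothetical standalone demo of the iasls baseline correction used above.
import numpy as np
from pybaselines import Baseline

x = np.arange(181, 1927)                            # integer wavelength grid, as in the pipeline
signal = np.exp(-(x - 800) ** 2 / 2000) + 1e-4 * x  # a peak plus a drifting baseline
baseline, _ = Baseline(x_data=x).iasls(signal, lam=10, p=1e-2)
corrected = signal - baseline                       # what adjust_baseline returns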
@@ -6,3 +6,4 @@ def slice_df(df, slice_positions):
         sliced_dfs.append(df[slice_positions[i]-1:slice_positions[i+1]-1].copy(deep=True))
     sliced_dfs.append(df[slice_positions[-1]-1:].copy(deep=True))
     return sliced_dfs
+
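For reference, a small usage sketch of slice_df; the -1 offsets above imply 1-based slice positions, and the frame and positions here are illustrative:

# Hypothetical usage of slice_df; the dataframe and positions are made up.
import pandas as pd
from utils.df_utils import slice_df

df = pd.DataFrame({"#Intensity": range(10)})
parts = slice_df(df, [1, 4, 8])    # 1-based start position of each slice
print([len(p) for p in parts])     # -> [3, 4, 3]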