2025-02-27 13:46:56 +01:00
parent 84caa01612
commit 29936cb347
16 changed files with 3720 additions and 0 deletions

microwave/__init__.py Normal file

@@ -0,0 +1,3 @@
from . import data_analysis
from . import utils
from . import math


@@ -0,0 +1,2 @@
from . import ppscore
from . import univariate


@@ -0,0 +1 @@
from .ppscore import *


@@ -0,0 +1,276 @@
from typing import Union, Callable, Optional, Any
from sklearn.metrics import make_scorer
from sklearn.tree import DecisionTreeRegressor
from sklearn.base import is_classifier, is_regressor
from sklearn.model_selection import cross_val_score
from joblib import Parallel, delayed
from ...utils import _verify_tabular_data_shape, sample_rows, nan_rows_mask, _sample, _to_series
import pandas as pd
import numpy as np
def _identify_case(model) -> str:
"""
Identifies if the given model is a classifier or regressor.
Args:
model: Must be sklearn-compatible and either a regressor or classifier.
Returns:
"classification" or "regression"
Raises:
ValueError: If the model cannot be determined to be either a classifier or a regressor
"""
if is_classifier(model):
return "classification"
elif is_regressor(model):
return "regression"
else:
raise ValueError("The model cannot be determined to be either a classifier or a regressor")
def _get_baseline_score(y: Union[np.ndarray, pd.DataFrame], case: str, metric: Callable) -> float:
"""
Calculates the expected metric result of a naive model against y.
Args:
y: shape[n,1]; True values
case: "classification" or "regression"
metric: Metric to use to score the prediction. Must take in y_true, y_pred.
Returns:
A baseline score according to the metric. Will be the score of a model predicting the median value for a regression or the most frequent value for a classification.
Raises:
Nothing
"""
y = _sample(y, ~nan_rows_mask(y))
if case == "regression":
base = np.full_like(y, np.median(y))
elif case == "classification":
values, counts = np.unique(y, return_counts=True)
ind = np.argmax(counts)
base = np.full_like(y, values[ind])
return metric(y, base)
def _prepare_df(x: Any, y: Any, metric: Callable, model) -> pd.DataFrame:
"""
Calculates the base information depending on the model, metric and true values.
Args:
x: shape[n, m]; Predictors (features).
y: shape[n, 1]; True values (targets).
metric: Metric to use to score the prediction. Must take in y_true, y_pred.
model: Model to use. Must be sklearn-compatible and either a regressor or classifier.
Returns:
A DataFrame containing the pps information, including:
- ppscore: Placeholder for the predictive power score, initialized to NaN for each feature.
- case: The kind of problem ("classification" or "regression").
- metric: The name of the metric used.
- perfect_score: The score when the model's predictions are perfect.
- naive_score: The score of a naive model predicting the most frequent value (classification) or the median value (regression).
- model_score: Placeholder for the model score, initialized to NaN.
- model: The class name of the model.
Raises:
Nothing
"""
y = _sample(y, ~nan_rows_mask(y))
case = _identify_case(model)
baseline_score = _get_baseline_score(y, case, metric)
perfect_score = metric(y, y)
return pd.DataFrame({
"ppscore": [np.nan]*(x.shape[1] if len(x.shape) > 1 else 1),
"case": case,
"metric": metric.__name__,
"perfect_score": perfect_score,
"naive_score": baseline_score,
"model_score": np.nan,
"model": type(model).__name__
})
def _score(x: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.DataFrame], metric: Callable, model, metric_params: Optional[dict]={}, crossvals: int=5):
"""
Returns the cross-validated score of the model according to the given metric.
Args:
x: shape[n, m]; Predictors (features).
y: shape[n,1]; True values (targets).
metric: Metric to use to score the prediction. Must take in y_true, y_pred.
model: Model to use. Defaults to DecisionTreeRegressor. Must be sklearn-compatible.
metric_params (optional): Additional parameters to pass to the metric function.
crossvals (optional) [default=5]: Number of cross-validations to perform.
Returns:
The evaluation metric on the prediction of the model
Raises:
Nothing
"""
if model is None:
model = DecisionTreeRegressor()
nan_mask = nan_rows_mask(x, y)
scores = cross_val_score(
model,
np.array(_sample(x, ~nan_mask)).reshape(-1, 1),
np.array(_sample(y, ~nan_mask)).ravel(),  # 1-D targets avoid sklearn's column-vector warning
cv=crossvals,
scoring=make_scorer(metric, **metric_params)
)
return scores.mean()
def _calc_ppscore(score: Union[int, float, np.ndarray, pd.Series],
naive_score: Union[int, float, np.ndarray, pd.Series],
perfect_score: Union[int, float, np.ndarray, pd.Series]) -> Union[float, np.ndarray, pd.Series]:
"""
Calculates the predictive power score (pps) for given scores, naive scores, and perfect scores.
Args:
score: The actual score(s).
naive_score: The naive score(s).
perfect_score: The perfect score(s).
Returns:
The predictive power score(s).
"""
# Remember whether the input was a Series before converting, so a Series comes back out
score_index = score.index if isinstance(score, pd.Series) else None
score = np.asarray(score)
naive_score = np.asarray(naive_score)
perfect_score = np.asarray(perfect_score)
pps = (score - naive_score) / (perfect_score - naive_score)
pps = np.where(pps <= 0, 0, pps)
if score_index is not None:
return pd.Series(pps, index=score_index)
return pps
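As a worked illustration of the normalization: with an accuracy-style metric where the naive baseline scores 0.60, a perfect prediction scores 1.0 and the evaluated model scores 0.85, the resulting pps is (0.85 - 0.60) / (1.0 - 0.60) = 0.625. The numbers below are only an example:

_calc_ppscore(0.85, naive_score=0.60, perfect_score=1.0)  # -> 0.625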
def score(x: Any, y: Any, metric: Callable, model: Optional[object]=None, sample: Optional[int]=None, shuffle: bool=True, crossvals: int=5) -> pd.DataFrame:
"""
Calculates the predictive power score (pps) of x against y using a given model. Score will be baselined between 0 and 1 depending on the kind of problem.
Args:
x: shape[n,1] or [n,]; Predictors (features).
y: shape[n,1] or [n,]; True values (targets).
metric: Metric to use to score the prediction. Must take in y_true, y_pred, both array_likes.
model (optional) [default=DecisionTreeRegressor]: Model to use. Must be sklearn-compatible and either a regressor or classifier.
sample (optional) [default=None]: Number of rows to sample from x and y to make calculations faster. None means no sampling.
shuffle (optional) [default=True]: Whether to shuffle the rows of x and y.
crossvals (optional) [default=5]: Number of cross-validations to perform when fitting and evaluating the model.
Returns:
The pps of x against y. The pps is (score - baseline)/(perfect_score - baseline) where the baseline is a naive model predicting the median for regression or the most common class for classification.
Raises:
Nothing
"""
_verify_tabular_data_shape(x, y, is_column=True)
x, y = _to_series(x), _to_series(y)
x, y = sample_rows(x, y, sample=sample, shuffle=shuffle)
if model is None:
model = DecisionTreeRegressor()
res_df = _prepare_df(x, y, metric, model)
res_df["model_score"] = _score(x, y, metric, model, crossvals=crossvals)
res_df["ppscore"] = _calc_ppscore(res_df["model_score"], res_df["naive_score"], res_df["perfect_score"])
return res_df
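A minimal usage sketch of score on synthetic data, assuming the package layout from the __init__ files above (the data, metric choice and printed columns are illustrative):

import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from microwave.data_analysis import ppscore as pps

rng = np.random.default_rng(0)
x = pd.Series(rng.normal(size=200), name="x")
y = 2 * x + rng.normal(scale=0.1, size=200)

# With an error metric the perfect score is 0, so pps = 1 - model_error / naive_error;
# a strong linear relation should therefore yield a high ppscore.
result = pps.score(x, y, metric=mean_absolute_error)
print(result[["ppscore", "model_score", "naive_score"]])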
def predictors(df: pd.DataFrame, y: Any, metric: Callable, model=None, crossvals: int=5, njobs=1, sample: int=5000, sort=True, shuffle=True):
"""
Calculates the predictive power score (pps) of every column in df against y using a given model. Score will be baselined between 0 and 1 depending on the kind of problem.
Args:
df: shape[n, m]; Predictors (features).
y: shape[n, 1]; True values (targets).
metric: Metric to use to score the prediction. Must take in y_true, y_pred.
model (optional) [default=DecisionTreeRegressor]: Model to use. Must be sklearn-compatible and either a regressor or classifier.
crossvals (optional) [default=5]: Number of cross-validations to perform.
njobs (optional) [default=1]: Number of jobs to run in parallel.
sample (optional) [default=5000]: Number of rows to sample from df and y for faster calculations. None means no sampling.
sort (optional) [default=True]: Whether to sort the results by ppscore in descending order.
shuffle (optional) [default=True]: Whether to shuffle the rows of df and y before processing.
Returns:
A DataFrame containing the pps of each column in df against y. The pps is (score - baseline)/(perfect_score - baseline), where the baseline is a naive model predicting the median for regression or the most common class for classification.
Raises:
Nothing
"""
_verify_tabular_data_shape(df, is_column=False)
_verify_tabular_data_shape(y, is_column=True)
y = _to_series(y)
df, y = sample_rows(df, y, sample=sample, shuffle=shuffle)
if model is None:
model = DecisionTreeRegressor()
res_df = _prepare_df(df, y, metric, model)
res_df["model_score"] = Parallel(n_jobs=njobs)(delayed(_score)(df.iloc[:, [i]], y, metric, model, crossvals=crossvals) for i in range(df.shape[1]))
res_df["ppscore"] = _calc_ppscore(res_df["model_score"], res_df["naive_score"], res_df["perfect_score"])
res_df.insert(0, "x", df.columns)
if sort:
res_df = res_df.sort_values("ppscore", ascending=False)
return res_df.reset_index(drop=True)
def _predictors_of_col(df: pd.DataFrame, col: str, metric: Callable, model, **kwargs) -> pd.DataFrame:
"""
Calculates the predictive power score (pps) of all columns in df against the specified column using a given model and metric.
Args:
df: Input DataFrame containing predictors and the target column.
col: Target column name.
metric: Metric to use to score the prediction. Must take in y_true, y_pred.
model: Model to use. Must be sklearn-compatible and either a regressor or classifier.
Returns:
A DataFrame containing the pps of each predictor in df against the specified target column. The pps is (score - baseline)/(perfect_score - baseline), where the baseline is a naive model predicting the median for regression or the most common class for classification.
Raises:
Nothing
"""
y = df[[col]]
df_pred = df.loc[:, df.columns != col]
res = predictors(df_pred, y, metric, model=model, **kwargs)
res.insert(1, "y", col)
return res
def matrix(df: pd.DataFrame, metric: Callable, model=None, crossvals: int=5, njobs: int=1, sample: int=5000, sort: bool=True, shuffle: bool=True) -> pd.DataFrame:
"""
Calculates the predictive power score (pps) of every column in df against every other column in df using a given model.
Scores will be baselined between 0 and 1 depending on the nature of the problem.
Args:
df: shape[n, m]; Data whose columns are scored against each other.
metric: Metric to use to score the prediction. Must take in y_true, y_pred.
model (optional) [default=DecisionTreeRegressor]: Model to use. Must be sklearn-compatible and either a regressor or classifier.
crossvals (optional) [default=5]: Number of cross-validations to perform.
njobs (optional) [default=1]: Number of jobs to run in parallel.
sample (optional) [default=5000]: Number of rows to sample from df for faster calculations. None means no sampling.
sort (optional) [default=True]: Whether to sort the results by ppscore in descending order.
shuffle (optional) [default=True]: Whether to shuffle the rows of df before processing.
Returns:
A DataFrame containing the pps of each predictor in df against every target column.
The pps is (score - baseline) / (perfect_score - baseline), where the baseline is a naive model predicting the median for regression or the most common class for classification.
Raises:
Nothing
"""
_verify_tabular_data_shape(df, is_column=False)
df = sample_rows(df, sample=sample, shuffle=shuffle)[0]
if model is None:
model = DecisionTreeRegressor()
res = Parallel(n_jobs=njobs)(delayed(_predictors_of_col)(df, col, metric, model, crossvals=crossvals, sample=None, sort=False, shuffle=False) for col in df.columns)
res_df = pd.concat(res, axis=0)
if sort:
res_df = res_df.sort_values("ppscore", ascending=False)
return res_df.reset_index(drop=True)
def mutual_predictors(matrix: pd.DataFrame, threshold: float=0.9):
"""
Returns the list of features that are most predicted by the others, above a given threshold. It is intended for dimensionality reduction.
On each pass, every pps above the threshold is summed per target feature; the most-predicted feature is recorded and ignored in subsequent passes.
This continues until no remaining pps is over the threshold.
Args:
matrix: A DataFrame containing the pps of each feature against every other.
threshold (optional) [default=0.9]: Minimum pps for a prediction to be counted.
Returns:
A list of features that are the most predicted by others
Raises:
Nothing
"""
features = list(matrix["x"].unique())
pred_mut = []
while True:
# Recompute the above-threshold pps sums for the remaining features on every pass
cols_predict_count = dict(zip(features, [0]*len(features)))
for _, predict in matrix.iterrows():
if predict.y in cols_predict_count and predict.ppscore > threshold:
cols_predict_count[predict.y] += predict.ppscore
if sum(cols_predict_count.values()) == 0:
break
best_predictor = max(cols_predict_count, key=cols_predict_count.get)
pred_mut.append(best_predictor)
features.remove(best_predictor)
matrix = matrix[matrix["x"] != best_predictor]
matrix = matrix[matrix["y"] != best_predictor]
return pred_mut
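A hedged end-to-end sketch of the matrix / mutual_predictors workflow for dimensionality reduction; the DataFrame is illustrative and, with the default DecisionTreeRegressor, assumed to be fully numeric:

import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from microwave.data_analysis import ppscore as pps

rng = np.random.default_rng(1)
df = pd.DataFrame({"a": rng.normal(size=300)})
df["b"] = df["a"] * 3 + rng.normal(scale=0.01, size=300)  # near-duplicate of "a"
df["c"] = rng.normal(size=300)                            # independent column

m = pps.matrix(df, metric=mean_absolute_error, njobs=1)
redundant = pps.mutual_predictors(m, threshold=0.8)
df_reduced = df.drop(columns=redundant)  # drops the mutually predictable column(s)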


@@ -0,0 +1 @@
from .aggregates import *


@@ -0,0 +1,236 @@
from typing import Any, Union, Optional, Callable
from joblib import Parallel, delayed
import numpy as np
import pandas as pd
import scipy.stats as stats
def inter_percentile_mean(x: np.ndarray, N1: Optional[float] = 0.1, N2: Optional[float] = 0.9) -> float:
"""
Calculates the mean within a certain percentile range
Args:
x: The considered ndarray.
N1: Lower percentile (between 0 and 1)
N2: Upper percentile (between 0 and 1)
Returns:
The IP-mean
Raises:
Nothing
"""
# N1 and N2 are fractions between 0 and 1; np.percentile expects values in 0-100
p1, p2 = np.percentile(x[~np.isnan(x)], [N1 * 100, N2 * 100])
return np.mean(x[(x >= p1) & (x <= p2)])
def median_absolute_deviation(x: np.ndarray) -> float:
"""
Calculates the median of the deviations from the median
Args:
x: The considered ndarray.
Returns:
The MAD
Raises:
Nothing
"""
return np.nanmedian(np.abs(x - np.nanmedian(x)))
def inter_percentile_range(x: np.ndarray, N1: Optional[float] = 0.1, N2: Optional[float] = 0.9) -> float:
"""
Calculates the range within a certain percentile range
Args:
x: The considered ndarray.
N1: Lower percentile (between 0 and 1)
N2: Upper percentile (between 0 and 1)
Returns:
The IP-range
Raises:
Nothing
"""
# N1 and N2 are fractions between 0 and 1; np.percentile expects values in 0-100
return np.percentile(x[~np.isnan(x)], N2 * 100) - np.percentile(x[~np.isnan(x)], N1 * 100)
def mode(x: np.ndarray) -> Any:
"""
Calculates the mode of numeric and categorical variables
Args:
x: The considered ndarray.
Returns:
The mode
Raises:
Nothing
"""
if pd.api.types.is_numeric_dtype(x):
return stats.mode(x, nan_policy='omit').mode
else:
return pd.Series(x).mode().iat[0]
def geothmetic_meandian(x: np.ndarray, iter: Optional[int] = 100) -> float:
"""
https://xkcd.com/2435/
Args:
x: The considered ndarray.
iter: Number of averaging iterations to apply before returning.
Returns:
The geothmetic meandian
Raises:
Nothing
"""
if iter == 0:
return x[0]
return geothmetic_meandian(np.array([AGGFUNCCODES["mean"](x), AGGFUNCCODES["gmean"](x), AGGFUNCCODES["median"](x)]), iter = iter - 1)
def get_n_outliers(x: np.ndarray, n_sig: Union[float, int] = 3) -> int:
"""
Calculates the number of outliers using the z-score method.
Args:
x: The considered ndarray.
n_sig: Number of standard deviations before being considered an outlier
Returns:
The number of outliers
Raises:
Nothing
"""
return np.sum(np.abs(stats.zscore(x, nan_policy='omit')) > n_sig)
AGGFUNCCODES = {
# Counts
"size": len,
"non-null": lambda x: len(x) - pd.isna(x).sum(),
"nunique": lambda x: pd.Series(x).nunique(dropna=True),
# Basic
"sum": lambda x: np.nansum(x) if np.issubdtype((x.to_numpy() if isinstance(x, pd.Series) else x).dtype, np.number) else np.nan,
"min": np.nanmin,
"max": np.nanmax,
"first": lambda x: x[~pd.isna(x)][0],
"last": lambda x: x[~pd.isna(x)][-1],
# Centricity
"mean": np.nanmean,
"median": np.nanmedian,
"mode": mode,
"gmean": lambda x: stats.gmean(x[~pd.isna(x)]),
"hmean": lambda x: stats.hmean(x[~pd.isna(x)]),
"Pmean": inter_percentile_mean,
"geothmetic meandian": geothmetic_meandian,
# Dispersion
"variance": np.nanvar,
"std": np.nanstd,
"mad": median_absolute_deviation,
"skewness": lambda x: stats.skew(x, nan_policy='omit'),
"excesskurtosis": lambda x: stats.kurtosis(x, fisher=False, nan_policy='omit'),
"range": lambda x: np.nanmax(x) - np.nanmin(x),
"Prange": inter_percentile_range,
"n_outliers": get_n_outliers,
# Percentiles
"P75": lambda x: np.percentile(x[~pd.isna(x)], 75),
"P25": lambda x: np.percentile(x[~pd.isna(x)], 25),
"P10": lambda x: np.percentile(x[~pd.isna(x)], 10),
"P90": lambda x: np.percentile(x[~pd.isna(x)], 90),
"PN": lambda x, N: np.percentile(x[~pd.isna(x)], N),
# Distribution
"skewtest": lambda x, **kwargs: stats.skewtest(x, nan_policy='omit', **kwargs),
"kurtosistest": lambda x, **kwargs: stats.kurtosistest(x, nan_policy='omit', **kwargs),
"normaltest": lambda x, **kwargs: stats.normaltest(x, nan_policy='omit', **kwargs),
"jarque_bera": lambda x, **kwargs: stats.jarque_bera(x, nan_policy='omit', **kwargs),
"shapiro": lambda x, **kwargs: stats.shapiro(x, nan_policy='omit', **kwargs),
"anderson": lambda x, **kwargs: stats.anderson(x, **kwargs),
# Other
"energy": lambda x: np.nansum(x**2),
"rms": lambda x: np.sqrt(np.nanmean(x**2)),
"entropy": lambda x: stats.entropy(pd.Series(x).value_counts(normalize=True), base=2),
"autocorrelation": lambda x, lag=1: pd.Series(x).autocorr(lag=lag)
}
def execute_agg_func(x: Any, agg: Union[Callable, str, dict]) -> Any:
"""
Executes a given aggregation function on a given data.
If the function returns multiple values, a dict is returned with a name for each value, defaulting to a, b, c, ...
Accepts a callable, a str corresponding to an AGGFUNCCODES key, or a dictionary with either a callable or an AGGFUNCCODES key at key 'func'.
Args:
x: The data to execute the function on.
agg: The function to execute.
Returns:
Either the raw return value if it is a single value, or a dict of named return values if the function returned a tuple. The values can be renamed by passing a dict with key 'ret_names'.
Raises:
Nothing
"""
ret_names = None
if callable(agg):
ret = agg(x)
elif isinstance(agg, str):
ret = AGGFUNCCODES[agg](x)
elif isinstance(agg, dict):
kwargs = {} if 'kwargs' not in agg else agg['kwargs']
if callable(agg['func']):
ret = agg['func'](x, **kwargs)
elif isinstance(agg['func'], str):
ret = AGGFUNCCODES[agg['func']](x, **kwargs)
if 'ret_names' in agg:
ret_names = agg['ret_names']
if isinstance(ret, tuple):
if ret_names is None:
ret_names = [chr(c) for c in range(ord('a'), ord('a')+len(ret))]
return dict(zip(ret_names, ret))
else:
return ret
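An illustrative sketch of the three accepted spec forms (the data and the lambda are only examples):

import numpy as np

x = np.array([1.0, 2.0, 2.0, 3.0, np.nan])
execute_agg_func(x, "median")   # AGGFUNCCODES key -> 2.0
execute_agg_func(x, np.nanmax)  # plain callable -> 3.0
execute_agg_func(x, {"func": lambda v: (np.nanmin(v), np.nanmax(v)), "ret_names": ["lo", "hi"]})  # -> {'lo': 1.0, 'hi': 3.0}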
def calculate_aggregates(x: Any, aggs: list[Union[Callable, str, dict[str, Union[str, Callable]]]]) -> dict[str, Any]:
"""
Executes a given list of aggregation functions on a given data.
Args:
x: The data to execute the functions on.
aggs: The functions to execute.
Returns:
A dict containing named values. The names can be specified for each agg function by setting 'name' or 'ret_names' in the dict.
Raises:
Nothing
"""
results = {}
for i, func in enumerate(aggs):
funcname = f"func_{i}"
try:
ret = execute_agg_func(x, func)
except Exception as e:
print(f"agg_{i}", func, e)
ret = np.nan
if isinstance(func, dict):
if 'name' in func:
funcname = func['name']
elif isinstance(func['func'], str):
funcname = func['func']
elif isinstance(func, str):
funcname = func if func not in results.keys() else f"{func}_{i}"
if isinstance(ret, dict):
results.update(dict(zip([f"{funcname}_{x}" for x in ret.keys()], ret.values())))
else:
results[funcname] = ret
return results
def build_univariate_statistics(df: pd.DataFrame, agg: Optional[Union[str, list[Union[str, dict[str, dict]]]]] = "all", n_jobs: int = 1) -> pd.DataFrame:
"""
Calculates specified univariate statistics for each column in the DataFrame.
Args:
df: The input DataFrame.
agg: List of aggregation functions to apply, or "all" to apply every function in AGGFUNCCODES.
Each element can be a function name (str), a callable, or a dict with key 'func' and optionally 'kwargs', 'name' and 'ret_names'.
n_jobs: Number of parallel processes to open. -1 means as many as possible.
Returns:
pd.DataFrame: DataFrame with one row per column of df and one column per specified aggregation.
Raises:
Nothing
"""
if agg == "all":
agg = list(AGGFUNCCODES.keys())
results = Parallel(n_jobs=n_jobs)(delayed(calculate_aggregates)(df[col].values, agg) for col in df.columns)
return pd.DataFrame(results, index=df.columns)
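A minimal usage sketch, assuming the package layout above and a purely numeric DataFrame (the column names and the P95 spec are illustrative):

import numpy as np
import pandas as pd
from microwave.data_analysis import univariate

rng = np.random.default_rng(2)
df = pd.DataFrame(rng.normal(size=(200, 3)), columns=["a", "b", "c"])

# One row per column, one column per aggregate; dict specs forward kwargs and set the name.
stats_df = univariate.build_univariate_statistics(df, agg=["mean", "std", "n_outliers", {"func": "PN", "kwargs": {"N": 95}, "name": "P95"}])
print(stats_df)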


@@ -0,0 +1,2 @@
from .df_preprocessing import *
from .dfTransformer import dfTransformer


@@ -0,0 +1,75 @@
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from typing import Optional
class dfTransformer(BaseEstimator, TransformerMixin):
def __init__(self):
self.transforms = []
def add_transform(self, column_name: str, transformer: TransformerMixin, result_columns: Optional[list[str]] = None):
"""
Adds a transform specific to a column with optional result column names.
Args:
column_name (str): The name of the column to transform. Use None when the transformer is itself a dfTransformer applied to the whole DataFrame.
transformer (TransformerMixin): The transformation object to apply to the column. Must have a .transform method.
result_columns (list of str, optional): List of names for the resulting columns. Default is None.
"""
if not hasattr(transformer, 'transform'):
raise ValueError("The transformer must have a 'transform' method.")
self.transforms.append((column_name, transformer, result_columns))
def fit(self, X: pd.DataFrame, y=None):
"""
Fit method to conform with TransformerMixin. Fits transformers one by one on specified columns.
Args:
X (pd.DataFrame): The DataFrame to fit.
y: Ignored.
Returns:
self: Fitted transformer.
"""
for column_name, transformer, _ in self.transforms:
if isinstance(transformer, dfTransformer):
transformer.fit(X, y)
elif column_name in X:
transformer.fit(X[[column_name]], y) # Fit the transformer on the specific column
return self
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
"""
Applies all stored transformations to the DataFrame, drops the original columns,
and returns the transformed DataFrame.
Args:
X (pd.DataFrame): The DataFrame to transform.
Returns:
pd.DataFrame: The transformed DataFrame.
"""
X_transformed = X.copy()
for column_name, transformer, result_columns in self.transforms:
if isinstance(transformer, dfTransformer):
X_transformed = transformer.transform(X_transformed)
if column_name in X_transformed:
transformed_data = transformer.transform(X_transformed[[column_name]])
# Check if the transformed data is a DataFrame; if not, convert it
if isinstance(transformed_data, pd.DataFrame):
transformed_cols = transformed_data
else:
transformed_cols = pd.DataFrame(transformed_data, index=X_transformed.index)
if result_columns:
transformed_cols.columns = result_columns
else:
transformed_cols.columns = [
f"{column_name}_transformed_{i}" for i in range(transformed_cols.shape[1])
]
X_transformed.drop(columns=[column_name], inplace=True)
X_transformed = pd.concat([X_transformed, transformed_cols], axis=1)
return X_transformed
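A small usage sketch of the class above with a single scaled column (StandardScaler and the column names are illustrative):

import pandas as pd
from sklearn.preprocessing import StandardScaler

df = pd.DataFrame({"a": [1.0, 2.0, 3.0, 4.0], "b": ["x", "y", "x", "y"]})

t = dfTransformer()
t.add_transform("a", StandardScaler(), result_columns=["a_scaled"])
t.fit(df)
out = t.transform(df)  # "a" is dropped and replaced by "a_scaled"; "b" is left untouched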


@@ -0,0 +1,75 @@
import pandas as pd
from .dfTransformer import dfTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder, FunctionTransformer
def _get_encoder(encoding):
if encoding == 'onehot':
return OneHotEncoder(sparse_output=False)
elif encoding == 'label':
return LabelEncoder()
elif encoding == 'ordinal':
return OrdinalEncoder()
else:
raise ValueError("Unsupported encoding type.")
def _get_encoded_col_names(encoder, prefix="", suffix="", ret_shape=(1, 1)):
# Copy to avoid mutating the caller's list (and avoid a mutable default argument)
ret_shape = list(ret_shape)
if len(ret_shape) == 1:
ret_shape.append(1)
if hasattr(encoder, "categories_"):
colnames = ["_".join([str(x) for x in (prefix, suffix, cat) if len(str(x)) > 0]) for cat in encoder.categories_[0]]
elif ret_shape[1] == 1:
colnames = ["_".join([str(x) for x in (prefix, suffix) if len(str(x)) > 0])]
elif ret_shape[1] > 1:
colnames = [
"_".join([str(x) for x in (prefix, suffix, i) if len(str(x)) > 0]) for i in range(ret_shape[1])
]
return colnames
def df_to_numeric(df, encoding='onehot'):
"""
Processes a DataFrame by converting numeric columns to float and applying categorical encoding to non-numeric columns.
Args:
df (pd.DataFrame): The input DataFrame to process.
encoding (str or TransformerMixin): The encoding method to apply to categorical columns. Can be 'onehot', 'label' or 'ordinal', or a transformer instance with a fit_transform method.
Returns:
tuple: A tuple containing the transformed DataFrame and the fitted dfTransformer object.
"""
transformer = dfTransformer()
X_transformed = df.copy()
if isinstance(encoding, str):
encoder = _get_encoder(encoding)
elif hasattr(encoding, "fit_transform"):
# Accept a pre-built transformer instance, as described in the docstring
encoder = encoding
else:
raise ValueError("Encoding must be either a string or a transformer with a 'fit_transform' method.")
for column in X_transformed.columns:
if not pd.api.types.is_numeric_dtype(df[column]):
transformed_data = encoder.fit_transform(X_transformed[[column]])
result_columns = _get_encoded_col_names(encoder, prefix=column, suffix="encoded", ret_shape=list(transformed_data.shape))
if isinstance(transformed_data, pd.DataFrame):
transformed_cols = transformed_data
else:
transformed_cols = pd.DataFrame(transformed_data, index=X_transformed.index)
transformed_cols.columns = result_columns
transformer.add_transform(column, encoder, result_columns=result_columns)
X_transformed.drop(columns=[column], inplace=True)
X_transformed = pd.concat([X_transformed, transformed_cols], axis=1)
encoder = FunctionTransformer(lambda x: x.astype(float), validate=False)
for column in X_transformed.columns:
transformed_data = encoder.fit_transform(X_transformed[[column]])
transformer.add_transform(column, encoder, result_columns=[column])
X_transformed.drop(columns=[column], inplace=True)
X_transformed = pd.concat([X_transformed, transformed_data], axis=1)
return X_transformed, transformer
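A hedged sketch of replaying the returned transformer on new rows with the same schema (the toy data is illustrative):

import pandas as pd

train = pd.DataFrame({"age": [10, 20, 30], "colour": ["red", "blue", "red"]})
train_num, trans = df_to_numeric(train, encoding="onehot")

# The fitted dfTransformer applies the same one-hot categories and float casts to unseen data.
new = pd.DataFrame({"age": [40], "colour": ["blue"]})
new_num = trans.transform(new)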


@@ -0,0 +1,39 @@
import math
from typing import Optional, Union
import numpy as np
def gaussian(x:Union[np.ndarray, float], mu:float, sig:float) -> Union[np.ndarray, float]:
"""
Calculates the height of a specified gaussian at point x
Args:
x: point(s) at which to calculate the height
mu: The gaussian's mean
sig: The gaussian's standard deviation
Returns:
The height(s), as unique number or ndarray if x is ndarray
Raises:
Nothing
"""
return np.exp(-(x-mu)**2/(2*sig**2))/(sig*np.sqrt(2*np.pi))
def gauss_integral(mu: float, sig: float, a: float=-np.inf, b: float=np.inf) -> float:
"""
Calculates the definite gaussian integral between a and b. If not given, a and b default to -inf and +inf respectively.
Args:
mu: The gaussian's mean
sig: The gaussian's standard deviation
a: lower bound, defaults to -inf
b: upper bound, defaults to +inf
Returns:
The definite integral
Raises:
Nothing
"""
if sig == 0:
if mu >= a and mu < b:
return 1
else:
return 0
return (math.erf((b-mu)/(sig*np.sqrt(2)))-math.erf((a-mu)/(sig*np.sqrt(2))))/2
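A quick sanity check of the two helpers against known values of the standard normal (the tolerances are illustrative):

# Peak height of the standard normal is 1/sqrt(2*pi) ~= 0.3989
assert abs(gaussian(0.0, mu=0.0, sig=1.0) - 0.3989) < 1e-3
# About 68.27% of the mass lies within one standard deviation of the mean
assert abs(gauss_integral(mu=0.0, sig=1.0, a=-1.0, b=1.0) - 0.6827) < 1e-3
# The full integral is 1
assert abs(gauss_integral(mu=0.0, sig=1.0) - 1.0) < 1e-12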


@@ -0,0 +1,9 @@
from .arrayutils import _get_shape
from .arrayutils import _verify_tabular_data_shape
from .arrayutils import _verify_same_number_of_rows
from .arrayutils import _sample
from .arrayutils import sample_rows
from .arrayutils import nan_rows_mask
from .arrayutils import _to_series
from .arrayutils import _is_convertible_to_numpy_array
from .arrayutils import split_rows


@@ -0,0 +1,224 @@
from typing import Any, Union, Optional
import numpy as np
import pandas as pd
def _get_shape(x: Any) -> tuple:
"""
Returns the shape of a given object
Args:
x: The object whose shape to determine.
Returns:
The shape of x, or None if it cannot be determined.
Raises:
Nothing
"""
if hasattr(x, "shape"):
return x.shape
elif _is_convertible_to_numpy_array(x):
return np.array(x).shape
else:
return None
def _verify_tabular_data_shape(*args: Any, is_column: bool = False):
"""
Verifies that the shape of the given objects are coherent for tabular data.
Args:
*args: shape[n,m] or [n,]; Tabular data.
is_column (optional) [default=False]: Set to True to raise an error if an object contains multiple columns.
Returns:
Nothing
Raises:
ValueError: If one or more of the given objects is not coherent with tabular data.
ValueError: If is_column is true and one or more of the given objects have multiple columns.
"""
for arg in args:
shape = _get_shape(arg)
if shape is None:
raise ValueError(f"Input data has no shape: {arg}.")
if len(shape) < 1 or len(shape) > 2:
raise ValueError(f"Input data must be a tabular object. Has shape {shape}.")
if is_column and len(shape) == 2 and 1 not in shape:
raise ValueError(f"Input data must be a single column. Has shape {shape}.")
def _verify_same_number_of_rows(*args):
"""
Verifies that the given objects have the same number of rows.
Args:
*args: shape[n,m] or [n,]
Returns:
Nothing
Raises:
ValueError: If one or more of the given objects has no rows.
ValueError: If two objects have different amounts of rows.
"""
n_rows = _get_shape(args[0])[0]
for arg in args[1:]:
# Check for a missing shape before indexing into it
shape = _get_shape(arg)
if shape is None:
raise ValueError(f"Input data has no shape: {arg}.")
elif n_rows != shape[0]:
raise ValueError(f"Input objects must have the same number of rows: {n_rows}, {shape[0]}.")
def _sample(x: Union[np.ndarray, pd.DataFrame, pd.Series], ind_list: Any) -> Union[np.ndarray, pd.DataFrame]:
"""
Samples the rows of a numpy array or pandas DataFrame based on a list of indices.
Args:
x: The array, DataFrame, or Series to sample.
ind_list: The list or array of indices that defines which rows to keep and in which order.
Returns:
The sampled object, of the same type as x.
Raises:
TypeError: If the input is not a numpy array, pandas DataFrame, or pandas Series.
"""
if isinstance(x, np.ndarray):
return x[ind_list]
elif isinstance(x, pd.DataFrame):
return x.iloc[ind_list]
elif isinstance(x, pd.Series):
return x.iloc[ind_list]
else:
raise TypeError("Input must be a numpy array, pandas DataFrame, or pandas Series")
def sample_rows(*args: Any, sample: Optional[int] = None, shuffle: bool = True) -> tuple[Any]:
"""
Samples rows of the provided objects in the same way and optionally shuffles them.
Tries to minimize the number of rows containing NaN.
Args:
*args: Input tabular data objects.
sample: Number of rows to draw from each object. If None, no sampling is done.
shuffle: If True, rows are returned in a random order; if False, the original row order is preserved.
Returns:
Tuple of shuffled and/or sampled objects.
Raises:
ValueError: If input objects don't have the same number of rows.
"""
_verify_same_number_of_rows(*args)
n_rows = _get_shape(args[0])[0]
nan_mask = nan_rows_mask(*args)
full_indices = np.where(~nan_mask)[0]
n_full_rows = len(full_indices)
if sample is not None and sample < n_full_rows:
indices = np.random.choice(full_indices, size=sample, replace=False)
if not shuffle:
indices.sort()
elif sample is not None and sample < n_rows:
indices = np.concatenate((np.random.choice(full_indices, size=n_full_rows, replace=False), np.random.choice(np.where(nan_mask)[0], size=sample - n_full_rows, replace=False)))
if not shuffle:
indices.sort()
else:
indices = np.arange(n_rows)
if shuffle:
indices = np.random.choice(indices, size=n_rows, replace=False)
results = tuple(_sample(arg, indices) for arg in args)
return results
def nan_rows_mask(*args: Any) -> np.ndarray:
"""
Given a list of 2D numpy arrays or DataFrames with the same number of rows, return a boolean mask that is True for every row where at least one of the objects has a NaN value.
Args:
*args: Tabular objects (2D numpy arrays, DataFrames, or Series) with the same number of rows.
Returns:
Boolean mask indicating rows with at least one NaN.
Raises:
Nothing
"""
_verify_same_number_of_rows(*args)
n_rows = _get_shape(args[0])[0]
# Initialize the mask with False values
mask = np.zeros(n_rows, dtype=bool)
for data in args:
if isinstance(data, np.ndarray):
if data.ndim == 1:
data = data.reshape(-1,1)
mask |= np.isnan(data).any(axis=1)
elif isinstance(data, pd.DataFrame):
mask |= data.isna().to_numpy().any(axis=1)
elif isinstance(data, pd.Series):
mask |= data.isna().to_numpy()
else:
data = np.array(data)
if data.ndim == 1:
data = data.reshape(-1,1)
mask |= np.isnan(data).any(axis=1)
return mask
def _to_series(data: Union[pd.Series, pd.DataFrame, np.ndarray]):
"""
Converts a single-column object to a pandas Series.
Args:
data: A pandas Series, a single-column DataFrame, or a 1-D or single-column 2-D numpy array.
Returns:
A pandas Series.
Raises:
ValueError: If the input has more than one column.
"""
if isinstance(data, pd.Series):
return data
elif isinstance(data, pd.DataFrame):
if data.shape[1] != 1:
raise ValueError("DataFrame must have exactly one column to convert to Series")
return data.iloc[:, 0]
elif isinstance(data, np.ndarray):
if data.ndim == 1:
return pd.Series(data)
elif data.ndim == 2 and data.shape[1] == 1:
return pd.Series(data.ravel())
else:
raise ValueError("ndarray must be 1-dimensional or a 2-dimensional single column array")
def _is_convertible_to_numpy_array(obj: Any) -> bool:
"""
Verifies a given object is convertible to a numpy array without error
Args:
obj: object to check
Returns:
bool
Raises:
Nothing
"""
if isinstance(obj, (list, tuple, dict, set)):
return True
if np.isscalar(obj):
return True
if hasattr(obj, '__array__'):
return True
return False
def split_rows(data: Union[pd.DataFrame, np.ndarray], bool_array: np.ndarray, drop_index: bool = True) -> list[Union[pd.DataFrame, np.ndarray]]:
"""
Splits a pandas DataFrame or a numpy array based on a boolean array indicator.
Args:
data : The input data to split. Can be a pandas DataFrame or a numpy ndarray.
bool_array : A 1D boolean array indicating where splits should occur. Must be the same length as `data`.
drop_index : If True, discard the original index in the resulting DataFrame splits; if False, keep it as a column. Default is True.
Returns:
A list of the resulting pd.DataFrame or np.ndarray splits.
Raises:
ValueError: If the length of `bool_array` does not match the length of `data`.
"""
if len(bool_array) != len(data):
raise ValueError("The length of bool_array must match the length of data.")
indices = np.where(bool_array)[0]
indices = np.concatenate(([0], indices, [len(data)]))
if isinstance(data, pd.DataFrame):
return [data.iloc[start:end].reset_index(drop=drop_index)
for start, end in zip(indices[:-1], indices[1:])
if start != end]
elif isinstance(data, np.ndarray):
return [data[start:end]
for start, end in zip(indices[:-1], indices[1:])
if start != end]
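A short sketch of how the row helpers compose (the toy data is illustrative):

import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1.0, np.nan, 3.0, 4.0, 5.0], "b": [10.0, 20.0, 30.0, 40.0, 50.0]})
y = np.array([0.0, 1.0, np.nan, 0.0, 1.0])

nan_rows_mask(df, y)  # array([False,  True,  True, False, False])
x_s, y_s = sample_rows(df, y, sample=2, shuffle=False)  # draws 2 of the 3 NaN-free rows
parts = split_rows(df, np.array([False, False, True, False, False]))  # cut before row 2 -> [rows 0-1, rows 2-4]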

File diff suppressed because one or more lines are too long


@@ -0,0 +1,725 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import string"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>A</th>\n",
" <th>B</th>\n",
" <th>C</th>\n",
" <th>D</th>\n",
" <th>0</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>b</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>c</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>d</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>d</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>c</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>d</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>b</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>c</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>c</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>b</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" A B C D 0\n",
"0 0 2 2 2 b\n",
"1 1 2 1 1 c\n",
"2 1 0 1 1 d\n",
"3 0 0 1 0 d\n",
"4 2 1 2 2 c\n",
"5 0 0 0 0 d\n",
"6 0 2 2 2 a\n",
"7 0 2 0 0 a\n",
"8 0 1 0 0 a\n",
"9 0 2 2 1 b\n",
"10 2 2 0 1 c\n",
"11 2 1 1 1 a\n",
"12 0 1 0 2 a\n",
"13 2 1 0 1 c\n",
"14 1 0 0 1 b"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame(np.random.randint(0,3,size=(15, 4)), columns=list('ABCD'))\n",
"df = pd.concat([df, pd.DataFrame(np.random.choice(list(string.ascii_letters)[:4], size=15, replace=True))], axis=1)\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"c:\\Users\\Edouard\\Documents\\Git\\microwave\n"
]
}
],
"source": [
"%cd ..\n",
"import microwave.data_processing as dp"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>A</th>\n",
" <th>B</th>\n",
" <th>C</th>\n",
" <th>D</th>\n",
" <th>0_encoded_a</th>\n",
" <th>0_encoded_b</th>\n",
" <th>0_encoded_c</th>\n",
" <th>0_encoded_d</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>0.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>0.0</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>0.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" A B C D 0_encoded_a 0_encoded_b 0_encoded_c 0_encoded_d\n",
"0 0.0 2.0 2.0 2.0 0.0 1.0 0.0 0.0\n",
"1 1.0 2.0 1.0 1.0 0.0 0.0 1.0 0.0\n",
"2 1.0 0.0 1.0 1.0 0.0 0.0 0.0 1.0\n",
"3 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0\n",
"4 2.0 1.0 2.0 2.0 0.0 0.0 1.0 0.0\n",
"5 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0\n",
"6 0.0 2.0 2.0 2.0 1.0 0.0 0.0 0.0\n",
"7 0.0 2.0 0.0 0.0 1.0 0.0 0.0 0.0\n",
"8 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0\n",
"9 0.0 2.0 2.0 1.0 0.0 1.0 0.0 0.0\n",
"10 2.0 2.0 0.0 1.0 0.0 0.0 1.0 0.0\n",
"11 2.0 1.0 1.0 1.0 1.0 0.0 0.0 0.0\n",
"12 0.0 1.0 0.0 2.0 1.0 0.0 0.0 0.0\n",
"13 2.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0\n",
"14 1.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"new_df, trans = dp.df_to_numeric(df)\n",
"new_df"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[(0,\n",
" OneHotEncoder(sparse_output=False),\n",
" ['0_encoded_a', '0_encoded_b', '0_encoded_c', '0_encoded_d']),\n",
" ('A',\n",
" FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),\n",
" ['column']),\n",
" ('B',\n",
" FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),\n",
" ['column']),\n",
" ('C',\n",
" FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),\n",
" ['column']),\n",
" ('D',\n",
" FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),\n",
" ['column']),\n",
" ('0_encoded_a',\n",
" FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),\n",
" ['column']),\n",
" ('0_encoded_b',\n",
" FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),\n",
" ['column']),\n",
" ('0_encoded_c',\n",
" FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),\n",
" ['column']),\n",
" ('0_encoded_d',\n",
" FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),\n",
" ['column'])]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"trans.transforms"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\Edouard\\Documents\\Git\\microwave\\.venv_microwave\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:110: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n",
" y = column_or_1d(y, warn=True)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>A</th>\n",
" <th>B</th>\n",
" <th>C</th>\n",
" <th>D</th>\n",
" <th>0_encoded</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>0.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>0.0</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>0.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" A B C D 0_encoded\n",
"0 0.0 2.0 2.0 2.0 1.0\n",
"1 1.0 2.0 1.0 1.0 2.0\n",
"2 1.0 0.0 1.0 1.0 3.0\n",
"3 0.0 0.0 1.0 0.0 3.0\n",
"4 2.0 1.0 2.0 2.0 2.0\n",
"5 0.0 0.0 0.0 0.0 3.0\n",
"6 0.0 2.0 2.0 2.0 0.0\n",
"7 0.0 2.0 0.0 0.0 0.0\n",
"8 0.0 1.0 0.0 0.0 0.0\n",
"9 0.0 2.0 2.0 1.0 1.0\n",
"10 2.0 2.0 0.0 1.0 2.0\n",
"11 2.0 1.0 1.0 1.0 0.0\n",
"12 0.0 1.0 0.0 2.0 0.0\n",
"13 2.0 1.0 0.0 1.0 2.0\n",
"14 1.0 0.0 0.0 1.0 1.0"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"new_df, trans = dp.df_to_numeric(df, encoding=\"label\")\n",
"new_df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv_microwave (3.13.2)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

File diff suppressed because it is too large

BIN
requirements.txt Normal file

Binary file not shown.