Compare commits
1 commit
timing_fun ... main

| Author | SHA1 | Date |
|---|---|---|
|  | 29936cb347 |  |
3
microwave/__init__.py
Normal file
@@ -0,0 +1,3 @@
from . import data_analysis
from . import utils
from . import math
2
microwave/data_analysis/__init__.py
Normal file
@@ -0,0 +1,2 @@
from . import ppscore
from . import univariate
1
microwave/data_analysis/ppscore/__init__.py
Normal file
@@ -0,0 +1 @@
from .ppscore import *
276
microwave/data_analysis/ppscore/ppscore.py
Normal file
@@ -0,0 +1,276 @@
from typing import Union, Callable, Optional, Any

from sklearn.metrics import make_scorer
from sklearn.tree import DecisionTreeRegressor
from sklearn.base import is_classifier, is_regressor
from sklearn.model_selection import cross_val_score
from joblib import Parallel, delayed

from ...utils import _verify_tabular_data_shape, sample_rows, nan_rows_mask, _sample, _to_series

import pandas as pd
import numpy as np


def _identify_case(model) -> str:
    """
    Identifies whether the given model is a classifier or a regressor.

    Args:
        model: Must be sklearn-compatible and either a regressor or a classifier.

    Returns:
        "classification" or "regression"

    Raises:
        ValueError: If the model cannot be determined to be either a classifier or a regressor.
    """
    if is_classifier(model):
        return "classification"
    elif is_regressor(model):
        return "regression"
    else:
        raise ValueError("The model cannot be determined to be either a classifier or a regressor")


def _get_baseline_score(y: Union[np.ndarray, pd.DataFrame], case: str, metric: Callable) -> float:
    """
    Calculates the expected metric result of a naive model against y.

    Args:
        y: shape[n, 1]; True values.
        case: "classification" or "regression"
        metric: Metric used to score the prediction. Must take in y_true, y_pred.

    Returns:
        A baseline score according to the metric: the score of a model predicting the median
        value for a regression, or the most frequent value for a classification.

    Raises:
        Nothing
    """
    y = _sample(y, ~nan_rows_mask(y))
    if case == "regression":
        base = np.full_like(y, np.median(y))
    elif case == "classification":
        values, counts = np.unique(y, return_counts=True)
        ind = np.argmax(counts)
        base = np.full_like(y, values[ind])
    return metric(y, base)


def _prepare_df(x: Any, y: Any, metric: Callable, model) -> pd.DataFrame:
    """
    Calculates the base information depending on the model, metric, and true values.

    Args:
        x: shape[n, m]; Predictors (features).
        y: shape[n, 1]; True values (targets).
        metric: Metric used to score the prediction. Must take in y_true, y_pred.
        model: Model to use. Must be sklearn-compatible and either a regressor or a classifier.

    Returns:
        A DataFrame containing the pps information, including:
        - ppscore: Placeholder for the predictive power score, initialized to NaN for each feature.
        - case: The kind of problem ("classification" or "regression").
        - metric: The name of the metric used.
        - perfect_score: The score when the model's predictions are perfect.
        - naive_score: The score of a naive model predicting the most frequent value (classification) or the median value (regression).
        - model_score: Placeholder for the model score, initialized to NaN.
        - model: The type of the model.

    Raises:
        Nothing
    """
    y = _sample(y, ~nan_rows_mask(y))
    case = _identify_case(model)
    baseline_score = _get_baseline_score(y, case, metric)
    perfect_score = metric(y, y)
    return pd.DataFrame({
        "ppscore": [np.nan] * (x.shape[1] if len(x.shape) > 1 else 1),
        "case": case,
        "metric": metric.__name__,
        "perfect_score": perfect_score,
        "naive_score": baseline_score,
        "model_score": np.nan,
        "model": type(model).__name__
    })


def _score(x: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.DataFrame], metric: Callable, model, metric_params: Optional[dict] = {}, crossvals: int = 5):
    """
    Returns the cross-validated score of the model according to the given metric.

    Args:
        x: shape[n, m]; Predictors (features).
        y: shape[n, 1]; True values (targets).
        metric: Metric used to score the prediction. Must take in y_true, y_pred.
        model: Model to use. Defaults to DecisionTreeRegressor. Must be sklearn-compatible.
        metric_params (optional): Additional parameters to pass to the metric function.
        crossvals (optional) [default=5]: Number of cross-validations to perform.

    Returns:
        The evaluation metric on the predictions of the model.

    Raises:
        Nothing
    """
    if model is None:
        model = DecisionTreeRegressor()
    nan_mask = nan_rows_mask(x, y)
    scores = cross_val_score(
        model,
        np.array(_sample(x, ~nan_mask)).reshape(-1, 1),
        np.array(_sample(y, ~nan_mask)).reshape(-1, 1),
        cv=crossvals,
        scoring=make_scorer(metric, **metric_params)
    )
    return scores.mean()


def _calc_ppscore(score: Union[int, float, np.ndarray, pd.Series],
                  naive_score: Union[int, float, np.ndarray, pd.Series],
                  perfect_score: Union[int, float, np.ndarray, pd.Series]) -> Union[float, np.ndarray, pd.Series]:
    """
    Calculates the predictive power score (pps) for given scores, naive scores, and perfect scores.

    Args:
        score: The actual score(s).
        naive_score: The naive score(s).
        perfect_score: The perfect score(s).

    Returns:
        The predictive power score(s).
    """
    # Remember the index before converting, so that a Series input yields a Series output.
    index = score.index if isinstance(score, pd.Series) else None
    score = np.asarray(score)
    naive_score = np.asarray(naive_score)
    perfect_score = np.asarray(perfect_score)
    pps = (score - naive_score) / (perfect_score - naive_score)
    pps = np.where(pps <= 0, 0, pps)
    if index is not None:
        return pd.Series(pps, index=index)
    return pps


def score(x: Any, y: Any, metric: Callable, model: Optional[object] = None, sample: Optional[int] = None, shuffle: bool = True, crossvals: int = 5) -> pd.DataFrame:
    """
    Calculates the predictive power score (pps) of x against y using a given model. The score is
    baselined between 0 and 1 depending on the kind of problem.

    Args:
        x: shape[n, 1] or [n,]; Predictors (features).
        y: shape[n, 1] or [n,]; True values (targets).
        metric: Metric used to score the prediction. Must take in y_true, y_pred, both array-likes.
        model (optional) [default=DecisionTreeRegressor]: Model to use. Must be sklearn-compatible and either a regressor or a classifier.
        sample (optional) [default=None]: Number of rows of x and y to sample to make calculations faster. None means no sampling.
        shuffle (optional) [default=True]: Whether to shuffle the rows of x and y.
        crossvals (optional) [default=5]: Number of cross-validations to perform when fitting and evaluating the model.

    Returns:
        The pps of x against y. The pps is (score - baseline) / (perfect_score - baseline), where the baseline is a naive model predicting the median for regression or the most common class for classification.

    Raises:
        Nothing
    """
    _verify_tabular_data_shape(x, y, is_column=True)
    x, y = _to_series(x), _to_series(y)
    x, y = sample_rows(x, y, sample=sample, shuffle=shuffle)
    if model is None:
        model = DecisionTreeRegressor()
    res_df = _prepare_df(x, y, metric, model)
    res_df["model_score"] = _score(x, y, metric, model, crossvals=crossvals)
    res_df["ppscore"] = _calc_ppscore(res_df["model_score"], res_df["naive_score"], res_df["perfect_score"])
    return res_df


def predictors(df: pd.DataFrame, y: Any, metric: Callable, model=None, crossvals: int = 5, njobs=1, sample: int = 5000, sort=True, shuffle=True):
    """
    Calculates the predictive power score (pps) of every column in df against y using a given model.
    The score is baselined between 0 and 1 depending on the kind of problem.

    Args:
        df: shape[n, m]; Predictors (features).
        y: shape[n, 1]; True values (targets).
        metric: Metric used to score the prediction. Must take in y_true, y_pred.
        model (optional) [default=DecisionTreeRegressor]: Model to use. Must be sklearn-compatible and either a regressor or a classifier.
        crossvals (optional) [default=5]: Number of cross-validations to perform.
        njobs (optional) [default=1]: Number of jobs to run in parallel.
        sample (optional) [default=5000]: Number of rows to sample from df and y for faster calculations. None means no sampling.
        sort (optional) [default=True]: Whether to sort the results by ppscore in descending order.
        shuffle (optional) [default=True]: Whether to shuffle the rows of df and y before processing.

    Returns:
        A DataFrame containing the pps of each column in df against y. The pps is (score - baseline) / (perfect_score - baseline), where the baseline is a naive model predicting the median for regression or the most common class for classification.

    Raises:
        Nothing
    """
    _verify_tabular_data_shape(df, is_column=False)
    _verify_tabular_data_shape(y, is_column=True)
    y = _to_series(y)
    df, y = sample_rows(df, y, sample=sample, shuffle=shuffle)
    if model is None:
        model = DecisionTreeRegressor()
    res_df = _prepare_df(df, y, metric, model)
    # Pass crossvals through so the argument is not silently ignored.
    res_df["model_score"] = Parallel(n_jobs=njobs)(delayed(_score)(df.iloc[:, [i]], y, metric, model, crossvals=crossvals) for i in range(df.shape[1]))
    res_df["ppscore"] = _calc_ppscore(res_df["model_score"], res_df["naive_score"], res_df["perfect_score"])
    res_df.insert(0, "x", df.columns)
    if sort:
        res_df = res_df.sort_values("ppscore", ascending=False)
    return res_df.reset_index(drop=True)


def _predictors_of_col(df: pd.DataFrame, col: str, metric: Callable, model, **kwargs) -> pd.DataFrame:
    """
    Calculates the predictive power score (pps) of all columns in df against the specified column using a given model and metric.

    Args:
        df: Input DataFrame containing predictors and the target column.
        col: Target column name.
        metric: Metric used to score the prediction. Must take in y_true, y_pred.
        model: Model to use. Must be sklearn-compatible and either a regressor or a classifier.

    Returns:
        A DataFrame containing the pps of each predictor in df against the specified target column. The pps is (score - baseline) / (perfect_score - baseline), where the baseline is a naive model predicting the median for regression or the most common class for classification.

    Raises:
        Nothing
    """
    y = df[[col]]
    df_pred = df.loc[:, df.columns != col]
    res = predictors(df_pred, y, metric, model=model, **kwargs)
    res.insert(1, "y", col)
    return res


def matrix(df: pd.DataFrame, metric: Callable, model=None, crossvals: int = 5, njobs: int = 1, sample: int = 5000, sort: bool = True, shuffle: bool = True) -> pd.DataFrame:
    """
    Calculates the predictive power score (pps) of every column in df against every other column in df using a given model.
    Scores are baselined between 0 and 1 depending on the nature of the problem.

    Args:
        df: shape[n, m]
        metric: Metric used to score the prediction. Must take in y_true, y_pred.
        model (optional) [default=DecisionTreeRegressor]: Model to use. Must be sklearn-compatible and either a regressor or a classifier.
        crossvals (optional) [default=5]: Number of cross-validations to perform.
        njobs (optional) [default=1]: Number of jobs to run in parallel.
        sample (optional) [default=5000]: Number of rows to sample from df for faster calculations. None means no sampling.
        sort (optional) [default=True]: Whether to sort the results by ppscore in descending order.
        shuffle (optional) [default=True]: Whether to shuffle the rows of df before processing.

    Returns:
        A DataFrame containing the pps of each predictor in df against every target column.
        The pps is (score - baseline) / (perfect_score - baseline), where the baseline is a naive model predicting the median for regression or the most common class for classification.

    Raises:
        Nothing
    """
    _verify_tabular_data_shape(df, is_column=False)
    df = sample_rows(df, sample=sample, shuffle=shuffle)[0]
    if model is None:
        model = DecisionTreeRegressor()
    res = Parallel(n_jobs=njobs)(delayed(_predictors_of_col)(df, col, metric, model, crossvals=crossvals, sample=None, sort=False, shuffle=False) for col in df.columns)
    res_df = pd.concat(res, axis=0)
    if sort:
        res_df = res_df.sort_values("ppscore", ascending=False)
    return res_df.reset_index(drop=True)


def mutual_predictors(matrix: pd.DataFrame, threshold: float = 0.9):
    """
    Returns the list of features that are most predicted by the others, above a given threshold. Intended for dimensionality reduction.

    For every feature, every pps above the threshold is summed, the most predicted feature is
    identified, and it is ignored in subsequent iterations.
    Continues until no considered pps is over the threshold.

    Args:
        matrix: A DataFrame containing the pps of each feature against every other.
        threshold (optional) [default=0.9]

    Returns:
        A list of the features that are the most predicted by others.

    Raises:
        Nothing
    """
    features = matrix["x"].unique()
    pred_mut = []
    while True:
        # Recompute the counts on each pass so scores are not double-counted across iterations.
        cols_predict_count = {feature: 0 for feature in features if feature not in pred_mut}
        for _, predict in matrix.iterrows():
            if predict.y in cols_predict_count and predict.ppscore > threshold:
                cols_predict_count[predict.y] += predict.ppscore
        if sum(cols_predict_count.values()) == 0:
            break
        # The feature whose incoming pps sum is highest is the "most predicted" one.
        most_predicted = max(cols_predict_count, key=cols_predict_count.get)
        pred_mut.append(most_predicted)
        matrix = matrix[matrix["x"] != most_predicted]
        matrix = matrix[matrix["y"] != most_predicted]
    return pred_mut
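The docstrings above fix the pps contract: a metric taking y_true, y_pred, rescaled as (score - baseline) / (perfect_score - baseline). A minimal usage sketch, assuming the package is importable and using sklearn's mean_absolute_error (with an error metric, the pps reduces to 1 - model_error / naive_error); the data and column names are made up for illustration:

```python
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from microwave.data_analysis import ppscore as pps

rng = np.random.default_rng(0)
df = pd.DataFrame({"a": rng.normal(size=200)})
df["b"] = 2 * df["a"] + rng.normal(scale=0.1, size=200)  # "b" is nearly a function of "a"
df["noise"] = rng.normal(size=200)

# pps of a single column against a target: expect a value close to 1
print(pps.score(df[["a"]], df[["b"]], mean_absolute_error))

# pps of every column of a frame against the same target: "noise" should land near 0
print(pps.predictors(df[["a", "noise"]], df[["b"]], mean_absolute_error))
```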
1
microwave/data_analysis/univariate/__init__.py
Normal file
@@ -0,0 +1 @@
from .aggregates import *
236
microwave/data_analysis/univariate/aggregates.py
Normal file
@@ -0,0 +1,236 @@
from typing import Any, Union, Optional, Callable
from joblib import Parallel, delayed
import numpy as np
import pandas as pd
import scipy.stats as stats


def inter_percentile_mean(x: np.ndarray, N1: Optional[float] = 0.1, N2: Optional[float] = 0.9) -> float:
    """
    Calculates the mean within a certain percentile range.

    Args:
        x: The considered ndarray.
        N1: Lower percentile (between 0 and 1)
        N2: Upper percentile (between 0 and 1)

    Returns:
        The IP-mean

    Raises:
        Nothing
    """
    # N1 and N2 are fractions, while np.percentile expects values in [0, 100].
    p1, p2 = np.percentile(x[~np.isnan(x)], [N1 * 100, N2 * 100])
    return np.mean(x[(x >= p1) & (x <= p2)])


def median_absolute_deviation(x: np.ndarray) -> float:
    """
    Calculates the median of the deviations from the median.

    Args:
        x: The considered ndarray.

    Returns:
        The MAD

    Raises:
        Nothing
    """
    return np.nanmedian(np.abs(x - np.nanmedian(x)))


def inter_percentile_range(x: np.ndarray, N1: Optional[float] = 0.1, N2: Optional[float] = 0.9) -> float:
    """
    Calculates the range within a certain percentile range.

    Args:
        x: The considered ndarray.
        N1: Lower percentile (between 0 and 1)
        N2: Upper percentile (between 0 and 1)

    Returns:
        The IP-range

    Raises:
        Nothing
    """
    # N1 and N2 are fractions, while np.percentile expects values in [0, 100].
    return np.percentile(x[~np.isnan(x)], N2 * 100) - np.percentile(x[~np.isnan(x)], N1 * 100)


def mode(x: np.ndarray) -> Any:
    """
    Calculates the mode of numeric and categorical variables.

    Args:
        x: The considered ndarray.

    Returns:
        The mode

    Raises:
        Nothing
    """
    if pd.api.types.is_numeric_dtype(x):
        return stats.mode(x, nan_policy='omit').mode
    else:
        return pd.Series(x).mode().iat[0]


def geothmetic_meandian(x: np.ndarray, iter: Optional[int] = 100) -> float:
    """
    https://xkcd.com/2435/

    Args:
        x: The considered ndarray.
        iter: Number of recursive iterations to perform.

    Returns:
        The geothmetic meandian

    Raises:
        Nothing
    """
    if iter == 0:
        return x[0]
    return geothmetic_meandian(np.array([AGGFUNCCODES["mean"](x), AGGFUNCCODES["gmean"](x), AGGFUNCCODES["median"](x)]), iter=iter - 1)


def get_n_outliers(x: np.ndarray, n_sig: Union[float, int] = 3) -> int:
    """
    Calculates the number of outliers with the z-score method.

    Args:
        x: The considered ndarray.
        n_sig: Number of standard deviations beyond which a value is considered an outlier.

    Returns:
        The number of outliers

    Raises:
        Nothing
    """
    return np.sum(np.abs(stats.zscore(x, nan_policy='omit')) > n_sig)


AGGFUNCCODES = {
    # Counts
    "size": len,
    "non-null": lambda x: len(x) - pd.isna(x).sum(),
    "nunique": lambda x: pd.Series(x).nunique(dropna=True),

    # Basic
    "sum": lambda x: np.nansum(x) if np.issubdtype((x.to_numpy() if isinstance(x, pd.Series) else x).dtype, np.number) else np.nan,
    "min": np.nanmin,
    "max": np.nanmax,
    "first": lambda x: x[~pd.isna(x)][0],
    "last": lambda x: x[~pd.isna(x)][-1],

    # Centricity
    "mean": np.nanmean,
    "median": np.nanmedian,
    "mode": mode,
    "gmean": lambda x: stats.gmean(x[~pd.isna(x)]),
    "hmean": lambda x: stats.hmean(x[~pd.isna(x)]),
    "Pmean": inter_percentile_mean,
    "geothmetic meandian": geothmetic_meandian,

    # Dispersion
    "variance": np.nanvar,
    "std": np.nanstd,
    "mad": median_absolute_deviation,
    "skewness": lambda x: stats.skew(x, nan_policy='omit'),
    "excesskurtosis": lambda x: stats.kurtosis(x, fisher=False, nan_policy='omit'),
    "range": lambda x: np.nanmax(x) - np.nanmin(x),
    "Prange": inter_percentile_range,
    "n_outliers": get_n_outliers,

    # Percentiles
    "P75": lambda x: np.percentile(x[~pd.isna(x)], 75),
    "P25": lambda x: np.percentile(x[~pd.isna(x)], 25),
    "P10": lambda x: np.percentile(x[~pd.isna(x)], 10),
    "P90": lambda x: np.percentile(x[~pd.isna(x)], 90),
    "PN": lambda x, N: np.percentile(x[~pd.isna(x)], N),

    # Distribution
    "skewtest": lambda x, **kwargs: stats.skewtest(x, nan_policy='omit', **kwargs),
    "kurtosistest": lambda x, **kwargs: stats.kurtosistest(x, nan_policy='omit', **kwargs),
    "normaltest": lambda x, **kwargs: stats.normaltest(x, nan_policy='omit', **kwargs),
    "jarque_bera": lambda x, **kwargs: stats.jarque_bera(x, nan_policy='omit', **kwargs),
    "shapiro": lambda x, **kwargs: stats.shapiro(x, nan_policy='omit', **kwargs),
    "anderson": lambda x, **kwargs: stats.anderson(x, **kwargs),

    # Other
    "energy": lambda x: np.nansum(x**2),
    "rms": lambda x: np.sqrt(np.nanmean(x**2)),
    "entropy": lambda x: stats.entropy(pd.Series(x).value_counts(normalize=True), base=2),
    "autocorrelation": lambda x, lag=1: pd.Series(x).autocorr(lag=lag)
}


def execute_agg_func(x: Any, agg: Union[Callable, str, dict]) -> Any:
    """
    Executes a given aggregation function on the given data.

    If the function returns multiple values, returns a dict with a name for each value, defaulting to a, b, c, ...

    Accepts a callable, a str corresponding to an AGGFUNCCODES key, or a dict with either a callable or an AGGFUNCCODES key at key 'func'.

    Args:
        x: The data to execute the function on.
        agg: The function to execute.

    Returns:
        Either the raw return value if it is a single value, or a dict of named return values if the function returned a tuple. The values can be renamed by passing a dict with key 'ret_names'.

    Raises:
        Nothing
    """
    ret_names = None
    if callable(agg):
        ret = agg(x)
    elif isinstance(agg, str):
        ret = AGGFUNCCODES[agg](x)
    elif isinstance(agg, dict):
        kwargs = {} if 'kwargs' not in agg else agg['kwargs']
        if callable(agg['func']):
            ret = agg['func'](x, **kwargs)
        elif isinstance(agg['func'], str):
            ret = AGGFUNCCODES[agg['func']](x, **kwargs)
        if 'ret_names' in agg:
            ret_names = agg['ret_names']
    if isinstance(ret, tuple):
        if ret_names is None:
            ret_names = [chr(c) for c in range(ord('a'), ord('a') + len(ret))]
        return dict(zip(ret_names, ret))
    else:
        return ret


def calculate_aggregates(x: Any, aggs: list[Union[Callable, str, dict[str, Union[str, Callable]]]]) -> dict[str, Any]:
    """
    Executes a given list of aggregation functions on the given data.

    Args:
        x: The data to execute the functions on.
        aggs: The functions to execute.

    Returns:
        A dict containing named values. The names can be specified in each agg function by setting 'name' or 'ret_names' in the dict.

    Raises:
        Nothing
    """
    results = {}
    for i, func in enumerate(aggs):
        funcname = f"func_{i}"
        try:
            ret = execute_agg_func(x, func)
        except Exception as e:
            print(f"agg_{i}", func, e)
            ret = np.nan
        if isinstance(func, dict):
            if 'name' in func:
                funcname = func['name']
            elif isinstance(func['func'], str):
                funcname = func['func']
        elif isinstance(func, str):
            funcname = func if func not in results.keys() else f"{func}_{i}"
        if isinstance(ret, dict):
            results.update(dict(zip([f"{funcname}_{x}" for x in ret.keys()], ret.values())))
        else:
            results[funcname] = ret
    return results


def build_univariate_statistics(df: pd.DataFrame, agg: Optional[Union[str, list[Union[str, dict[str, dict]]]]] = "all", n_jobs: int = 1) -> pd.DataFrame:
    """
    Calculates the specified univariate statistics for each column in the DataFrame.

    Args:
        df: The input DataFrame.
        agg: List of aggregation functions to apply.
            Each element can be a function name (str) or a dict with the function name as the key and args as another dict.
        n_jobs: Number of parallel processes to open. -1 means as many as possible.

    Returns:
        pd.DataFrame: DataFrame with one row per column of df and one column per aggregate.

    Raises:
        Nothing
    """
    if agg == "all":
        agg = list(AGGFUNCCODES.keys())
    results = Parallel(n_jobs=n_jobs)(delayed(calculate_aggregates)(df[col].values, agg) for col in df.columns)
    return pd.DataFrame(results, index=df.columns)
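calculate_aggregates and build_univariate_statistics accept a mix of spec formats: plain AGGFUNCCODES keys, parameterized dicts, and named callables. A short sketch, assuming the package is importable; the column names and the custom "span" aggregate are made up for illustration:

```python
import numpy as np
import pandas as pd
from microwave.data_analysis.univariate import build_univariate_statistics

df = pd.DataFrame({
    "x": [1.0, 2.0, np.nan, 4.0, 100.0],
    "y": [0.5, 0.5, 0.7, 0.9, 1.1],
})

aggs = [
    "mean",                                                        # plain AGGFUNCCODES key
    "median",
    {"func": "PN", "kwargs": {"N": 95}, "name": "P95"},            # parameterized code with a custom name
    {"func": lambda v: float(np.nanmax(v) - np.nanmin(v)), "name": "span"},  # named custom callable
]
stats_df = build_univariate_statistics(df, agg=aggs)
print(stats_df)  # one row per column of df, one column per aggregate
```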
2
microwave/data_processing/__init__.py
Normal file
@@ -0,0 +1,2 @@
from .df_preprocessing import *
from .dfTransformer import dfTransformer
75
microwave/data_processing/dfTransformer.py
Normal file
@@ -0,0 +1,75 @@
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from typing import Optional


class dfTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.transforms = []

    def add_transform(self, column_name: Optional[str], transformer: TransformerMixin, result_columns: Optional[list[str]] = None):
        """
        Adds a transform specific to a column, with optional result column names.

        Args:
            column_name (str): The name of the column to transform. For a nested dfTransformer, use None.
            transformer (TransformerMixin): The transformation object to apply to the column. Must have a .transform method.
            result_columns (list of str, optional): List of names for the resulting columns. Default is None.
        """
        if not hasattr(transformer, 'transform'):
            raise ValueError("The transformer must have a 'transform' method.")
        self.transforms.append((column_name, transformer, result_columns))

    def fit(self, X: pd.DataFrame, y=None):
        """
        Fit method to conform with TransformerMixin. Fits the transformers one by one on their specified columns.

        Args:
            X (pd.DataFrame): The DataFrame to fit.
            y: Ignored.

        Returns:
            self: Fitted transformer.
        """
        for column_name, transformer, _ in self.transforms:
            if isinstance(transformer, dfTransformer):
                transformer.fit(X, y)
            elif column_name in X:
                transformer.fit(X[[column_name]], y)  # Fit the transformer on the specific column
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Applies all stored transformations to the DataFrame, drops the original columns,
        and returns the transformed DataFrame.

        Args:
            X (pd.DataFrame): The DataFrame to transform.

        Returns:
            pd.DataFrame: The transformed DataFrame.
        """
        X_transformed = X.copy()

        for column_name, transformer, result_columns in self.transforms:
            if isinstance(transformer, dfTransformer):
                X_transformed = transformer.transform(X_transformed)
            # elif, to mirror fit(): a nested dfTransformer has no column of its own.
            elif column_name in X_transformed:
                transformed_data = transformer.transform(X_transformed[[column_name]])

                # Check if the transformed data is a DataFrame; if not, convert it
                if isinstance(transformed_data, pd.DataFrame):
                    transformed_cols = transformed_data
                else:
                    transformed_cols = pd.DataFrame(transformed_data, index=X_transformed.index)
                if result_columns:
                    transformed_cols.columns = result_columns
                else:
                    transformed_cols.columns = [
                        f"{column_name}_transformed_{i}" for i in range(transformed_data.shape[1])
                    ]

                X_transformed.drop(columns=[column_name], inplace=True)
                X_transformed = pd.concat([X_transformed, transformed_cols], axis=1)

        return X_transformed
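A minimal sketch of driving dfTransformer by hand, assuming the package is importable; the column names and the StandardScaler choice are illustrative:

```python
import pandas as pd
from sklearn.preprocessing import StandardScaler
from microwave.data_processing import dfTransformer

df = pd.DataFrame({"height": [1.6, 1.7, 1.9], "city": ["a", "b", "a"]})

t = dfTransformer()
# Scale one column and give the result an explicit name.
t.add_transform("height", StandardScaler(), result_columns=["height_scaled"])
t.fit(df)
print(t.transform(df))  # "height" is dropped and replaced by "height_scaled"; "city" is untouched
```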
75
microwave/data_processing/df_preprocessing.py
Normal file
@@ -0,0 +1,75 @@
import pandas as pd
from typing import Optional
from .dfTransformer import dfTransformer
from sklearn.base import clone
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder, FunctionTransformer


def _get_encoder(encoding):
    if encoding == 'onehot':
        return OneHotEncoder(sparse_output=False)
    elif encoding == 'label':
        return LabelEncoder()
    elif encoding == 'ordinal':
        return OrdinalEncoder()
    else:
        raise ValueError("Unsupported encoding type.")


def _get_encoded_col_names(encoder, prefix="", suffix="", ret_shape: Optional[list] = None):
    # Default to None instead of a mutable default, which would be shared (and mutated) across calls.
    if ret_shape is None:
        ret_shape = [1]
    if len(ret_shape) == 1:
        ret_shape = ret_shape + [1]
    if hasattr(encoder, "categories_"):
        colnames = ["_".join([str(x) for x in (prefix, suffix, cat) if len(str(x)) > 0]) for cat in encoder.categories_[0]]
    elif ret_shape[1] == 1:
        colnames = ["_".join([str(x) for x in (prefix, suffix) if len(str(x)) > 0])]
    elif ret_shape[1] > 1:
        colnames = [
            "_".join([str(x) for x in (prefix, suffix, i) if len(str(x)) > 0]) for i in range(ret_shape[1])
        ]
    return colnames


def df_to_numeric(df, encoding='onehot'):
    """
    Processes a DataFrame by converting numeric columns to float and applying categorical encoding to non-numeric columns.

    Args:
        df (pd.DataFrame): The input DataFrame to process.
        encoding (str or TransformerMixin): The encoding method to apply to categorical columns. Can be a string specifying predefined options from sklearn or a transformer instance.

    Returns:
        tuple: A tuple containing the transformed DataFrame and the fitted dfTransformer object.
    """
    transformer = dfTransformer()
    X_transformed = df.copy()

    if isinstance(encoding, str):
        encoder = _get_encoder(encoding)
    elif hasattr(encoding, "fit_transform"):
        # Transformer instances are generally not callable, so test for the estimator API instead.
        encoder = encoding
    else:
        raise ValueError("Encoding must be either a string or a transformer with a 'fit_transform' method.")

    for column in X_transformed.columns:
        if not pd.api.types.is_numeric_dtype(df[column]):
            # Clone so each categorical column keeps its own fitted encoder instead of sharing one instance.
            col_encoder = clone(encoder)
            transformed_data = col_encoder.fit_transform(X_transformed[[column]])
            result_columns = _get_encoded_col_names(col_encoder, prefix=column, suffix="encoded", ret_shape=list(transformed_data.shape))
            if isinstance(transformed_data, pd.DataFrame):
                transformed_cols = transformed_data
            else:
                transformed_cols = pd.DataFrame(transformed_data, index=X_transformed.index)
            transformed_cols.columns = result_columns

            transformer.add_transform(column, col_encoder, result_columns=result_columns)

            X_transformed.drop(columns=[column], inplace=True)
            X_transformed = pd.concat([X_transformed, transformed_cols], axis=1)

    encoder = FunctionTransformer(lambda x: x.astype(float), validate=False)
    for column in X_transformed.columns:
        transformed_data = encoder.fit_transform(X_transformed[[column]])
        transformer.add_transform(column, encoder, result_columns=[column])  # record the real column name, not the literal "column"

        X_transformed.drop(columns=[column], inplace=True)
        X_transformed = pd.concat([X_transformed, transformed_data], axis=1)

    return X_transformed, transformer
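Because df_to_numeric returns the fitted dfTransformer alongside the converted frame, the same encoding can be replayed on new rows with the same schema. A sketch with made-up data, assuming the package is importable:

```python
import pandas as pd
from microwave.data_processing import df_to_numeric

train = pd.DataFrame({"n": [1, 2, 3], "cat": ["x", "y", "x"]})
train_num, fitted = df_to_numeric(train, encoding="onehot")

# Replay the fitted pipeline on unseen rows; the encoded layout matches train_num.
new = pd.DataFrame({"n": [4], "cat": ["y"]})
print(fitted.transform(new))
```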
39
microwave/math/__init__.py
Normal file
@@ -0,0 +1,39 @@
import math
from typing import Optional, Union
import numpy as np


def gaussian(x: Union[np.ndarray, float], mu: float, sig: float) -> Union[np.ndarray, float]:
    """
    Calculates the height of the specified gaussian at point x.

    Args:
        x: point(s) at which to calculate the height
        mu: The gaussian's mean
        sig: The gaussian's standard deviation

    Returns:
        The height(s): a single number, or an ndarray if x is an ndarray

    Raises:
        Nothing
    """
    return np.exp(-(x - mu)**2 / (2 * sig**2)) / (sig * np.sqrt(2 * np.pi))


def gauss_integral(mu: float, sig: float, a: float = -np.inf, b: float = np.inf) -> float:
    """
    Calculates the definite gaussian integral between a and b. If omitted, a and b default to -inf and inf respectively.

    Args:
        mu: The gaussian's mean
        sig: The gaussian's standard deviation
        a: lower bound (default -inf)
        b: upper bound (default inf)

    Returns:
        The definite integral

    Raises:
        Nothing
    """
    if sig == 0:
        # Degenerate case: all of the mass sits at mu.
        if mu >= a and mu < b:
            return 1
        else:
            return 0
    return (math.erf((b - mu) / (sig * np.sqrt(2))) - math.erf((a - mu) / (sig * np.sqrt(2)))) / 2
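The closed form in gauss_integral can be sanity-checked numerically against a Riemann sum of gaussian; a quick sketch, assuming the package is importable (the mass within one standard deviation should come out near 0.6827):

```python
import numpy as np
from microwave.math import gaussian, gauss_integral

mu, sig = 0.0, 1.0
xs = np.linspace(-1, 1, 200_001)
dx = xs[1] - xs[0]
riemann = float(np.sum(gaussian(xs, mu, sig)) * dx)  # crude numerical integral over [-1, 1]
print(gauss_integral(mu, sig, -1, 1), riemann)       # both should be close to 0.6827
```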
9
microwave/utils/__init__.py
Normal file
@@ -0,0 +1,9 @@
from .arrayutils import _get_shape
from .arrayutils import _verify_tabular_data_shape
from .arrayutils import _verify_same_number_of_rows
from .arrayutils import _sample
from .arrayutils import sample_rows
from .arrayutils import nan_rows_mask
from .arrayutils import _to_series
from .arrayutils import _is_convertible_to_numpy_array
from .arrayutils import split_rows
224
microwave/utils/arrayutils.py
Normal file
@@ -0,0 +1,224 @@
from typing import Any, Union, Optional
import numpy as np
import pandas as pd


def _get_shape(x: Any) -> tuple:
    """
    Returns the shape of a given object.

    Args:
        x

    Returns:
        shape of x

    Raises:
        Nothing
    """
    if hasattr(x, "shape"):
        return x.shape
    elif _is_convertible_to_numpy_array(x):
        return np.array(x).shape
    else:
        return None


def _verify_tabular_data_shape(*args: Any, is_column: bool = False):
    """
    Verifies that the shapes of the given objects are coherent for tabular data.

    Args:
        *args: shape[n, m] or [n,]; Tabular data.
        is_column (optional) [default=False]: Set to True to raise an error if an object contains multiple columns.

    Returns:
        Nothing

    Raises:
        ValueError: If one or more of the given objects is not coherent with tabular data.
        ValueError: If is_column is True and one or more of the given objects has multiple columns.
    """
    for arg in args:
        shape = _get_shape(arg)
        if shape is None:
            raise ValueError(f"Input data has no shape: {arg}.")
        if len(shape) < 1 or len(shape) > 2:
            raise ValueError(f"Input data must be a tabular object. Has shape {shape}.")
        if is_column and len(shape) == 2 and 1 not in shape:
            raise ValueError(f"Input data must be a single column. Has shape {shape}.")


def _verify_same_number_of_rows(*args):
    """
    Verifies that the given objects have the same number of rows.

    Args:
        *args: shape[n, m] or [n,]

    Returns:
        Nothing

    Raises:
        ValueError: If one or more of the given objects has no shape.
        ValueError: If two objects have different numbers of rows.
    """
    # Check each shape before indexing into it, so shapeless inputs raise a clear error.
    shape = _get_shape(args[0])
    if shape is None:
        raise ValueError(f"Input data has no shape: {args[0]}.")
    n_rows = shape[0]
    for arg in args[1:]:
        shape = _get_shape(arg)
        if shape is None:
            raise ValueError(f"Input data has no shape: {arg}.")
        elem_rows = shape[0]
        if n_rows != elem_rows:
            raise ValueError(f"Input objects must have the same number of rows: {n_rows}, {elem_rows}.")


def _sample(x: Union[np.ndarray, pd.DataFrame, pd.Series], ind_list: Any) -> Union[np.ndarray, pd.DataFrame]:
    """
    Samples the rows of a numpy array or pandas DataFrame based on a list of indices.

    Args:
        x: The array or DataFrame to be sampled.
        ind_list: The list or array of indices that defines the new order of the rows.

    Returns:
        The sampled array or DataFrame.

    Raises:
        TypeError: If the input is neither a numpy array, a pandas DataFrame, nor a pandas Series.
    """
    if isinstance(x, np.ndarray):
        return x[ind_list]
    elif isinstance(x, (pd.DataFrame, pd.Series)):
        return x.iloc[ind_list]
    else:
        raise TypeError("Input must be a numpy array, pandas DataFrame, or pandas Series")


def sample_rows(*args: Any, sample: Optional[int] = None, shuffle: bool = True) -> tuple[Any]:
    """
    Samples the rows of the provided objects in the same way and optionally shuffles them.
    Tries to minimize the number of rows containing NaN.

    Args:
        *args: Input tabular data objects.
        sample: Number of samples to draw from each object. If None, no sampling is done.
        shuffle: If True and sample is None, shuffles the objects.

    Returns:
        Tuple of shuffled and/or sampled objects.

    Raises:
        ValueError: If input objects don't have the same number of rows.
    """
    _verify_same_number_of_rows(*args)
    n_rows = _get_shape(args[0])[0]
    nan_mask = nan_rows_mask(*args)
    full_indices = np.where(~nan_mask)[0]
    n_full_rows = len(full_indices)
    if sample is not None and sample < n_full_rows:
        indices = np.random.choice(full_indices, size=sample, replace=False)
        if not shuffle:
            indices.sort()
    elif sample is not None and sample < n_rows:
        # Not enough NaN-free rows: take them all, then top up with NaN rows.
        indices = np.concatenate((np.random.choice(full_indices, size=n_full_rows, replace=False), np.random.choice(np.where(nan_mask)[0], size=sample - n_full_rows, replace=False)))
        if not shuffle:
            indices.sort()
        else:
            np.random.shuffle(indices)  # otherwise the NaN rows would all sit at the end
    else:
        indices = np.arange(n_rows)
        if shuffle:
            indices = np.random.choice(indices, size=n_rows, replace=False)

    results = tuple(_sample(arg, indices) for arg in args)
    return results


def nan_rows_mask(*args: Any) -> np.ndarray:
    """
    Given a list of 2D numpy arrays or DataFrames with the same number of rows, returns a boolean mask that is True for every row where at least one of the objects has a NaN value.

    Args:
        *args: List of 2D numpy arrays or DataFrames with the same number of rows.

    Returns:
        Boolean mask indicating rows with at least one NaN.

    Raises:
        Nothing
    """
    _verify_same_number_of_rows(*args)
    n_rows = _get_shape(args[0])[0]
    # Initialize the mask with False values
    mask = np.zeros(n_rows, dtype=bool)
    for data in args:
        if isinstance(data, np.ndarray):
            if data.ndim == 1:
                data = data.reshape(-1, 1)
            mask |= np.isnan(data).any(axis=1)
        elif isinstance(data, pd.DataFrame):
            mask |= data.isna().to_numpy().any(axis=1)
        elif isinstance(data, pd.Series):
            mask |= data.isna().to_numpy()
        else:
            data = np.array(data)
            if data.ndim == 1:
                data = data.reshape(-1, 1)
            mask |= np.isnan(data).any(axis=1)
    return mask


def _to_series(data: Union[pd.Series, pd.DataFrame, np.ndarray]):
    """
    Converts a single-column object to a pandas Series.

    Args:
        data: A Series, a single-column DataFrame, or a 1-dimensional or single-column ndarray.

    Returns:
        The data as a pandas Series.

    Raises:
        ValueError: If the input has more than one column.
    """
    if isinstance(data, pd.Series):
        return data
    elif isinstance(data, pd.DataFrame):
        if data.shape[1] != 1:
            raise ValueError("DataFrame must have exactly one column to convert to Series")
        return data.iloc[:, 0]
    elif isinstance(data, np.ndarray):
        if data.ndim == 1:
            return pd.Series(data)
        elif data.ndim == 2 and data.shape[1] == 1:
            return pd.Series(data.ravel())
        else:
            raise ValueError("ndarray must be 1-dimensional or a 2-dimensional single-column array")


def _is_convertible_to_numpy_array(obj: Any) -> bool:
    """
    Verifies that a given object is convertible to a numpy array without error.

    Args:
        obj: object to check

    Returns:
        bool

    Raises:
        Nothing
    """
    if isinstance(obj, (list, tuple, dict, set)):
        return True
    if np.isscalar(obj):
        return True
    if hasattr(obj, '__array__'):
        return True
    return False


def split_rows(data: Union[pd.DataFrame, np.ndarray], bool_array: np.ndarray, drop_index: bool = True) -> list[Union[pd.DataFrame, np.ndarray]]:
    """
    Splits a pandas DataFrame or a numpy array based on a boolean indicator array.

    Args:
        data: The input data to split. Can be a pandas DataFrame or a numpy ndarray.
        bool_array: A 1D boolean array indicating where splits should occur. Must be the same length as `data`.
        drop_index: Whether to reset the index in the resulting DataFrame splits. Default is True.

    Returns:
        A list of the resulting split pd.DataFrames or np.ndarrays.

    Raises:
        ValueError: If the length of `bool_array` does not match the length of `data`.
    """
    if len(bool_array) != len(data):
        raise ValueError("The length of bool_array must match the length of data.")

    indices = np.where(bool_array)[0]
    indices = np.concatenate(([0], indices, [len(data)]))

    if isinstance(data, pd.DataFrame):
        return [data.iloc[start:end].reset_index(drop=drop_index)
                for start, end in zip(indices[:-1], indices[1:])
                if start != end]
    elif isinstance(data, np.ndarray):
        return [data[start:end]
                for start, end in zip(indices[:-1], indices[1:])
                if start != end]
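split_rows starts a new segment at every True flag, as the index arithmetic above implies; a small sketch with made-up data, assuming the package is importable:

```python
import numpy as np
import pandas as pd
from microwave.utils import split_rows

df = pd.DataFrame({"v": [1, 2, 3, 4, 5]})
flags = np.array([False, False, True, False, True])
for part in split_rows(df, flags):
    print(part["v"].tolist())  # prints [1, 2], then [3, 4], then [5]
```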
818
notebooks/demo_ppscore.ipynb
Normal file
File diff suppressed because one or more lines are too long
725
notebooks/demo_processing.ipynb
Normal file
725
notebooks/demo_processing.ipynb
Normal file
@@ -0,0 +1,725 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"import numpy as np\n",
|
||||||
|
"import string"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style scoped>\n",
|
||||||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||||||
|
" vertical-align: middle;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>A</th>\n",
|
||||||
|
" <th>B</th>\n",
|
||||||
|
" <th>C</th>\n",
|
||||||
|
" <th>D</th>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>2</td>\n",
|
||||||
|
" <td>2</td>\n",
|
||||||
|
" <td>2</td>\n",
|
||||||
|
" <td>b</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1</th>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>2</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>c</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>2</th>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>d</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>3</th>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>d</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>4</th>\n",
|
||||||
|
" <td>2</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>2</td>\n",
|
||||||
|
" <td>2</td>\n",
|
||||||
|
" <td>c</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>5</th>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>d</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>6</th>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>2</td>\n",
|
||||||
|
" <td>2</td>\n",
|
||||||
|
" <td>2</td>\n",
|
||||||
|
" <td>a</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>7</th>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>2</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>a</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>8</th>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>a</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>9</th>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>2</td>\n",
|
||||||
|
" <td>2</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>b</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>10</th>\n",
|
||||||
|
" <td>2</td>\n",
|
||||||
|
" <td>2</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>c</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>11</th>\n",
|
||||||
|
" <td>2</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>a</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>12</th>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>2</td>\n",
|
||||||
|
" <td>a</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>13</th>\n",
|
||||||
|
" <td>2</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>c</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>14</th>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>b</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" A B C D 0\n",
|
||||||
|
"0 0 2 2 2 b\n",
|
||||||
|
"1 1 2 1 1 c\n",
|
||||||
|
"2 1 0 1 1 d\n",
|
||||||
|
"3 0 0 1 0 d\n",
|
||||||
|
"4 2 1 2 2 c\n",
|
||||||
|
"5 0 0 0 0 d\n",
|
||||||
|
"6 0 2 2 2 a\n",
|
||||||
|
"7 0 2 0 0 a\n",
|
||||||
|
"8 0 1 0 0 a\n",
|
||||||
|
"9 0 2 2 1 b\n",
|
||||||
|
"10 2 2 0 1 c\n",
|
||||||
|
"11 2 1 1 1 a\n",
|
||||||
|
"12 0 1 0 2 a\n",
|
||||||
|
"13 2 1 0 1 c\n",
|
||||||
|
"14 1 0 0 1 b"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"df = pd.DataFrame(np.random.randint(0,3,size=(15, 4)), columns=list('ABCD'))\n",
|
||||||
|
"df = pd.concat([df, pd.DataFrame(np.random.choice(list(string.ascii_letters)[:4], size=15, replace=True))], axis=1)\n",
|
||||||
|
"df"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"c:\\Users\\Edouard\\Documents\\Git\\microwave\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"%cd ..\n",
|
||||||
|
"import microwave.data_processing as dp"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style scoped>\n",
|
||||||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||||||
|
" vertical-align: middle;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>A</th>\n",
|
||||||
|
" <th>B</th>\n",
|
||||||
|
" <th>C</th>\n",
|
||||||
|
" <th>D</th>\n",
|
||||||
|
" <th>0_encoded_a</th>\n",
|
||||||
|
" <th>0_encoded_b</th>\n",
|
||||||
|
" <th>0_encoded_c</th>\n",
|
||||||
|
" <th>0_encoded_d</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1</th>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>2</th>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>3</th>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>4</th>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>5</th>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>6</th>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>7</th>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>8</th>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>9</th>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>10</th>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>11</th>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>12</th>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>13</th>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>14</th>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" A B C D 0_encoded_a 0_encoded_b 0_encoded_c 0_encoded_d\n",
|
||||||
|
"0 0.0 2.0 2.0 2.0 0.0 1.0 0.0 0.0\n",
|
||||||
|
"1 1.0 2.0 1.0 1.0 0.0 0.0 1.0 0.0\n",
|
||||||
|
"2 1.0 0.0 1.0 1.0 0.0 0.0 0.0 1.0\n",
|
||||||
|
"3 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0\n",
|
||||||
|
"4 2.0 1.0 2.0 2.0 0.0 0.0 1.0 0.0\n",
|
||||||
|
"5 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0\n",
|
||||||
|
"6 0.0 2.0 2.0 2.0 1.0 0.0 0.0 0.0\n",
|
||||||
|
"7 0.0 2.0 0.0 0.0 1.0 0.0 0.0 0.0\n",
|
||||||
|
"8 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0\n",
|
||||||
|
"9 0.0 2.0 2.0 1.0 0.0 1.0 0.0 0.0\n",
|
||||||
|
"10 2.0 2.0 0.0 1.0 0.0 0.0 1.0 0.0\n",
|
||||||
|
"11 2.0 1.0 1.0 1.0 1.0 0.0 0.0 0.0\n",
|
||||||
|
"12 0.0 1.0 0.0 2.0 1.0 0.0 0.0 0.0\n",
|
||||||
|
"13 2.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0\n",
|
||||||
|
"14 1.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"new_df, trans = dp.df_to_numeric(df)\n",
|
||||||
|
"new_df"
|
||||||
|
]
|
||||||
|
},
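The call above shows df_to_numeric's default behaviour: the numeric columns A–D appear to pass through unchanged (cast to float), while the categorical column 0 is expanded into the one-hot indicator columns 0_encoded_a through 0_encoded_d. A minimal sketch of that expansion using scikit-learn directly (the helper name one_hot_column and the exact pipeline are assumptions for illustration, not microwave's actual implementation):

import pandas as pd
from sklearn.preprocessing import OneHotEncoder

def one_hot_column(df: pd.DataFrame, col) -> pd.DataFrame:
    # Dense output, matching the OneHotEncoder(sparse_output=False)
    # reported by trans.transforms in the next cell.
    enc = OneHotEncoder(sparse_output=False)
    encoded = enc.fit_transform(df[[col]])
    names = [f"{col}_encoded_{cat}" for cat in enc.categories_[0]]
    onehot = pd.DataFrame(encoded, columns=names, index=df.index)
    return pd.concat([df.drop(columns=[col]).astype(float), onehot], axis=1)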
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[(0,\n",
"  OneHotEncoder(sparse_output=False),\n",
"  ['0_encoded_a', '0_encoded_b', '0_encoded_c', '0_encoded_d']),\n",
" ('A',\n",
"  FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),\n",
"  ['column']),\n",
" ('B',\n",
"  FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),\n",
"  ['column']),\n",
" ('C',\n",
"  FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),\n",
"  ['column']),\n",
" ('D',\n",
"  FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),\n",
"  ['column']),\n",
" ('0_encoded_a',\n",
"  FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),\n",
"  ['column']),\n",
" ('0_encoded_b',\n",
"  FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),\n",
"  ['column']),\n",
" ('0_encoded_c',\n",
"  FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),\n",
"  ['column']),\n",
" ('0_encoded_d',\n",
"  FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),\n",
"  ['column'])]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"trans.transforms"
]
},
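Judging from the repr above, trans.transforms is a list of (source column, fitted transformer, output column names) tuples; the ['column'] entries attached to the pass-through FunctionTransformers look like placeholder names. Assuming that layout, the fitted pieces can be inspected with a short loop:

# Assumes the (column, transformer, output_columns) tuple layout shown above.
for col, transformer, out_cols in trans.transforms:
    print(f"{col!r}: {type(transformer).__name__} -> {out_cols}")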
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\Edouard\\Documents\\Git\\microwave\\.venv_microwave\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:110: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n",
"  y = column_or_1d(y, warn=True)\n"
]
},
{
"data": {
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style scoped>\n",
|
||||||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||||||
|
" vertical-align: middle;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>A</th>\n",
|
||||||
|
" <th>B</th>\n",
|
||||||
|
" <th>C</th>\n",
|
||||||
|
" <th>D</th>\n",
|
||||||
|
" <th>0_encoded</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1</th>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>2</th>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>3.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>3</th>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>3.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>4</th>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>5</th>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>3.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>6</th>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>7</th>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>8</th>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>9</th>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>10</th>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>11</th>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>12</th>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>13</th>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>14</th>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" A B C D 0_encoded\n",
|
||||||
|
"0 0.0 2.0 2.0 2.0 1.0\n",
|
||||||
|
"1 1.0 2.0 1.0 1.0 2.0\n",
|
||||||
|
"2 1.0 0.0 1.0 1.0 3.0\n",
|
||||||
|
"3 0.0 0.0 1.0 0.0 3.0\n",
|
||||||
|
"4 2.0 1.0 2.0 2.0 2.0\n",
|
||||||
|
"5 0.0 0.0 0.0 0.0 3.0\n",
|
||||||
|
"6 0.0 2.0 2.0 2.0 0.0\n",
|
||||||
|
"7 0.0 2.0 0.0 0.0 0.0\n",
|
||||||
|
"8 0.0 1.0 0.0 0.0 0.0\n",
|
||||||
|
"9 0.0 2.0 2.0 1.0 1.0\n",
|
||||||
|
"10 2.0 2.0 0.0 1.0 2.0\n",
|
||||||
|
"11 2.0 1.0 1.0 1.0 0.0\n",
|
||||||
|
"12 0.0 1.0 0.0 2.0 0.0\n",
|
||||||
|
"13 2.0 1.0 0.0 1.0 2.0\n",
|
||||||
|
"14 1.0 0.0 0.0 1.0 1.0"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"new_df, trans = dp.df_to_numeric(df, encoding=\"label\")\n",
|
||||||
|
"new_df"
|
||||||
|
]
|
||||||
|
},
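With encoding="label" the categorical column collapses into a single integer-coded 0_encoded column instead of four indicator columns; comparing the two outputs, the codes follow alphabetical category order (a -> 0.0, b -> 1.0, c -> 2.0, d -> 3.0), consistent with sklearn's LabelEncoder. The DataConversionWarning in stderr comes from handing the encoder a one-column frame where a 1-D array is expected; ravel() (or passing a Series) silences it. A hedged sketch of that path, assuming df is the demo frame built earlier in the notebook and that LabelEncoder is indeed what df_to_numeric uses internally:

from sklearn.preprocessing import LabelEncoder

enc = LabelEncoder()
# 1-D input avoids the "column-vector y" DataConversionWarning seen above.
codes = enc.fit_transform(df[0].to_numpy().ravel())
labeled = df.drop(columns=[0]).astype(float)
labeled["0_encoded"] = codes.astype(float)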
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv_microwave (3.13.2)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
1234
notebooks/demo_univariate_aggregates.ipynb
Normal file
File diff suppressed because it is too large
BIN
requirements.txt
Normal file
Binary file not shown.