diff --git a/microwave/__init__.py b/microwave/__init__.py
new file mode 100644
index 0000000..a19e12c
--- /dev/null
+++ b/microwave/__init__.py
@@ -0,0 +1,3 @@
+from . import data_analysis
+from . import utils
+from . import math
\ No newline at end of file
diff --git a/microwave/data_analysis/__init__.py b/microwave/data_analysis/__init__.py
new file mode 100644
index 0000000..4a8885c
--- /dev/null
+++ b/microwave/data_analysis/__init__.py
@@ -0,0 +1,2 @@
+from . import ppscore
+from . import univariate
\ No newline at end of file
diff --git a/microwave/data_analysis/ppscore/__init__.py b/microwave/data_analysis/ppscore/__init__.py
new file mode 100644
index 0000000..14b9e1f
--- /dev/null
+++ b/microwave/data_analysis/ppscore/__init__.py
@@ -0,0 +1 @@
+from .ppscore import *
\ No newline at end of file
diff --git a/microwave/data_analysis/ppscore/ppscore.py b/microwave/data_analysis/ppscore/ppscore.py
new file mode 100644
index 0000000..1f9ffd3
--- /dev/null
+++ b/microwave/data_analysis/ppscore/ppscore.py
@@ -0,0 +1,276 @@
+from typing import Union, Callable, Optional, Any
+from sklearn.metrics import make_scorer
+from sklearn.tree import DecisionTreeRegressor
+from sklearn.base import is_classifier, is_regressor
+from sklearn.model_selection import cross_val_score
+from joblib import Parallel, delayed
+from ...utils import _verify_tabular_data_shape, sample_rows, nan_rows_mask, _sample, _to_series
+import pandas as pd
+import numpy as np
+
+
+def _identify_case(model) -> str:
+ """
+ Identifies if the given model is a classifier or regressor.
+ Args:
+        model: Must be sklearn-compatible and either a regressor or classifier.
+ Returns:
+ "classification" or "regression"
+ Raises:
+ ValueError: If the model cannot be determined to be either a classifier or a regressor
+ """
+ if is_classifier(model):
+ return "classification"
+ elif is_regressor(model):
+ return "regression"
+ else:
+ raise ValueError("The model cannot be determined to be either a classifier or a regressor")
+
+
+def _get_baseline_score(y: Union[np.ndarray, pd.DataFrame], case: str, metric: Callable) -> float:
+ """
+ Calculates the expected metric result of a naive model against y.
+ Args:
+ y: shape[n,1]; True values
+ case: "classification" or "regression"
+ metric: Metric to use to score the prediction. Must take in y_true, y_pred.
+ Returns:
+ A baseline score according to the metric. Will be the score of a model predicting the median value for a regression or the most frequent value for a classification.
+ Raises:
+ Nothing
+ """
+ y = _sample(y, ~nan_rows_mask(y))
+ if case == "regression":
+ base = np.full_like(y, np.median(y))
+ elif case == "classification":
+ values, counts = np.unique(y, return_counts=True)
+ ind = np.argmax(counts)
+ base = np.full_like(y, values[ind])
+ return metric(y, base)
+
+
+def _prepare_df(x: Any, y: Any, metric: Callable, model) -> pd.DataFrame:
+ """
+ Calculates the base information depending on the model, metric and true values.
+ Args:
+ x: shape[n, m]; Predictors (features).
+ y: shape[n, 1]; True values (targets).
+ metric: Metric to use to score the prediction. Must take in y_true, y_pred.
+        model: Model to use. Must be sklearn-compatible and either a regressor or classifier.
+ Returns:
+        A DataFrame containing the pps information, including:
+            - ppscore: Placeholder for the predictive power score, initialized to NaN for each feature.
+            - case: The type of problem ("classification" or "regression").
+ - metric: The name of the metric used.
+ - perfect_score: The score when the model's predictions are perfect.
+ - naive_score: The score of a naive model predicting the most frequent value (for classifier) or the median value (for regressor).
+ - model_score: Placeholder for the model score, initialized to NaN.
+ - model: The type of the model.
+ Raises:
+ Nothing
+ """
+ y = _sample(y, ~nan_rows_mask(y))
+ case = _identify_case(model)
+ baseline_score = _get_baseline_score(y, case, metric)
+ perfect_score = metric(y, y)
+ return pd.DataFrame({
+ "ppscore": [np.nan]*(x.shape[1] if len(x.shape) > 1 else 1),
+ "case": case,
+ "metric": metric.__name__,
+ "perfect_score": perfect_score,
+ "naive_score": baseline_score,
+ "model_score": np.nan,
+ "model": type(model).__name__
+ })
+
+
+def _score(x: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.DataFrame], metric: Callable, model, metric_params: Optional[dict]={}, crossvals: int=5):
+ """
+ Returns the score according to the given metric of a fitted model
+ Args:
+ x: shape[n, m]; Predictors (features).
+ y: shape[n,1]; True values (targets).
+ metric: Metric to use to score the prediction. Must take in y_true, y_pred.
+ model: Model to use. Defaults to DecisionTreeRegressor. Must be sklearn-compatible.
+ metric_params (optional): Additional parameters to pass to the metric function.
+ crossvals (optional) [default=5]: Number of cross-validations to perform.
+ Returns:
+ The evaluation metric on the prediction of the model
+ Raises:
+ Nothing
+ """
+ if model is None:
+ model = DecisionTreeRegressor()
+ nan_mask = nan_rows_mask(x, y)
+ scores = cross_val_score(
+ model,
+ np.array(_sample(x, ~nan_mask)).reshape(-1, 1),
+ np.array(_sample(y, ~nan_mask)).reshape(-1, 1),
+ cv=crossvals,
+ scoring=make_scorer(metric, **metric_params)
+ )
+ return scores.mean()
+
+
+def _calc_ppscore(score: Union[int, float, np.ndarray, pd.Series],
+ naive_score: Union[int, float, np.ndarray, pd.Series],
+ perfect_score: Union[int, float, np.ndarray, pd.Series]) -> Union[float, np.ndarray, pd.Series]:
+ """
+ Calculates the predictive power score (pps) for given scores, naive scores, and perfect scores.
+ Args:
+ score: The actual score(s).
+ naive_score: The naive score(s).
+ perfect_score: The perfect score(s).
+ Returns:
+ The predictive power score(s).
+ """
+    # Remember whether the input was a Series before converting, so the index can be restored afterwards.
+    is_series = isinstance(score, pd.Series)
+    index = score.index if is_series else None
+    score = np.asarray(score)
+    naive_score = np.asarray(naive_score)
+    perfect_score = np.asarray(perfect_score)
+    pps = (score - naive_score) / (perfect_score - naive_score)
+    pps = np.where(pps <= 0, 0, pps)
+    if is_series:
+        return pd.Series(pps, index=index)
+    return pps
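+
+# Worked example (illustrative numbers only): with an error metric such as MSE, where lower is better,
+# naive_score=10.0, perfect_score=0.0 and model_score=2.5 give
+# (2.5 - 10.0) / (0.0 - 10.0) = 0.75, i.e. the model removes 75% of the naive error.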
+
+
+def score(x: Any, y: Any, metric: Callable, model: Optional[object]=None, sample: Optional[int]=None, shuffle: bool=True, crossvals: int=5) -> pd.DataFrame:
+ """
+ Calculates the predictive power score (pps) of x against y using a given model. Score will be baselined between 0 and 1 depending on the kind of problem.
+ Args:
+ x: shape[n,1] or [n,]; Predictors (features).
+ y: shape[n,1] or [n,]; True values (targets).
+ metric: Metric to use to score the prediction. Must take in y_true, y_pred, both array_likes.
+        model (optional) [default=DecisionTreeRegressor]: Model to use. Must be sklearn-compatible and either a regressor or classifier.
+        sample (optional) [default=None]: Number of rows to sample from x and y to make calculations faster. None means no sampling.
+        shuffle (optional) [default=True]: Whether to shuffle the rows of x and y.
+ crossvals (optional) [default=5]: Amount of crossvalidations to make when fitting and evaluating the model.
+ Returns:
+ The pps of x against y. The pps is (score - baseline)/(perfect_score - baseline) where the baseline is a naive model predicting the median for regression or the most common class for classification.
+ Raises:
+ Nothing
+ """
+ _verify_tabular_data_shape(x, y, is_column=True)
+ x, y = _to_series(x), _to_series(y)
+ x, y = sample_rows(x, y, sample=sample, shuffle=shuffle)
+ if model is None:
+ model = DecisionTreeRegressor()
+ res_df = _prepare_df(x, y, metric, model)
+ res_df["model_score"] = _score(x, y, metric, model, crossvals=crossvals)
+ res_df["ppscore"] = _calc_ppscore(res_df["model_score"], res_df["naive_score"], res_df["perfect_score"])
+ return res_df
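+
+# Illustrative usage (hedged sketch; assumes sklearn's mean_squared_error as the metric and made-up toy data):
+#     from sklearn.metrics import mean_squared_error
+#     x = pd.Series(np.linspace(0, 10, 200))
+#     y = 3 * x + np.random.normal(scale=0.5, size=200)
+#     score(x, y, mean_squared_error)  # one-row DataFrame; ppscore close to 1 for this near-linear relation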
+
+
+def predictors(df: pd.DataFrame, y: Any, metric: Callable, model=None, crossvals: int=5, njobs=1, sample: int=5000, sort=True, shuffle=True):
+ """
+ Calculates the predictive power score (pps) of every column in df against y using a given model. Score will be baselined between 0 and 1 depending on the kind of problem.
+ Args:
+ df: shape[n, m]; Predictors (features).
+ y: shape[n, 1]; True values (targets).
+ metric: Metric to use to score the prediction. Must take in y_true, y_pred.
+ model (optional) [default=DecisionTreeRegressor]: Model to use. Must be sklearn-compatible and either a regressor or classifier.
+        crossvals (optional) [default=5]: Number of cross-validations to perform.
+ njobs (optional) [default=1]: Number of jobs to run in parallel.
+ sample (optional) [default=5000]: Number of rows to sample from df and y for faster calculations. None means no sampling.
+ sort (optional) [default=True]: Whether to sort the results by ppscore in descending order.
+ shuffle (optional) [default=True]: Whether to shuffle the rows of df and y before processing.
+ Returns:
+ A DataFrame containing the pps of each column in df against y. The pps is (score - baseline)/(perfect_score - baseline), where the baseline is a naive model predicting the median for regression or the most common class for classification.
+ Raises:
+ Nothing
+ """
+ _verify_tabular_data_shape(df, is_column=False)
+ _verify_tabular_data_shape(y, is_column=True)
+ y = _to_series(y)
+ df, y = sample_rows(df, y, sample=sample, shuffle=shuffle)
+ if model is None:
+ model = DecisionTreeRegressor()
+ res_df = _prepare_df(df, y, metric, model)
+    res_df["model_score"] = Parallel(n_jobs=njobs)(delayed(_score)(df.iloc[:, [i]], y, metric, model, crossvals=crossvals) for i in range(df.shape[1]))
+ res_df["ppscore"] = _calc_ppscore(res_df["model_score"], res_df["naive_score"], res_df["perfect_score"])
+ res_df.insert(0, "x", df.columns)
+ if sort:
+ res_df = res_df.sort_values("ppscore", ascending=False)
+ return res_df.reset_index(drop=True)
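+
+# Illustrative usage (hedged sketch; "target" is a hypothetical column name, mean_squared_error from sklearn):
+#     predictors(df.drop(columns=["target"]), df["target"], mean_squared_error, njobs=-1)
+# returns one row per feature, sorted by ppscore when sort=True.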
+
+
+def _predictors_of_col(df: pd.DataFrame, col: str, metric: Callable, model, **kwargs) -> pd.DataFrame:
+ """
+ Calculates the predictive power score (pps) of all columns in df against the specified column using a given model and metric.
+ Args:
+ df: Input DataFrame containing predictors and the target column.
+ col: Target column name.
+ metric: Metric to use to score the prediction. Must take in y_true, y_pred.
+        model: Model to use. Must be sklearn-compatible and either a regressor or classifier.
+ Returns:
+ A DataFrame containing the pps of each predictor in df against the specified target column. The pps is (score - baseline)/(perfect_score - baseline), where the baseline is a naive model predicting the median for regression or the most common class for classification.
+
+ Raises:
+ Nothing
+ """
+ y = df[[col]]
+ df_pred = df.loc[:, df.columns != col]
+ res = predictors(df_pred, y, metric, model=model, **kwargs)
+ res.insert(1, "y", col)
+ return res
+
+
+def matrix(df: pd.DataFrame, metric: Callable, model=None, crossvals: int=5, njobs: int=1, sample: int=5000, sort: bool=True, shuffle: bool=True) -> pd.DataFrame:
+ """
+ Calculates the predictive power score (pps) of every column in df against every other column in df using a given model.
+ Scores will be baselined between 0 and 1 depending on the nature of the problem.
+ Args:
+ df: shape[n, m]
+ metric: Metric to use to score the prediction. Must take in y_true, y_pred.
+ model (optional) [default=DecisionTreeRegressor]: Model to use. Must be sklearn-compatible and either a regressor or classifier.
+        crossvals (optional) [default=5]: Number of cross-validations to perform.
+        njobs (optional) [default=1]: Number of jobs to run in parallel.
+        sample (optional) [default=5000]: Number of rows to sample from df for faster calculations. None means no sampling.
+        sort (optional) [default=True]: Whether to sort the results by ppscore in descending order.
+        shuffle (optional) [default=True]: Whether to shuffle the rows of df before processing.
+ Returns:
+ A DataFrame containing the pps of each predictor in df against every target column.
+ The pps is (score - baseline) / (perfect_score - baseline), where the baseline is a naive model predicting the median for regression or the most common class for classification.
+ Raises:
+ Nothing
+ """
+ _verify_tabular_data_shape(df, is_column=False)
+ df = sample_rows(df, sample=sample, shuffle=shuffle)[0]
+ if model is None:
+ model = DecisionTreeRegressor()
+ res = Parallel(n_jobs=njobs)(delayed(_predictors_of_col)(df, col, metric, model, crossvals=crossvals, sample=None, sort=False, shuffle=False) for col in df.columns)
+ res_df = pd.concat(res, axis=0)
+ if sort:
+ res_df = res_df.sort_values("ppscore", ascending=False)
+ return res_df.reset_index(drop=True)
+
+
+def mutual_predictors(matrix: pd.DataFrame, threshold: float=0.9):
+ """
+    Returns the list of features that are the most predicted by the others, above a given threshold. Intended for dimensionality reduction.
+    For every feature, every pps above the threshold is summed; the most predicted feature is identified and ignored in subsequent iterations.
+    This continues until no remaining pps is above the threshold.
+ Args:
+ matrix: A DataFrame containing the pps of each feature against every other.
+        threshold (optional) [default=0.9]: Minimum ppscore for a feature to be counted as predicted by another.
+ Returns:
+ A list of features that are the most predicted by others
+ Raises:
+ Nothing
+ """
+    features = list(matrix["x"].unique())
+    pred_mut = []
+    while True:
+        # Recount predictability from scratch each round so already-removed features no longer contribute.
+        cols_predict_count = dict(zip(features, [0] * len(features)))
+        for _, predict in matrix.iterrows():
+            if predict.y in cols_predict_count and predict.ppscore > threshold:
+                cols_predict_count[predict.y] += predict.ppscore
+        if sum(cols_predict_count.values()) == 0:
+            break
+        best_predictor = max(cols_predict_count, key=cols_predict_count.get)
+        pred_mut.append(best_predictor)
+        features.remove(best_predictor)
+        matrix = matrix[matrix["x"] != best_predictor]
+        matrix = matrix[matrix["y"] != best_predictor]
+    return pred_mut
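+
+
+# Illustrative pipeline (hedged sketch; df and the 0.9 threshold are hypothetical, mean_squared_error from sklearn):
+# compute the full pps matrix, then drop the features that are almost fully predicted by the remaining ones.
+#     pps_matrix = matrix(df, mean_squared_error, njobs=-1)
+#     redundant = mutual_predictors(pps_matrix, threshold=0.9)
+#     reduced_df = df.drop(columns=redundant)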
\ No newline at end of file
diff --git a/microwave/data_analysis/univariate/__init__.py b/microwave/data_analysis/univariate/__init__.py
new file mode 100644
index 0000000..2da107a
--- /dev/null
+++ b/microwave/data_analysis/univariate/__init__.py
@@ -0,0 +1 @@
+from .aggregates import *
\ No newline at end of file
diff --git a/microwave/data_analysis/univariate/aggregates.py b/microwave/data_analysis/univariate/aggregates.py
new file mode 100644
index 0000000..60ea286
--- /dev/null
+++ b/microwave/data_analysis/univariate/aggregates.py
@@ -0,0 +1,236 @@
+from typing import Any, Union, Optional, Callable
+from joblib import Parallel, delayed
+import numpy as np
+import pandas as pd
+import scipy.stats as stats
+
+
+def inter_percentile_mean(x: np.ndarray, N1: Optional[float] = 0.1, N2: Optional[float] = 0.9) -> float:
+ """
+ Calculates the mean within a certain percentile range
+ Args:
+ x: The considered ndarray.
+ N1: Lower percentile (between 0 and 1)
+ N2: Upper percentile (between 0 and 1)
+ Returns:
+ The IP-mean
+ Raises:
+ Nothing
+ """
+    p1, p2 = np.percentile(x[~np.isnan(x)], [N1 * 100, N2 * 100])  # N1/N2 are fractions; np.percentile expects values in [0, 100]
+ return np.mean(x[(x >= p1) & (x <= p2)])
+
+
+def median_absolute_deviation(x: np.ndarray) -> float:
+ """
+ Calculates the median of the deviations from the median
+ Args:
+ x: The considered ndarray.
+ Returns:
+ The MAD
+ Raises:
+ Nothing
+ """
+ return np.nanmedian(np.abs(x - np.nanmedian(x)))
+
+
+def inter_percentile_range(x: np.ndarray, N1: Optional[float] = 0.1, N2: Optional[float] = 0.9) -> float:
+ """
+ Calculates the range within a certain percentile range
+ Args:
+ x: The considered ndarray.
+ N1: Lower percentile (between 0 and 1)
+ N2: Upper percentile (between 0 and 1)
+ Returns:
+ The IP-range
+ Raises:
+ Nothing
+ """
+    x_clean = x[~np.isnan(x)]
+    return np.percentile(x_clean, N2 * 100) - np.percentile(x_clean, N1 * 100)  # N1/N2 are fractions; np.percentile expects values in [0, 100]
+
+
+def mode(x: np.ndarray) -> Any:
+ """
+ Calculates the mode of numeric and categorical variables
+ Args:
+ x: The considered ndarray.
+ Returns:
+ The mode
+ Raises:
+ Nothing
+ """
+ if pd.api.types.is_numeric_dtype(x):
+ return stats.mode(x, nan_policy='omit').mode
+ else:
+ return pd.Series(x).mode().iat[0]
+
+
+def geothmetic_meandian(x: np.ndarray, iter: Optional[int] = 100) -> float:
+ """
+ https://xkcd.com/2435/
+ Args:
+ x: The considered ndarray.
+        iter: Number of recursive averaging iterations to perform before returning.
+ Returns:
+ The geothmetic meandian
+ Raises:
+ Nothing
+ """
+ if iter == 0:
+ return x[0]
+ return geothmetic_meandian(np.array([AGGFUNCCODES["mean"](x), AGGFUNCCODES["gmean"](x), AGGFUNCCODES["median"](x)]), iter = iter - 1)
+
+
+def get_n_outliers(x: np.ndarray, n_sig: Union[float, int] = 3) -> int:
+ """
+ Calculates the amount of outliers with the zscore method.
+ Args:
+ x: The considered ndarray.
+ n_sig: Number of standard deviations before being considered an outlier
+ Returns:
+ The number of outliers
+ Raises:
+ Nothing
+ """
+ return np.sum(np.abs(stats.zscore(x, nan_policy='omit')) > n_sig)
+
+
+AGGFUNCCODES = {
+ # Counts
+ "size": len,
+ "non-null": lambda x: len(x) - pd.isna(x).sum(),
+ "nunique": lambda x: pd.Series(x).nunique(dropna=True),
+
+ # Basic
+ "sum": lambda x: np.nansum(x) if np.issubdtype((x.to_numpy() if isinstance(x, pd.Series) else x).dtype, np.number) else np.nan,
+ "min": np.nanmin,
+ "max": np.nanmax,
+ "first": lambda x: x[~pd.isna(x)][0],
+ "last": lambda x: x[~pd.isna(x)][-1],
+
+ # Centricity
+ "mean": np.nanmean,
+ "median": np.nanmedian,
+ "mode": mode,
+ "gmean": lambda x: stats.gmean(x[~pd.isna(x)]),
+ "hmean": lambda x: stats.hmean(x[~pd.isna(x)]),
+ "Pmean": inter_percentile_mean,
+ "geothmetic meandian": geothmetic_meandian,
+
+ # Dispersion
+ "variance": np.nanvar,
+ "std": np.nanstd,
+ "mad": median_absolute_deviation,
+ "skewness": lambda x: stats.skew(x, nan_policy='omit'),
+ "excesskurtosis": lambda x: stats.kurtosis(x, fisher=False, nan_policy='omit'),
+ "range": lambda x: np.nanmax(x) - np.nanmin(x),
+ "Prange": inter_percentile_range,
+ "n_outliers": get_n_outliers,
+
+ # Percentiles
+ "P75": lambda x: np.percentile(x[~pd.isna(x)], 75),
+ "P25": lambda x: np.percentile(x[~pd.isna(x)], 25),
+ "P10": lambda x: np.percentile(x[~pd.isna(x)], 10),
+ "P90": lambda x: np.percentile(x[~pd.isna(x)], 90),
+ "PN": lambda x, N: np.percentile(x[~pd.isna(x)], N),
+
+ # Distribution
+ "skewtest": lambda x, **kwargs: stats.skewtest(x, nan_policy='omit', **kwargs),
+ "kurtosistest": lambda x, **kwargs: stats.kurtosistest(x, nan_policy='omit', **kwargs),
+ "normaltest": lambda x, **kwargs: stats.normaltest(x, nan_policy='omit', **kwargs),
+ "jarque_bera": lambda x, **kwargs: stats.jarque_bera(x, nan_policy='omit', **kwargs),
+ "shapiro": lambda x, **kwargs: stats.shapiro(x, nan_policy='omit', **kwargs),
+ "anderson": lambda x, **kwargs: stats.anderson(x, **kwargs),
+
+ # Other
+ "energy": lambda x: np.nansum(x**2),
+ "rms": lambda x: np.sqrt(np.nanmean(x**2)),
+ "entropy": lambda x: stats.entropy(pd.Series(x).value_counts(normalize=True), base=2),
+ "autocorrelation": lambda x, lag=1: pd.Series(x).autocorr(lag=lag)
+}
+
+
+def execute_agg_func(x: Any, agg: Union[Callable, str, dict]) -> Any:
+ """
+ Executes a given aggregation function on a given data.
+    If the function returns multiple values, a dict is returned with a name for each value, defaulting to a, b, c, ...
+ Can accept a callable, a str corresponding to an AGGFUNCCODE, or a dictionary with either a callable or an AGGFUNCCODE at key 'func'
+ Args:
+ x: The data to execute the function on.
+ agg: The function to execute.
+ Returns:
+        Either the raw return value, or a dict of named values if the function returned a tuple. The names can be overridden by passing a dict with key 'ret_names'.
+ Raises:
+ Nothing
+ """
+ ret_names = None
+ if callable(agg):
+ ret = agg(x)
+ elif isinstance(agg, str):
+ ret = AGGFUNCCODES[agg](x)
+ elif isinstance(agg, dict):
+ kwargs = {} if 'kwargs' not in agg else agg['kwargs']
+ if callable(agg['func']):
+ ret = agg['func'](x, **kwargs)
+ elif isinstance(agg['func'], str):
+ ret = AGGFUNCCODES[agg['func']](x, **kwargs)
+ if 'ret_names' in agg:
+ ret_names = agg['ret_names']
+ if isinstance(ret, tuple):
+ if ret_names is None:
+ ret_names = [chr(c) for c in range(ord('a'), ord('a')+len(ret))]
+ return dict(zip(ret_names, ret))
+ else:
+ return ret
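+
+# Illustrative call forms (hedged sketch; `values` is a hypothetical 1-D ndarray):
+#     execute_agg_func(values, "median")                                              # -> a single float
+#     execute_agg_func(values, {"func": "PN", "kwargs": {"N": 95}})                   # -> the 95th percentile
+#     execute_agg_func(values, {"func": "normaltest", "ret_names": ["stat", "pvalue"]})  # -> dict with two named entries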
+
+
+def calculate_aggregates(x: Any, aggs: list[Union[Callable, str, dict[str, Union[str, Callable]]]]) -> dict[str, Any]:
+ """
+ Executes a given list of aggregation functions on a given data.
+ Args:
+ x: The data to execute the functions on.
+ aggs: The functions to execute.
+ Returns:
+        A dict containing named values. The names can be specified for each agg function via the 'name' and 'ret_names' keys of a dict spec.
+ Raises:
+ Nothing
+ """
+ results = {}
+ for i, func in enumerate(aggs):
+ funcname = f"func_{i}"
+ try:
+ ret = execute_agg_func(x, func)
+ except Exception as e:
+ print(f"agg_{i}", func, e)
+ ret = np.nan
+ if isinstance(func, dict):
+ if 'name' in func:
+ funcname = func['name']
+ elif isinstance(func['func'], str):
+ funcname = func['func']
+ elif isinstance(func, str):
+ funcname = func if func not in results.keys() else f"{func}_{i}"
+ if isinstance(ret, dict):
+            results.update({f"{funcname}_{key}": value for key, value in ret.items()})
+ else:
+ results[funcname] = ret
+ return results
+
+
+def build_univariate_statistics(df: pd.DataFrame, agg: Optional[Union[str, list[Union[str, dict[str, dict]]]]] = "all", n_jobs: int = 1) -> pd.DataFrame:
+ """
+ Calculates specified univariate statistics for each column in the DataFrame.
+ Args:
+ df: The input DataFrame.
+ agg: List of aggregation functions to apply.
+            Each element can be a function name (str), a callable, or a dict with keys 'func' (name or callable), 'kwargs', 'name' and 'ret_names'.
+ n_jobs: number of parallel processes to open. -1 means as many as possible.
+ Returns:
+        pd.DataFrame: DataFrame with one row per column of df and one column per computed aggregate.
+ Raises:
+ Nothing
+ """
+ if agg == "all":
+ agg = list(AGGFUNCCODES.keys())
+ results = Parallel(n_jobs=n_jobs)(delayed(calculate_aggregates)(df[col].values, agg) for col in df.columns)
+ return pd.DataFrame(results, index=df.columns)
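+
+
+if __name__ == "__main__":
+    # Minimal illustrative sketch on made-up data; the column names and aggregate selection are arbitrary.
+    rng = np.random.default_rng(0)
+    demo = pd.DataFrame({"height": rng.normal(170, 10, 500), "weight": rng.normal(70, 8, 500)})
+    print(build_univariate_statistics(demo, agg=["mean", "std", "median", "n_outliers"]))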
diff --git a/microwave/data_processing/__init__.py b/microwave/data_processing/__init__.py
new file mode 100644
index 0000000..42b6f08
--- /dev/null
+++ b/microwave/data_processing/__init__.py
@@ -0,0 +1,2 @@
+from .df_preprocessing import *
+from .dfTransformer import dfTransformer
\ No newline at end of file
diff --git a/microwave/data_processing/dfTransformer.py b/microwave/data_processing/dfTransformer.py
new file mode 100644
index 0000000..06b9b80
--- /dev/null
+++ b/microwave/data_processing/dfTransformer.py
@@ -0,0 +1,75 @@
+import pandas as pd
+from sklearn.base import BaseEstimator, TransformerMixin
+from typing import Optional
+
+
+class dfTransformer(BaseEstimator, TransformerMixin):
+ def __init__(self):
+ self.transforms = []
+
+ def add_transform(self, column_name: str, transformer: TransformerMixin, result_columns: Optional[list[str]] = None):
+ """
+ Adds a transform specific to a column with optional result column names.
+
+ Args:
+            column_name (str): The name of the column to transform. Pass None when the transformer is itself a dfTransformer applied to the whole frame.
+ transformer (TransformerMixin): The transformation object to apply to the column. Must have a .transform method.
+ result_columns (list of str, optional): List of names for the resulting columns. Default is None.
+ """
+ if not hasattr(transformer, 'transform'):
+ raise ValueError("The transformer must have a 'transform' method.")
+ self.transforms.append((column_name, transformer, result_columns))
+
+ def fit(self, X: pd.DataFrame, y=None):
+ """
+ Fit method to conform with TransformerMixin. Fits transformers one by one on specified columns.
+
+ Args:
+ X (pd.DataFrame): The DataFrame to fit.
+ y: Ignored.
+
+ Returns:
+ self: Fitted transformer.
+ """
+ for column_name, transformer, _ in self.transforms:
+ if isinstance(transformer, dfTransformer):
+ transformer.fit(X, y)
+ elif column_name in X:
+ transformer.fit(X[[column_name]], y) # Fit the transformer on the specific column
+ return self
+
+ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
+ """
+ Applies all stored transformations to the DataFrame, drops the original columns,
+ and returns the transformed DataFrame.
+
+ Args:
+ X (pd.DataFrame): The DataFrame to transform.
+
+ Returns:
+ pd.DataFrame: The transformed DataFrame.
+ """
+ X_transformed = X.copy()
+
+ for column_name, transformer, result_columns in self.transforms:
+ if isinstance(transformer, dfTransformer):
+ X_transformed = transformer.transform(X_transformed)
+ if column_name in X_transformed:
+ transformed_data = transformer.transform(X_transformed[[column_name]])
+
+ # Check if the transformed data is a DataFrame; if not, convert it
+ if isinstance(transformed_data, pd.DataFrame):
+ transformed_cols = transformed_data
+ else:
+ transformed_cols = pd.DataFrame(transformed_data, index=X_transformed.index)
+ if result_columns:
+ transformed_cols.columns = result_columns
+ else:
+ transformed_cols.columns = [
+ f"{column_name}_transformed_{i}" for i in range(transformed_data.shape[1])
+ ]
+
+ X_transformed.drop(columns=[column_name], inplace=True)
+ X_transformed = pd.concat([X_transformed, transformed_cols], axis=1)
+
+ return X_transformed
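+
+
+if __name__ == "__main__":
+    # Minimal illustrative sketch (hypothetical column names): scale one column and leave the other untouched.
+    from sklearn.preprocessing import StandardScaler
+    demo = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [10, 20, 30]})
+    t = dfTransformer()
+    t.add_transform("a", StandardScaler(), result_columns=["a_scaled"])
+    print(t.fit(demo).transform(demo))  # column "a" is replaced by "a_scaled"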
diff --git a/microwave/data_processing/df_preprocessing.py b/microwave/data_processing/df_preprocessing.py
new file mode 100644
index 0000000..8bd18e0
--- /dev/null
+++ b/microwave/data_processing/df_preprocessing.py
@@ -0,0 +1,75 @@
+import pandas as pd
+from .dfTransformer import dfTransformer
+from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder, FunctionTransformer
+
+
+def _get_encoder(encoding):
+ if encoding == 'onehot':
+ return OneHotEncoder(sparse_output=False)
+ elif encoding == 'label':
+ return LabelEncoder()
+ elif encoding == 'ordinal':
+ return OrdinalEncoder()
+ else:
+ raise ValueError("Unsupported encoding type.")
+
+
+def _get_encoded_col_names(encoder, prefix="", suffix="", ret_shape: list = [1]):
+    if len(ret_shape) == 1:
+        ret_shape = ret_shape + [1]  # copy instead of mutating the caller's list or the shared default
+ if hasattr(encoder, "categories_"):
+ colnames = ["_".join([str(x) for x in (prefix, suffix, cat) if len(str(x)) > 0]) for cat in encoder.categories_[0]]
+ elif ret_shape[1] == 1:
+ colnames = ["_".join([str(x) for x in (prefix, suffix) if len(str(x)) > 0])]
+ elif ret_shape[1] > 1:
+ colnames = [
+ "_".join([str(x) for x in (prefix, suffix, i) if len(str(x)) > 0]) for i in range(ret_shape[1])
+ ]
+ return colnames
+
+
+def df_to_numeric(df, encoding='onehot'):
+ """
+ Processes a DataFrame by converting numeric columns to float and applying categorical encoding to non-numeric columns.
+
+ Args:
+ df (pd.DataFrame): The input DataFrame to process.
+ encoding (str or TransformerMixin): The encoding method to apply to categorical columns. Can be a string specifying predefined options from sklearn or a callable.
+
+ Returns:
+        tuple: A tuple containing the transformed DataFrame and the fitted dfTransformer object.
+ """
+ transformer = dfTransformer()
+ X_transformed = df.copy()
+
+ if isinstance(encoding, str):
+ encoder = _get_encoder(encoding)
+ elif callable(encoding):
+ encoder = encoding
+ else:
+ raise ValueError("Encoding must be either a string or a callable transformer.")
+
+ for column in X_transformed.columns:
+ if not pd.api.types.is_numeric_dtype(df[column]):
+ transformed_data = encoder.fit_transform(X_transformed[[column]])
+ result_columns = _get_encoded_col_names(encoder, prefix=column, suffix="encoded", ret_shape=list(transformed_data.shape))
+ if isinstance(transformed_data, pd.DataFrame):
+ transformed_cols = transformed_data
+ else:
+ transformed_cols = pd.DataFrame(transformed_data, index=X_transformed.index)
+ transformed_cols.columns = result_columns
+
+ transformer.add_transform(column, encoder, result_columns=result_columns)
+
+ X_transformed.drop(columns=[column], inplace=True)
+ X_transformed = pd.concat([X_transformed, transformed_cols], axis=1)
+
+ encoder = FunctionTransformer(lambda x: x.astype(float), validate=False)
+ for column in X_transformed.columns:
+ transformed_data = encoder.fit_transform(X_transformed[[column]])
+        transformer.add_transform(column, encoder, result_columns=[column])
+
+ X_transformed.drop(columns=[column], inplace=True)
+ X_transformed = pd.concat([X_transformed, transformed_data], axis=1)
+
+ return X_transformed, transformer
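+
+
+if __name__ == "__main__":
+    # Minimal illustrative sketch on made-up data: one numeric and one categorical column.
+    demo = pd.DataFrame({"age": [21, 35, 42], "city": ["Paris", "Lyon", "Paris"]})
+    numeric_df, fitted_transformer = df_to_numeric(demo, encoding="onehot")
+    print(numeric_df.dtypes)  # every remaining column should now be float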
diff --git a/microwave/math/__init__.py b/microwave/math/__init__.py
new file mode 100644
index 0000000..990c52d
--- /dev/null
+++ b/microwave/math/__init__.py
@@ -0,0 +1,39 @@
+import math
+from typing import Optional, Union
+import numpy as np
+
+
+def gaussian(x:Union[np.ndarray, float], mu:float, sig:float) -> Union[np.ndarray, float]:
+ """
+ Calculates the height of a specified gaussian at point x
+ Args:
+ x: point(s) at which to calculate the height
+ mu: The gaussian's mean
+ sig: The gaussian's standard deviation
+ Returns:
+ The height(s), as unique number or ndarray if x is ndarray
+ Raises:
+ Nothing
+ """
+ return np.exp(-(x-mu)**2/(2*sig**2))/(sig*np.sqrt(2*np.pi))
+
+
+def gauss_integral(mu: float, sig: float, a: float=-np.inf, b: float=np.inf) -> float:
+ """
+    Calculates the definite Gaussian integral between a and b. If not given, a and b default to -inf and +inf respectively.
+ Args:
+ mu: The gaussian's mean
+ sig: The gaussian's standard deviation
+        a: Lower bound (defaults to -inf)
+        b: Upper bound (defaults to +inf)
+    Returns:
+        The value of the definite integral
+ Raises:
+ Nothing
+ """
+ if sig == 0:
+ if mu >= a and mu < b:
+ return 1
+ else:
+ return 0
+ return (math.erf((b-mu)/(sig*np.sqrt(2)))-math.erf((a-mu)/(sig*np.sqrt(2))))/2
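+
+
+if __name__ == "__main__":
+    # Quick illustrative checks (approximate values): ~68.3% of a Gaussian's mass lies within one sigma,
+    # and the standard normal density peaks at ~0.3989.
+    print(gauss_integral(mu=0.0, sig=1.0, a=-1.0, b=1.0))  # ~0.6827
+    print(gaussian(0.0, mu=0.0, sig=1.0))                  # ~0.3989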
\ No newline at end of file
diff --git a/microwave/utils/__init__.py b/microwave/utils/__init__.py
new file mode 100644
index 0000000..a82ff3a
--- /dev/null
+++ b/microwave/utils/__init__.py
@@ -0,0 +1,9 @@
+from .arrayutils import _get_shape
+from .arrayutils import _verify_tabular_data_shape
+from .arrayutils import _verify_same_number_of_rows
+from .arrayutils import _sample
+from .arrayutils import sample_rows
+from .arrayutils import nan_rows_mask
+from .arrayutils import _to_series
+from .arrayutils import _is_convertible_to_numpy_array
+from .arrayutils import split_rows
diff --git a/microwave/utils/arrayutils.py b/microwave/utils/arrayutils.py
new file mode 100644
index 0000000..d43f76a
--- /dev/null
+++ b/microwave/utils/arrayutils.py
@@ -0,0 +1,224 @@
+from typing import Any, Union, Optional
+import numpy as np
+import pandas as pd
+
+
+def _get_shape(x: Any) -> tuple:
+ """
+ Returns the shape of a given object
+ Args:
+ x
+ Returns:
+ shape of x
+ Raises:
+ Nothing
+ """
+ if hasattr(x, "shape"):
+ return x.shape
+ elif _is_convertible_to_numpy_array(x):
+ return np.array(x).shape
+ else:
+ return None
+
+
+def _verify_tabular_data_shape(*args: Any, is_column: bool = False):
+ """
+ Verifies that the shape of the given objects are coherent for tabular data.
+ Args:
+ *args: shape[n,m] or [n,]; Tabular data.
+ is_column (optional) [default=False]: Set to True to raise an error if an object contains multiple columns.
+ Returns:
+ Nothing
+ Raises:
+ ValueError: If one or more of the given objects is not coherent with tabular data.
+ ValueError: If is_column is true and one or more of the given objects have multiple columns.
+ """
+ for arg in args:
+ shape = _get_shape(arg)
+ if shape is None:
+ raise ValueError(f"Input data has no shape: {arg}.")
+ if len(shape) < 1 or len(shape) > 2:
+ raise ValueError(f"Input data must be a tabular object. Has shape {shape}.")
+ if is_column and len(shape) == 2 and 1 not in shape:
+ raise ValueError(f"Input data must be a single column. Has shape {shape}.")
+
+
+def _verify_same_number_of_rows(*args):
+ """
+    Verifies that the given objects have the same number of rows.
+ Args:
+ *args: shape[n,m] or [n,]
+ Returns:
+ Nothing
+ Raises:
+ ValueError: If one or more of the given objects has no rows.
+ ValueError: If two objects have different amounts of rows.
+ """
+    n_rows = _get_shape(args[0])[0]
+    for arg in args[1:]:
+        shape = _get_shape(arg)
+        if shape is None:
+            raise ValueError(f"Input data has no shape: {arg}.")
+        elif n_rows != shape[0]:
+            raise ValueError(f"Input objects must have the same number of rows: {n_rows}, {shape[0]}.")
+
+
+def _sample(x: Union[np.ndarray, pd.DataFrame, pd.Series], ind_list: Any) -> Union[np.ndarray, pd.DataFrame]:
+ """
+ Samples the rows of a numpy array or pandas DataFrame based on a list of indices.
+ Args:
+ x: The array or DataFrame to be shuffled.
+ ind_list: The list or array of indices that defines the new order of the rows.
+ Returns:
+ The shuffled array or DataFrame.
+ Raises:
+        TypeError: If the input is not a numpy array, pandas DataFrame, or pandas Series.
+ """
+ if isinstance(x, np.ndarray):
+ return x[ind_list]
+ elif isinstance(x, pd.DataFrame):
+ return x.iloc[ind_list]
+ elif isinstance(x, pd.Series):
+ return x.iloc[ind_list]
+ else:
+ raise TypeError("Input must be a numpy array, pandas DataFrame, or pandas Series")
+
+
+def sample_rows(*args: Any, sample: Optional[int] = None, shuffle: bool = True) -> tuple[Any]:
+ """
+ Samples rows of the provided objects in the same way and optionally shuffles them.
+ Tries to minimize the amount of rows containing nan.
+ Args:
+ *args: Input tabular data objects.
+ sample: Number of samples to draw from each object. If None, no sampling is done.
+        shuffle: If True, the returned rows are in random order; if False, the original row order is preserved.
+
+ Returns:
+ Tuple of shuffled and/or sampled objects.
+
+ Raises:
+ ValueError: If input objects don't have the same number of rows.
+ """
+ _verify_same_number_of_rows(*args)
+ n_rows = _get_shape(args[0])[0]
+ nan_mask = nan_rows_mask(*args)
+ full_indices = np.where(~nan_mask)[0]
+ n_full_rows = len(full_indices)
+ if sample is not None and sample < n_full_rows:
+ indices = np.random.choice(full_indices, size=sample, replace=False)
+ if not shuffle:
+ indices.sort()
+ elif sample is not None and sample < n_rows:
+ indices = np.concatenate((np.random.choice(full_indices, size=n_full_rows, replace=False), np.random.choice(np.where(nan_mask)[0], size=sample - n_full_rows, replace=False)))
+ if not shuffle:
+ indices.sort()
+ else:
+ indices = np.arange(n_rows)
+ if shuffle:
+ indices = np.random.choice(indices, size=n_rows, replace=False)
+
+ results = tuple(_sample(arg, indices) for arg in args)
+ return results
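+
+# Illustrative behaviour (hedged sketch; x and y are hypothetical inputs): rows containing NaN are only
+# drawn when fewer than `sample` complete rows exist, so the returned subsets stay as complete as possible.
+#     x_sub, y_sub = sample_rows(x, y, sample=1000, shuffle=True)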
+
+
+def nan_rows_mask(*args: Any) -> np.ndarray:
+ """
+ Given a list of 2D numpy arrays or DataFrames with the same number of rows, return a boolean mask that is True for every row where at least one of the objects has a NaN value.
+ Args:
+        *args: 2D numpy arrays, Series or DataFrames with the same number of rows.
+ Returns:
+ Boolean mask indicating rows with at least one NaN.
+ Raises:
+ Nothing
+ """
+ # Initialize the mask with False values
+ _verify_same_number_of_rows(*args)
+ n_rows = _get_shape(args[0])[0]
+ mask = np.zeros(n_rows, dtype=bool)
+ for data in args:
+ if isinstance(data, np.ndarray):
+ if data.ndim == 1:
+ data = data.reshape(-1,1)
+ mask |= np.isnan(data).any(axis=1)
+ elif isinstance(data, pd.DataFrame):
+ mask |= data.isna().to_numpy().any(axis=1)
+ elif isinstance(data, pd.Series):
+ mask |= data.isna().to_numpy()
+ else:
+ data = np.array(data)
+ if data.ndim == 1:
+ data = data.reshape(-1,1)
+ mask |= np.isnan(data).any(axis=1)
+ return mask
+
+
+def _to_series(data: Union[pd.Series, pd.DataFrame, np.ndarray]):
+ """
+    Converts a single-column DataFrame, a 1-D or single-column ndarray, or a Series to a pandas Series.
+    Args:
+        data: The object to convert.
+    Returns:
+        A pandas Series with the same values.
+    Raises:
+        ValueError: If the input has more than one column.
+ """
+ if isinstance(data, pd.Series):
+ return data
+ elif isinstance(data, pd.DataFrame):
+ if data.shape[1] != 1:
+ raise ValueError("DataFrame must have exactly one column to convert to Series")
+ return data.iloc[:, 0]
+ elif isinstance(data, np.ndarray):
+ if data.ndim == 1:
+ return pd.Series(data)
+ elif data.ndim == 2 and data.shape[1] == 1:
+ return pd.Series(data.ravel())
+ else:
+ raise ValueError("ndarray must be 1-dimensional or a 2-dimensional single column array")
+
+
+def _is_convertible_to_numpy_array(obj: Any) -> bool:
+ """
+    Checks whether a given object can be converted to a numpy array without error.
+ Args:
+ obj: object to check
+ Returns:
+ bool
+ Raises:
+ Nothing
+ """
+ if isinstance(obj, (list, tuple, dict, set)):
+ return True
+ if np.isscalar(obj):
+ return True
+ if hasattr(obj, '__array__'):
+ return True
+ return False
+
+
+def split_rows(data: Union[pd.DataFrame, np.ndarray], bool_array: np.ndarray, drop_index: bool = True) -> list[Union[pd.DataFrame, np.ndarray]]:
+ """
+ Splits a pandas DataFrame or a numpy array based on a boolean array indicator.
+ Args:
+ data : The input data to split. Can be a pandas DataFrame or a numpy ndarray.
+ bool_array : A 1D boolean array indicating where splits should occur. Must be the same length as `data`.
+ drop_index : Whether to reset the index in the resulting DataFrame splits. Default is True.
+ Returns:
+        A list of the resulting pd.DataFrame or np.ndarray segments.
+    Raises:
+ ValueError: If the length of `bool_array` does not match the length of `data`.
+ """
+ if len(bool_array) != len(data):
+ raise ValueError("The length of bool_array must match the length of data.")
+
+ indices = np.where(bool_array)[0]
+ indices = np.concatenate(([0], indices, [len(data)]))
+
+ if isinstance(data, pd.DataFrame):
+ return [data.iloc[start:end].reset_index(drop=drop_index)
+ for start, end in zip(indices[:-1], indices[1:])
+ if start != end]
+ elif isinstance(data, np.ndarray):
+ return [data[start:end]
+ for start, end in zip(indices[:-1], indices[1:])
+ if start != end]
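+
+
+if __name__ == "__main__":
+    # Minimal illustrative sketch: split an array at the positions marked True.
+    arr = np.arange(10)
+    marks = np.zeros(10, dtype=bool)
+    marks[[3, 7]] = True
+    print(split_rows(arr, marks))  # [array([0, 1, 2]), array([3, 4, 5, 6]), array([7, 8, 9])]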
diff --git a/notebooks/demo_ppscore.ipynb b/notebooks/demo_ppscore.ipynb
new file mode 100644
index 0000000..cc7ab04
--- /dev/null
+++ b/notebooks/demo_ppscore.ipynb
@@ -0,0 +1,818 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "c:\\Users\\Edouard\\Documents\\Git\\microwave\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "c:\\Users\\Edouard\\Documents\\Git\\microwave\\.venv_microwave\\Lib\\site-packages\\IPython\\core\\magics\\osm.py:417: UserWarning: This is now an optional IPython functionality, setting dhist requires you to install the `pickleshare` library.\n",
+ " self.shell.db['dhist'] = compress_dhist(dhist)[-100:]\n"
+ ]
+ }
+ ],
+ "source": [
+ "%cd ..\n",
+ "import microwave.data_analysis.ppscore as pps\n",
+ "from sklearn.metrics import mean_squared_error\n",
+ "from sklearn.tree import DecisionTreeRegressor\n",
+ "import seaborn as sns\n",
+ "import pandas as pd\n",
+ "import numpy as np"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "