Compare commits
1 commit
timing_fun ... main

| Author | SHA1 | Date |
|---|---|---|
|  | 29936cb347 |  |
3
microwave/__init__.py
Normal file
@@ -0,0 +1,3 @@
from . import data_analysis
from . import utils
from . import math
2
microwave/data_analysis/__init__.py
Normal file
@@ -0,0 +1,2 @@
from . import ppscore
from . import univariate
1
microwave/data_analysis/ppscore/__init__.py
Normal file
@@ -0,0 +1 @@
from .ppscore import *
276
microwave/data_analysis/ppscore/ppscore.py
Normal file
@@ -0,0 +1,276 @@
from typing import Union, Callable, Optional, Any

from sklearn.metrics import make_scorer
from sklearn.tree import DecisionTreeRegressor
from sklearn.base import is_classifier, is_regressor
from sklearn.model_selection import cross_val_score
from joblib import Parallel, delayed

from ...utils import _verify_tabular_data_shape, sample_rows, nan_rows_mask, _sample, _to_series

import pandas as pd
import numpy as np


def _identify_case(model) -> str:
    """
    Identifies whether the given model is a classifier or a regressor.

    Args:
        model: Must be sklearn-compatible and either a regressor or a classifier.

    Returns:
        "classification" or "regression"

    Raises:
        ValueError: If the model cannot be determined to be either a classifier or a regressor.
    """
    if is_classifier(model):
        return "classification"
    elif is_regressor(model):
        return "regression"
    else:
        raise ValueError("The model cannot be determined to be either a classifier or a regressor")


def _get_baseline_score(y: Union[np.ndarray, pd.DataFrame], case: str, metric: Callable) -> float:
    """
    Calculates the expected metric result of a naive model against y.

    Args:
        y: shape[n, 1]; True values.
        case: "classification" or "regression"
        metric: Metric used to score the prediction. Must take in y_true, y_pred.

    Returns:
        A baseline score according to the metric: the score of a model predicting the median
        value for a regression, or the most frequent value for a classification.

    Raises:
        Nothing
    """
    y = _sample(y, ~nan_rows_mask(y))
    if case == "regression":
        base = np.full_like(y, np.median(y))
    elif case == "classification":
        values, counts = np.unique(y, return_counts=True)
        ind = np.argmax(counts)
        base = np.full_like(y, values[ind])
    return metric(y, base)


def _prepare_df(x: Any, y: Any, metric: Callable, model) -> pd.DataFrame:
    """
    Calculates the base information depending on the model, metric, and true values.

    Args:
        x: shape[n, m]; Predictors (features).
        y: shape[n, 1]; True values (targets).
        metric: Metric used to score the prediction. Must take in y_true, y_pred.
        model: Model to use. Must be sklearn-compatible and either a regressor or a classifier.

    Returns:
        A DataFrame containing the pps information, including:
        - ppscore: Placeholder for the predictive power score, initialized to NaN for each feature.
        - case: The kind of problem ("classification" or "regression").
        - metric: The name of the metric used.
        - perfect_score: The score when the model's predictions are perfect.
        - naive_score: The score of a naive model predicting the most frequent value (classification) or the median value (regression).
        - model_score: Placeholder for the model score, initialized to NaN.
        - model: The type of the model.

    Raises:
        Nothing
    """
    y = _sample(y, ~nan_rows_mask(y))
    case = _identify_case(model)
    baseline_score = _get_baseline_score(y, case, metric)
    perfect_score = metric(y, y)
    return pd.DataFrame({
        "ppscore": [np.nan] * (x.shape[1] if len(x.shape) > 1 else 1),
        "case": case,
        "metric": metric.__name__,
        "perfect_score": perfect_score,
        "naive_score": baseline_score,
        "model_score": np.nan,
        "model": type(model).__name__
    })


def _score(x: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.DataFrame], metric: Callable, model, metric_params: Optional[dict] = {}, crossvals: int = 5):
    """
    Returns the cross-validated score of the model according to the given metric.

    Args:
        x: shape[n, m]; Predictors (features).
        y: shape[n, 1]; True values (targets).
        metric: Metric used to score the prediction. Must take in y_true, y_pred.
        model: Model to use. Defaults to DecisionTreeRegressor. Must be sklearn-compatible.
        metric_params (optional): Additional parameters to pass to the metric function.
        crossvals (optional) [default=5]: Number of cross-validations to perform.

    Returns:
        The evaluation metric on the predictions of the model.

    Raises:
        Nothing
    """
    if model is None:
        model = DecisionTreeRegressor()
    nan_mask = nan_rows_mask(x, y)
    scores = cross_val_score(
        model,
        np.array(_sample(x, ~nan_mask)).reshape(-1, 1),
        np.array(_sample(y, ~nan_mask)).reshape(-1, 1),
        cv=crossvals,
        scoring=make_scorer(metric, **metric_params)
    )
    return scores.mean()


def _calc_ppscore(score: Union[int, float, np.ndarray, pd.Series],
                  naive_score: Union[int, float, np.ndarray, pd.Series],
                  perfect_score: Union[int, float, np.ndarray, pd.Series]) -> Union[float, np.ndarray, pd.Series]:
    """
    Calculates the predictive power score (pps) for given scores, naive scores, and perfect scores.

    Args:
        score: The actual score(s).
        naive_score: The naive score(s).
        perfect_score: The perfect score(s).

    Returns:
        The predictive power score(s).
    """
    # Remember the index before converting, so that a Series input yields a Series output.
    index = score.index if isinstance(score, pd.Series) else None
    score = np.asarray(score)
    naive_score = np.asarray(naive_score)
    perfect_score = np.asarray(perfect_score)
    pps = (score - naive_score) / (perfect_score - naive_score)
    pps = np.where(pps <= 0, 0, pps)
    if index is not None:
        return pd.Series(pps, index=index)
    return pps


def score(x: Any, y: Any, metric: Callable, model: Optional[object] = None, sample: Optional[int] = None, shuffle: bool = True, crossvals: int = 5) -> pd.DataFrame:
    """
    Calculates the predictive power score (pps) of x against y using a given model. The score is
    baselined between 0 and 1 depending on the kind of problem.

    Args:
        x: shape[n, 1] or [n,]; Predictors (features).
        y: shape[n, 1] or [n,]; True values (targets).
        metric: Metric used to score the prediction. Must take in y_true, y_pred, both array-likes.
        model (optional) [default=DecisionTreeRegressor]: Model to use. Must be sklearn-compatible and either a regressor or a classifier.
        sample (optional) [default=None]: Number of rows of x and y to sample to make calculations faster. None means no sampling.
        shuffle (optional) [default=True]: Whether to shuffle the rows of x and y.
        crossvals (optional) [default=5]: Number of cross-validations to perform when fitting and evaluating the model.

    Returns:
        The pps of x against y. The pps is (score - baseline) / (perfect_score - baseline), where the baseline is a naive model predicting the median for regression or the most common class for classification.

    Raises:
        Nothing
    """
    _verify_tabular_data_shape(x, y, is_column=True)
    x, y = _to_series(x), _to_series(y)
    x, y = sample_rows(x, y, sample=sample, shuffle=shuffle)
    if model is None:
        model = DecisionTreeRegressor()
    res_df = _prepare_df(x, y, metric, model)
    res_df["model_score"] = _score(x, y, metric, model, crossvals=crossvals)
    res_df["ppscore"] = _calc_ppscore(res_df["model_score"], res_df["naive_score"], res_df["perfect_score"])
    return res_df


def predictors(df: pd.DataFrame, y: Any, metric: Callable, model=None, crossvals: int = 5, njobs=1, sample: int = 5000, sort=True, shuffle=True):
    """
    Calculates the predictive power score (pps) of every column in df against y using a given model.
    The score is baselined between 0 and 1 depending on the kind of problem.

    Args:
        df: shape[n, m]; Predictors (features).
        y: shape[n, 1]; True values (targets).
        metric: Metric used to score the prediction. Must take in y_true, y_pred.
        model (optional) [default=DecisionTreeRegressor]: Model to use. Must be sklearn-compatible and either a regressor or a classifier.
        crossvals (optional) [default=5]: Number of cross-validations to perform.
        njobs (optional) [default=1]: Number of jobs to run in parallel.
        sample (optional) [default=5000]: Number of rows to sample from df and y for faster calculations. None means no sampling.
        sort (optional) [default=True]: Whether to sort the results by ppscore in descending order.
        shuffle (optional) [default=True]: Whether to shuffle the rows of df and y before processing.

    Returns:
        A DataFrame containing the pps of each column in df against y. The pps is (score - baseline) / (perfect_score - baseline), where the baseline is a naive model predicting the median for regression or the most common class for classification.

    Raises:
        Nothing
    """
    _verify_tabular_data_shape(df, is_column=False)
    _verify_tabular_data_shape(y, is_column=True)
    y = _to_series(y)
    df, y = sample_rows(df, y, sample=sample, shuffle=shuffle)
    if model is None:
        model = DecisionTreeRegressor()
    res_df = _prepare_df(df, y, metric, model)
    # Pass crossvals through so the argument is not silently ignored.
    res_df["model_score"] = Parallel(n_jobs=njobs)(delayed(_score)(df.iloc[:, [i]], y, metric, model, crossvals=crossvals) for i in range(df.shape[1]))
    res_df["ppscore"] = _calc_ppscore(res_df["model_score"], res_df["naive_score"], res_df["perfect_score"])
    res_df.insert(0, "x", df.columns)
    if sort:
        res_df = res_df.sort_values("ppscore", ascending=False)
    return res_df.reset_index(drop=True)


def _predictors_of_col(df: pd.DataFrame, col: str, metric: Callable, model, **kwargs) -> pd.DataFrame:
    """
    Calculates the predictive power score (pps) of all columns in df against the specified column using a given model and metric.

    Args:
        df: Input DataFrame containing predictors and the target column.
        col: Target column name.
        metric: Metric used to score the prediction. Must take in y_true, y_pred.
        model: Model to use. Must be sklearn-compatible and either a regressor or a classifier.

    Returns:
        A DataFrame containing the pps of each predictor in df against the specified target column. The pps is (score - baseline) / (perfect_score - baseline), where the baseline is a naive model predicting the median for regression or the most common class for classification.

    Raises:
        Nothing
    """
    y = df[[col]]
    df_pred = df.loc[:, df.columns != col]
    res = predictors(df_pred, y, metric, model=model, **kwargs)
    res.insert(1, "y", col)
    return res


def matrix(df: pd.DataFrame, metric: Callable, model=None, crossvals: int = 5, njobs: int = 1, sample: int = 5000, sort: bool = True, shuffle: bool = True) -> pd.DataFrame:
    """
    Calculates the predictive power score (pps) of every column in df against every other column in df using a given model.
    Scores are baselined between 0 and 1 depending on the nature of the problem.

    Args:
        df: shape[n, m]
        metric: Metric used to score the prediction. Must take in y_true, y_pred.
        model (optional) [default=DecisionTreeRegressor]: Model to use. Must be sklearn-compatible and either a regressor or a classifier.
        crossvals (optional) [default=5]: Number of cross-validations to perform.
        njobs (optional) [default=1]: Number of jobs to run in parallel.
        sample (optional) [default=5000]: Number of rows to sample from df for faster calculations. None means no sampling.
        sort (optional) [default=True]: Whether to sort the results by ppscore in descending order.
        shuffle (optional) [default=True]: Whether to shuffle the rows of df before processing.

    Returns:
        A DataFrame containing the pps of each predictor in df against every target column.
        The pps is (score - baseline) / (perfect_score - baseline), where the baseline is a naive model predicting the median for regression or the most common class for classification.

    Raises:
        Nothing
    """
    _verify_tabular_data_shape(df, is_column=False)
    df = sample_rows(df, sample=sample, shuffle=shuffle)[0]
    if model is None:
        model = DecisionTreeRegressor()
    res = Parallel(n_jobs=njobs)(delayed(_predictors_of_col)(df, col, metric, model, crossvals=crossvals, sample=None, sort=False, shuffle=False) for col in df.columns)
    res_df = pd.concat(res, axis=0)
    if sort:
        res_df = res_df.sort_values("ppscore", ascending=False)
    return res_df.reset_index(drop=True)


def mutual_predictors(matrix: pd.DataFrame, threshold: float = 0.9):
    """
    Returns the list of features that are most predicted by the others, above a given threshold. Intended for dimensionality reduction.

    For every feature, every pps above the threshold is summed, the most predicted feature is
    identified, and it is ignored in subsequent iterations.
    Continues until no considered pps is over the threshold.

    Args:
        matrix: A DataFrame containing the pps of each feature against every other.
        threshold (optional) [default=0.9]

    Returns:
        A list of the features that are the most predicted by others.

    Raises:
        Nothing
    """
    features = matrix["x"].unique()
    pred_mut = []
    while True:
        # Recompute the counts on each pass so scores are not double-counted across iterations.
        cols_predict_count = {feature: 0 for feature in features if feature not in pred_mut}
        for _, predict in matrix.iterrows():
            if predict.y in cols_predict_count and predict.ppscore > threshold:
                cols_predict_count[predict.y] += predict.ppscore
        if sum(cols_predict_count.values()) == 0:
            break
        # The feature whose incoming pps sum is highest is the "most predicted" one.
        most_predicted = max(cols_predict_count, key=cols_predict_count.get)
        pred_mut.append(most_predicted)
        matrix = matrix[matrix["x"] != most_predicted]
        matrix = matrix[matrix["y"] != most_predicted]
    return pred_mut
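The docstrings above fix the pps contract: a metric taking y_true, y_pred, rescaled as (score - baseline) / (perfect_score - baseline). A minimal usage sketch, assuming the package is importable and using sklearn's mean_absolute_error (with an error metric, the pps reduces to 1 - model_error / naive_error); the data and column names are made up for illustration:

```python
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from microwave.data_analysis import ppscore as pps

rng = np.random.default_rng(0)
df = pd.DataFrame({"a": rng.normal(size=200)})
df["b"] = 2 * df["a"] + rng.normal(scale=0.1, size=200)  # "b" is nearly a function of "a"
df["noise"] = rng.normal(size=200)

# pps of a single column against a target: expect a value close to 1
print(pps.score(df[["a"]], df[["b"]], mean_absolute_error))

# pps of every column of a frame against the same target: "noise" should land near 0
print(pps.predictors(df[["a", "noise"]], df[["b"]], mean_absolute_error))
```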
1
microwave/data_analysis/univariate/__init__.py
Normal file
@@ -0,0 +1 @@
from .aggregates import *
236
microwave/data_analysis/univariate/aggregates.py
Normal file
@@ -0,0 +1,236 @@
from typing import Any, Union, Optional, Callable
from joblib import Parallel, delayed
import numpy as np
import pandas as pd
import scipy.stats as stats


def inter_percentile_mean(x: np.ndarray, N1: Optional[float] = 0.1, N2: Optional[float] = 0.9) -> float:
    """
    Calculates the mean within a certain percentile range.

    Args:
        x: The considered ndarray.
        N1: Lower percentile (between 0 and 1)
        N2: Upper percentile (between 0 and 1)

    Returns:
        The IP-mean

    Raises:
        Nothing
    """
    # N1 and N2 are fractions, while np.percentile expects values in [0, 100].
    p1, p2 = np.percentile(x[~np.isnan(x)], [N1 * 100, N2 * 100])
    return np.mean(x[(x >= p1) & (x <= p2)])


def median_absolute_deviation(x: np.ndarray) -> float:
    """
    Calculates the median of the deviations from the median.

    Args:
        x: The considered ndarray.

    Returns:
        The MAD

    Raises:
        Nothing
    """
    return np.nanmedian(np.abs(x - np.nanmedian(x)))


def inter_percentile_range(x: np.ndarray, N1: Optional[float] = 0.1, N2: Optional[float] = 0.9) -> float:
    """
    Calculates the range within a certain percentile range.

    Args:
        x: The considered ndarray.
        N1: Lower percentile (between 0 and 1)
        N2: Upper percentile (between 0 and 1)

    Returns:
        The IP-range

    Raises:
        Nothing
    """
    # N1 and N2 are fractions, while np.percentile expects values in [0, 100].
    return np.percentile(x[~np.isnan(x)], N2 * 100) - np.percentile(x[~np.isnan(x)], N1 * 100)


def mode(x: np.ndarray) -> Any:
    """
    Calculates the mode of numeric and categorical variables.

    Args:
        x: The considered ndarray.

    Returns:
        The mode

    Raises:
        Nothing
    """
    if pd.api.types.is_numeric_dtype(x):
        return stats.mode(x, nan_policy='omit').mode
    else:
        return pd.Series(x).mode().iat[0]


def geothmetic_meandian(x: np.ndarray, iter: Optional[int] = 100) -> float:
    """
    https://xkcd.com/2435/

    Args:
        x: The considered ndarray.
        iter: Number of recursive iterations to perform.

    Returns:
        The geothmetic meandian

    Raises:
        Nothing
    """
    if iter == 0:
        return x[0]
    return geothmetic_meandian(np.array([AGGFUNCCODES["mean"](x), AGGFUNCCODES["gmean"](x), AGGFUNCCODES["median"](x)]), iter=iter - 1)


def get_n_outliers(x: np.ndarray, n_sig: Union[float, int] = 3) -> int:
    """
    Calculates the number of outliers with the z-score method.

    Args:
        x: The considered ndarray.
        n_sig: Number of standard deviations beyond which a value is considered an outlier.

    Returns:
        The number of outliers

    Raises:
        Nothing
    """
    return np.sum(np.abs(stats.zscore(x, nan_policy='omit')) > n_sig)


AGGFUNCCODES = {
    # Counts
    "size": len,
    "non-null": lambda x: len(x) - pd.isna(x).sum(),
    "nunique": lambda x: pd.Series(x).nunique(dropna=True),

    # Basic
    "sum": lambda x: np.nansum(x) if np.issubdtype((x.to_numpy() if isinstance(x, pd.Series) else x).dtype, np.number) else np.nan,
    "min": np.nanmin,
    "max": np.nanmax,
    "first": lambda x: x[~pd.isna(x)][0],
    "last": lambda x: x[~pd.isna(x)][-1],

    # Centricity
    "mean": np.nanmean,
    "median": np.nanmedian,
    "mode": mode,
    "gmean": lambda x: stats.gmean(x[~pd.isna(x)]),
    "hmean": lambda x: stats.hmean(x[~pd.isna(x)]),
    "Pmean": inter_percentile_mean,
    "geothmetic meandian": geothmetic_meandian,

    # Dispersion
    "variance": np.nanvar,
    "std": np.nanstd,
    "mad": median_absolute_deviation,
    "skewness": lambda x: stats.skew(x, nan_policy='omit'),
    "excesskurtosis": lambda x: stats.kurtosis(x, fisher=False, nan_policy='omit'),
    "range": lambda x: np.nanmax(x) - np.nanmin(x),
    "Prange": inter_percentile_range,
    "n_outliers": get_n_outliers,

    # Percentiles
    "P75": lambda x: np.percentile(x[~pd.isna(x)], 75),
    "P25": lambda x: np.percentile(x[~pd.isna(x)], 25),
    "P10": lambda x: np.percentile(x[~pd.isna(x)], 10),
    "P90": lambda x: np.percentile(x[~pd.isna(x)], 90),
    "PN": lambda x, N: np.percentile(x[~pd.isna(x)], N),

    # Distribution
    "skewtest": lambda x, **kwargs: stats.skewtest(x, nan_policy='omit', **kwargs),
    "kurtosistest": lambda x, **kwargs: stats.kurtosistest(x, nan_policy='omit', **kwargs),
    "normaltest": lambda x, **kwargs: stats.normaltest(x, nan_policy='omit', **kwargs),
    "jarque_bera": lambda x, **kwargs: stats.jarque_bera(x, nan_policy='omit', **kwargs),
    "shapiro": lambda x, **kwargs: stats.shapiro(x, nan_policy='omit', **kwargs),
    "anderson": lambda x, **kwargs: stats.anderson(x, **kwargs),

    # Other
    "energy": lambda x: np.nansum(x**2),
    "rms": lambda x: np.sqrt(np.nanmean(x**2)),
    "entropy": lambda x: stats.entropy(pd.Series(x).value_counts(normalize=True), base=2),
    "autocorrelation": lambda x, lag=1: pd.Series(x).autocorr(lag=lag)
}


def execute_agg_func(x: Any, agg: Union[Callable, str, dict]) -> Any:
    """
    Executes a given aggregation function on the given data.

    If the function returns multiple values, returns a dict with a name for each value, defaulting to a, b, c, ...

    Accepts a callable, a str corresponding to an AGGFUNCCODES key, or a dict with either a callable or an AGGFUNCCODES key at key 'func'.

    Args:
        x: The data to execute the function on.
        agg: The function to execute.

    Returns:
        Either the raw return value if it is a single value, or a dict of named return values if the function returned a tuple. The values can be renamed by passing a dict with key 'ret_names'.

    Raises:
        Nothing
    """
    ret_names = None
    if callable(agg):
        ret = agg(x)
    elif isinstance(agg, str):
        ret = AGGFUNCCODES[agg](x)
    elif isinstance(agg, dict):
        kwargs = {} if 'kwargs' not in agg else agg['kwargs']
        if callable(agg['func']):
            ret = agg['func'](x, **kwargs)
        elif isinstance(agg['func'], str):
            ret = AGGFUNCCODES[agg['func']](x, **kwargs)
        if 'ret_names' in agg:
            ret_names = agg['ret_names']
    if isinstance(ret, tuple):
        if ret_names is None:
            ret_names = [chr(c) for c in range(ord('a'), ord('a') + len(ret))]
        return dict(zip(ret_names, ret))
    else:
        return ret


def calculate_aggregates(x: Any, aggs: list[Union[Callable, str, dict[str, Union[str, Callable]]]]) -> dict[str, Any]:
    """
    Executes a given list of aggregation functions on the given data.

    Args:
        x: The data to execute the functions on.
        aggs: The functions to execute.

    Returns:
        A dict containing named values. The names can be specified in each agg function by setting 'name' or 'ret_names' in the dict.

    Raises:
        Nothing
    """
    results = {}
    for i, func in enumerate(aggs):
        funcname = f"func_{i}"
        try:
            ret = execute_agg_func(x, func)
        except Exception as e:
            print(f"agg_{i}", func, e)
            ret = np.nan
        if isinstance(func, dict):
            if 'name' in func:
                funcname = func['name']
            elif isinstance(func['func'], str):
                funcname = func['func']
        elif isinstance(func, str):
            funcname = func if func not in results.keys() else f"{func}_{i}"
        if isinstance(ret, dict):
            results.update(dict(zip([f"{funcname}_{x}" for x in ret.keys()], ret.values())))
        else:
            results[funcname] = ret
    return results


def build_univariate_statistics(df: pd.DataFrame, agg: Optional[Union[str, list[Union[str, dict[str, dict]]]]] = "all", n_jobs: int = 1) -> pd.DataFrame:
    """
    Calculates the specified univariate statistics for each column in the DataFrame.

    Args:
        df: The input DataFrame.
        agg: List of aggregation functions to apply.
            Each element can be a function name (str) or a dict with the function name as the key and args as another dict.
        n_jobs: Number of parallel processes to open. -1 means as many as possible.

    Returns:
        pd.DataFrame: DataFrame with one row per column of df and one column per aggregate.

    Raises:
        Nothing
    """
    if agg == "all":
        agg = list(AGGFUNCCODES.keys())
    results = Parallel(n_jobs=n_jobs)(delayed(calculate_aggregates)(df[col].values, agg) for col in df.columns)
    return pd.DataFrame(results, index=df.columns)
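calculate_aggregates and build_univariate_statistics accept a mix of spec formats: plain AGGFUNCCODES keys, parameterized dicts, and named callables. A short sketch, assuming the package is importable; the column names and the custom "span" aggregate are made up for illustration:

```python
import numpy as np
import pandas as pd
from microwave.data_analysis.univariate import build_univariate_statistics

df = pd.DataFrame({
    "x": [1.0, 2.0, np.nan, 4.0, 100.0],
    "y": [0.5, 0.5, 0.7, 0.9, 1.1],
})

aggs = [
    "mean",                                                        # plain AGGFUNCCODES key
    "median",
    {"func": "PN", "kwargs": {"N": 95}, "name": "P95"},            # parameterized code with a custom name
    {"func": lambda v: float(np.nanmax(v) - np.nanmin(v)), "name": "span"},  # named custom callable
]
stats_df = build_univariate_statistics(df, agg=aggs)
print(stats_df)  # one row per column of df, one column per aggregate
```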
2
microwave/data_processing/__init__.py
Normal file
@@ -0,0 +1,2 @@
from .df_preprocessing import *
from .dfTransformer import dfTransformer
75
microwave/data_processing/dfTransformer.py
Normal file
@@ -0,0 +1,75 @@
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from typing import Optional


class dfTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.transforms = []

    def add_transform(self, column_name: Optional[str], transformer: TransformerMixin, result_columns: Optional[list[str]] = None):
        """
        Adds a transform specific to a column, with optional result column names.

        Args:
            column_name (str): The name of the column to transform. For a nested dfTransformer, use None.
            transformer (TransformerMixin): The transformation object to apply to the column. Must have a .transform method.
            result_columns (list of str, optional): List of names for the resulting columns. Default is None.
        """
        if not hasattr(transformer, 'transform'):
            raise ValueError("The transformer must have a 'transform' method.")
        self.transforms.append((column_name, transformer, result_columns))

    def fit(self, X: pd.DataFrame, y=None):
        """
        Fit method to conform with TransformerMixin. Fits the transformers one by one on their specified columns.

        Args:
            X (pd.DataFrame): The DataFrame to fit.
            y: Ignored.

        Returns:
            self: Fitted transformer.
        """
        for column_name, transformer, _ in self.transforms:
            if isinstance(transformer, dfTransformer):
                transformer.fit(X, y)
            elif column_name in X:
                transformer.fit(X[[column_name]], y)  # Fit the transformer on the specific column
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Applies all stored transformations to the DataFrame, drops the original columns,
        and returns the transformed DataFrame.

        Args:
            X (pd.DataFrame): The DataFrame to transform.

        Returns:
            pd.DataFrame: The transformed DataFrame.
        """
        X_transformed = X.copy()

        for column_name, transformer, result_columns in self.transforms:
            if isinstance(transformer, dfTransformer):
                X_transformed = transformer.transform(X_transformed)
            # elif, to mirror fit(): a nested dfTransformer has no column of its own.
            elif column_name in X_transformed:
                transformed_data = transformer.transform(X_transformed[[column_name]])

                # Check if the transformed data is a DataFrame; if not, convert it
                if isinstance(transformed_data, pd.DataFrame):
                    transformed_cols = transformed_data
                else:
                    transformed_cols = pd.DataFrame(transformed_data, index=X_transformed.index)
                if result_columns:
                    transformed_cols.columns = result_columns
                else:
                    transformed_cols.columns = [
                        f"{column_name}_transformed_{i}" for i in range(transformed_data.shape[1])
                    ]

                X_transformed.drop(columns=[column_name], inplace=True)
                X_transformed = pd.concat([X_transformed, transformed_cols], axis=1)

        return X_transformed
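A minimal sketch of driving dfTransformer by hand, assuming the package is importable; the column names and the StandardScaler choice are illustrative:

```python
import pandas as pd
from sklearn.preprocessing import StandardScaler
from microwave.data_processing import dfTransformer

df = pd.DataFrame({"height": [1.6, 1.7, 1.9], "city": ["a", "b", "a"]})

t = dfTransformer()
# Scale one column and give the result an explicit name.
t.add_transform("height", StandardScaler(), result_columns=["height_scaled"])
t.fit(df)
print(t.transform(df))  # "height" is dropped and replaced by "height_scaled"; "city" is untouched
```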
75
microwave/data_processing/df_preprocessing.py
Normal file
@@ -0,0 +1,75 @@
import pandas as pd
from typing import Optional
from .dfTransformer import dfTransformer
from sklearn.base import clone
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder, FunctionTransformer


def _get_encoder(encoding):
    if encoding == 'onehot':
        return OneHotEncoder(sparse_output=False)
    elif encoding == 'label':
        return LabelEncoder()
    elif encoding == 'ordinal':
        return OrdinalEncoder()
    else:
        raise ValueError("Unsupported encoding type.")


def _get_encoded_col_names(encoder, prefix="", suffix="", ret_shape: Optional[list] = None):
    # Default to None instead of a mutable default, which would be shared (and mutated) across calls.
    if ret_shape is None:
        ret_shape = [1]
    if len(ret_shape) == 1:
        ret_shape = ret_shape + [1]
    if hasattr(encoder, "categories_"):
        colnames = ["_".join([str(x) for x in (prefix, suffix, cat) if len(str(x)) > 0]) for cat in encoder.categories_[0]]
    elif ret_shape[1] == 1:
        colnames = ["_".join([str(x) for x in (prefix, suffix) if len(str(x)) > 0])]
    elif ret_shape[1] > 1:
        colnames = [
            "_".join([str(x) for x in (prefix, suffix, i) if len(str(x)) > 0]) for i in range(ret_shape[1])
        ]
    return colnames


def df_to_numeric(df, encoding='onehot'):
    """
    Processes a DataFrame by converting numeric columns to float and applying categorical encoding to non-numeric columns.

    Args:
        df (pd.DataFrame): The input DataFrame to process.
        encoding (str or TransformerMixin): The encoding method to apply to categorical columns. Can be a string specifying predefined options from sklearn or a transformer instance.

    Returns:
        tuple: A tuple containing the transformed DataFrame and the fitted dfTransformer object.
    """
    transformer = dfTransformer()
    X_transformed = df.copy()

    if isinstance(encoding, str):
        encoder = _get_encoder(encoding)
    elif hasattr(encoding, "fit_transform"):
        # Transformer instances are generally not callable, so test for the estimator API instead.
        encoder = encoding
    else:
        raise ValueError("Encoding must be either a string or a transformer with a 'fit_transform' method.")

    for column in X_transformed.columns:
        if not pd.api.types.is_numeric_dtype(df[column]):
            # Clone so each categorical column keeps its own fitted encoder instead of sharing one instance.
            col_encoder = clone(encoder)
            transformed_data = col_encoder.fit_transform(X_transformed[[column]])
            result_columns = _get_encoded_col_names(col_encoder, prefix=column, suffix="encoded", ret_shape=list(transformed_data.shape))
            if isinstance(transformed_data, pd.DataFrame):
                transformed_cols = transformed_data
            else:
                transformed_cols = pd.DataFrame(transformed_data, index=X_transformed.index)
            transformed_cols.columns = result_columns

            transformer.add_transform(column, col_encoder, result_columns=result_columns)

            X_transformed.drop(columns=[column], inplace=True)
            X_transformed = pd.concat([X_transformed, transformed_cols], axis=1)

    encoder = FunctionTransformer(lambda x: x.astype(float), validate=False)
    for column in X_transformed.columns:
        transformed_data = encoder.fit_transform(X_transformed[[column]])
        transformer.add_transform(column, encoder, result_columns=[column])  # record the real column name, not the literal "column"

        X_transformed.drop(columns=[column], inplace=True)
        X_transformed = pd.concat([X_transformed, transformed_data], axis=1)

    return X_transformed, transformer
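Because df_to_numeric returns the fitted dfTransformer alongside the converted frame, the same encoding can be replayed on new rows with the same schema. A sketch with made-up data, assuming the package is importable:

```python
import pandas as pd
from microwave.data_processing import df_to_numeric

train = pd.DataFrame({"n": [1, 2, 3], "cat": ["x", "y", "x"]})
train_num, fitted = df_to_numeric(train, encoding="onehot")

# Replay the fitted pipeline on unseen rows; the encoded layout matches train_num.
new = pd.DataFrame({"n": [4], "cat": ["y"]})
print(fitted.transform(new))
```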
39
microwave/math/__init__.py
Normal file
@@ -0,0 +1,39 @@
import math
from typing import Optional, Union
import numpy as np


def gaussian(x: Union[np.ndarray, float], mu: float, sig: float) -> Union[np.ndarray, float]:
    """
    Calculates the height of the specified gaussian at point x.

    Args:
        x: point(s) at which to calculate the height
        mu: The gaussian's mean
        sig: The gaussian's standard deviation

    Returns:
        The height(s): a single number, or an ndarray if x is an ndarray

    Raises:
        Nothing
    """
    return np.exp(-(x - mu)**2 / (2 * sig**2)) / (sig * np.sqrt(2 * np.pi))


def gauss_integral(mu: float, sig: float, a: float = -np.inf, b: float = np.inf) -> float:
    """
    Calculates the definite gaussian integral between a and b. If omitted, a and b default to -inf and inf respectively.

    Args:
        mu: The gaussian's mean
        sig: The gaussian's standard deviation
        a: lower bound (default -inf)
        b: upper bound (default inf)

    Returns:
        The definite integral

    Raises:
        Nothing
    """
    if sig == 0:
        # Degenerate case: all of the mass sits at mu.
        if mu >= a and mu < b:
            return 1
        else:
            return 0
    return (math.erf((b - mu) / (sig * np.sqrt(2))) - math.erf((a - mu) / (sig * np.sqrt(2)))) / 2
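The closed form in gauss_integral can be sanity-checked numerically against a Riemann sum of gaussian; a quick sketch, assuming the package is importable (the mass within one standard deviation should come out near 0.6827):

```python
import numpy as np
from microwave.math import gaussian, gauss_integral

mu, sig = 0.0, 1.0
xs = np.linspace(-1, 1, 200_001)
dx = xs[1] - xs[0]
riemann = float(np.sum(gaussian(xs, mu, sig)) * dx)  # crude numerical integral over [-1, 1]
print(gauss_integral(mu, sig, -1, 1), riemann)       # both should be close to 0.6827
```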
9
microwave/utils/__init__.py
Normal file
@@ -0,0 +1,9 @@
from .arrayutils import _get_shape
from .arrayutils import _verify_tabular_data_shape
from .arrayutils import _verify_same_number_of_rows
from .arrayutils import _sample
from .arrayutils import sample_rows
from .arrayutils import nan_rows_mask
from .arrayutils import _to_series
from .arrayutils import _is_convertible_to_numpy_array
from .arrayutils import split_rows
224
microwave/utils/arrayutils.py
Normal file
@@ -0,0 +1,224 @@
from typing import Any, Union, Optional
import numpy as np
import pandas as pd


def _get_shape(x: Any) -> tuple:
    """
    Returns the shape of a given object.

    Args:
        x

    Returns:
        shape of x

    Raises:
        Nothing
    """
    if hasattr(x, "shape"):
        return x.shape
    elif _is_convertible_to_numpy_array(x):
        return np.array(x).shape
    else:
        return None


def _verify_tabular_data_shape(*args: Any, is_column: bool = False):
    """
    Verifies that the shapes of the given objects are coherent for tabular data.

    Args:
        *args: shape[n, m] or [n,]; Tabular data.
        is_column (optional) [default=False]: Set to True to raise an error if an object contains multiple columns.

    Returns:
        Nothing

    Raises:
        ValueError: If one or more of the given objects is not coherent with tabular data.
        ValueError: If is_column is True and one or more of the given objects has multiple columns.
    """
    for arg in args:
        shape = _get_shape(arg)
        if shape is None:
            raise ValueError(f"Input data has no shape: {arg}.")
        if len(shape) < 1 or len(shape) > 2:
            raise ValueError(f"Input data must be a tabular object. Has shape {shape}.")
        if is_column and len(shape) == 2 and 1 not in shape:
            raise ValueError(f"Input data must be a single column. Has shape {shape}.")


def _verify_same_number_of_rows(*args):
    """
    Verifies that the given objects have the same number of rows.

    Args:
        *args: shape[n, m] or [n,]

    Returns:
        Nothing

    Raises:
        ValueError: If one or more of the given objects has no shape.
        ValueError: If two objects have different numbers of rows.
    """
    # Check each shape before indexing into it, so shapeless inputs raise a clear error.
    shape = _get_shape(args[0])
    if shape is None:
        raise ValueError(f"Input data has no shape: {args[0]}.")
    n_rows = shape[0]
    for arg in args[1:]:
        shape = _get_shape(arg)
        if shape is None:
            raise ValueError(f"Input data has no shape: {arg}.")
        elem_rows = shape[0]
        if n_rows != elem_rows:
            raise ValueError(f"Input objects must have the same number of rows: {n_rows}, {elem_rows}.")


def _sample(x: Union[np.ndarray, pd.DataFrame, pd.Series], ind_list: Any) -> Union[np.ndarray, pd.DataFrame]:
    """
    Samples the rows of a numpy array or pandas DataFrame based on a list of indices.

    Args:
        x: The array or DataFrame to be sampled.
        ind_list: The list or array of indices that defines the new order of the rows.

    Returns:
        The sampled array or DataFrame.

    Raises:
        TypeError: If the input is neither a numpy array, a pandas DataFrame, nor a pandas Series.
    """
    if isinstance(x, np.ndarray):
        return x[ind_list]
    elif isinstance(x, (pd.DataFrame, pd.Series)):
        return x.iloc[ind_list]
    else:
        raise TypeError("Input must be a numpy array, pandas DataFrame, or pandas Series")


def sample_rows(*args: Any, sample: Optional[int] = None, shuffle: bool = True) -> tuple[Any]:
    """
    Samples the rows of the provided objects in the same way and optionally shuffles them.
    Tries to minimize the number of rows containing NaN.

    Args:
        *args: Input tabular data objects.
        sample: Number of samples to draw from each object. If None, no sampling is done.
        shuffle: If True and sample is None, shuffles the objects.

    Returns:
        Tuple of shuffled and/or sampled objects.

    Raises:
        ValueError: If input objects don't have the same number of rows.
    """
    _verify_same_number_of_rows(*args)
    n_rows = _get_shape(args[0])[0]
    nan_mask = nan_rows_mask(*args)
    full_indices = np.where(~nan_mask)[0]
    n_full_rows = len(full_indices)
    if sample is not None and sample < n_full_rows:
        indices = np.random.choice(full_indices, size=sample, replace=False)
        if not shuffle:
            indices.sort()
    elif sample is not None and sample < n_rows:
        # Not enough NaN-free rows: take them all, then top up with NaN rows.
        indices = np.concatenate((np.random.choice(full_indices, size=n_full_rows, replace=False), np.random.choice(np.where(nan_mask)[0], size=sample - n_full_rows, replace=False)))
        if not shuffle:
            indices.sort()
        else:
            np.random.shuffle(indices)  # otherwise the NaN rows would all sit at the end
    else:
        indices = np.arange(n_rows)
        if shuffle:
            indices = np.random.choice(indices, size=n_rows, replace=False)

    results = tuple(_sample(arg, indices) for arg in args)
    return results


def nan_rows_mask(*args: Any) -> np.ndarray:
    """
    Given a list of 2D numpy arrays or DataFrames with the same number of rows, returns a boolean mask that is True for every row where at least one of the objects has a NaN value.

    Args:
        *args: List of 2D numpy arrays or DataFrames with the same number of rows.

    Returns:
        Boolean mask indicating rows with at least one NaN.

    Raises:
        Nothing
    """
    _verify_same_number_of_rows(*args)
    n_rows = _get_shape(args[0])[0]
    # Initialize the mask with False values
    mask = np.zeros(n_rows, dtype=bool)
    for data in args:
        if isinstance(data, np.ndarray):
            if data.ndim == 1:
                data = data.reshape(-1, 1)
            mask |= np.isnan(data).any(axis=1)
        elif isinstance(data, pd.DataFrame):
            mask |= data.isna().to_numpy().any(axis=1)
        elif isinstance(data, pd.Series):
            mask |= data.isna().to_numpy()
        else:
            data = np.array(data)
            if data.ndim == 1:
                data = data.reshape(-1, 1)
            mask |= np.isnan(data).any(axis=1)
    return mask


def _to_series(data: Union[pd.Series, pd.DataFrame, np.ndarray]):
    """
    Converts a single-column object to a pandas Series.

    Args:
        data: A Series, a single-column DataFrame, or a 1-dimensional or single-column ndarray.

    Returns:
        The data as a pandas Series.

    Raises:
        ValueError: If the input has more than one column.
    """
    if isinstance(data, pd.Series):
        return data
    elif isinstance(data, pd.DataFrame):
        if data.shape[1] != 1:
            raise ValueError("DataFrame must have exactly one column to convert to Series")
        return data.iloc[:, 0]
    elif isinstance(data, np.ndarray):
        if data.ndim == 1:
            return pd.Series(data)
        elif data.ndim == 2 and data.shape[1] == 1:
            return pd.Series(data.ravel())
        else:
            raise ValueError("ndarray must be 1-dimensional or a 2-dimensional single-column array")


def _is_convertible_to_numpy_array(obj: Any) -> bool:
    """
    Verifies that a given object is convertible to a numpy array without error.

    Args:
        obj: object to check

    Returns:
        bool

    Raises:
        Nothing
    """
    if isinstance(obj, (list, tuple, dict, set)):
        return True
    if np.isscalar(obj):
        return True
    if hasattr(obj, '__array__'):
        return True
    return False


def split_rows(data: Union[pd.DataFrame, np.ndarray], bool_array: np.ndarray, drop_index: bool = True) -> list[Union[pd.DataFrame, np.ndarray]]:
    """
    Splits a pandas DataFrame or a numpy array based on a boolean indicator array.

    Args:
        data: The input data to split. Can be a pandas DataFrame or a numpy ndarray.
        bool_array: A 1D boolean array indicating where splits should occur. Must be the same length as `data`.
        drop_index: Whether to reset the index in the resulting DataFrame splits. Default is True.

    Returns:
        A list of the resulting split pd.DataFrames or np.ndarrays.

    Raises:
        ValueError: If the length of `bool_array` does not match the length of `data`.
    """
    if len(bool_array) != len(data):
        raise ValueError("The length of bool_array must match the length of data.")

    indices = np.where(bool_array)[0]
    indices = np.concatenate(([0], indices, [len(data)]))

    if isinstance(data, pd.DataFrame):
        return [data.iloc[start:end].reset_index(drop=drop_index)
                for start, end in zip(indices[:-1], indices[1:])
                if start != end]
    elif isinstance(data, np.ndarray):
        return [data[start:end]
                for start, end in zip(indices[:-1], indices[1:])
                if start != end]
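split_rows starts a new segment at every True flag, as the index arithmetic above implies; a small sketch with made-up data, assuming the package is importable:

```python
import numpy as np
import pandas as pd
from microwave.utils import split_rows

df = pd.DataFrame({"v": [1, 2, 3, 4, 5]})
flags = np.array([False, False, True, False, True])
for part in split_rows(df, flags):
    print(part["v"].tolist())  # prints [1, 2], then [3, 4], then [5]
```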
818
notebooks/demo_ppscore.ipynb
Normal file
File diff suppressed because one or more lines are too long
725
notebooks/demo_processing.ipynb
Normal file
725
notebooks/demo_processing.ipynb
Normal file
@@ -0,0 +1,725 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"import numpy as np\n",
|
||||||
|
"import string"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style scoped>\n",
|
||||||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||||||
|
" vertical-align: middle;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>A</th>\n",
|
||||||
|
" <th>B</th>\n",
|
||||||
|
" <th>C</th>\n",
|
||||||
|
" <th>D</th>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>2</td>\n",
|
||||||
|
" <td>2</td>\n",
|
||||||
|
" <td>2</td>\n",
|
||||||
|
" <td>b</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1</th>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>2</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>c</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>2</th>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>d</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>3</th>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>d</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>4</th>\n",
|
||||||
|
" <td>2</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>2</td>\n",
|
||||||
|
" <td>2</td>\n",
|
||||||
|
" <td>c</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>5</th>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>d</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>6</th>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>2</td>\n",
|
||||||
|
" <td>2</td>\n",
|
||||||
|
" <td>2</td>\n",
|
||||||
|
" <td>a</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>7</th>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>2</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>a</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>8</th>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>a</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>9</th>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>2</td>\n",
|
||||||
|
" <td>2</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>b</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>10</th>\n",
|
||||||
|
" <td>2</td>\n",
|
||||||
|
" <td>2</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>c</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>11</th>\n",
|
||||||
|
" <td>2</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>a</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>12</th>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>2</td>\n",
|
||||||
|
" <td>a</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>13</th>\n",
|
||||||
|
" <td>2</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>c</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>14</th>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>b</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" A B C D 0\n",
|
||||||
|
"0 0 2 2 2 b\n",
|
||||||
|
"1 1 2 1 1 c\n",
|
||||||
|
"2 1 0 1 1 d\n",
|
||||||
|
"3 0 0 1 0 d\n",
|
||||||
|
"4 2 1 2 2 c\n",
|
||||||
|
"5 0 0 0 0 d\n",
|
||||||
|
"6 0 2 2 2 a\n",
|
||||||
|
"7 0 2 0 0 a\n",
|
||||||
|
"8 0 1 0 0 a\n",
|
||||||
|
"9 0 2 2 1 b\n",
|
||||||
|
"10 2 2 0 1 c\n",
|
||||||
|
"11 2 1 1 1 a\n",
|
||||||
|
"12 0 1 0 2 a\n",
|
||||||
|
"13 2 1 0 1 c\n",
|
||||||
|
"14 1 0 0 1 b"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"df = pd.DataFrame(np.random.randint(0,3,size=(15, 4)), columns=list('ABCD'))\n",
|
||||||
|
"df = pd.concat([df, pd.DataFrame(np.random.choice(list(string.ascii_letters)[:4], size=15, replace=True))], axis=1)\n",
|
||||||
|
"df"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"c:\\Users\\Edouard\\Documents\\Git\\microwave\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"%cd ..\n",
|
||||||
|
"import microwave.data_processing as dp"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style scoped>\n",
|
||||||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||||||
|
" vertical-align: middle;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>A</th>\n",
|
||||||
|
" <th>B</th>\n",
|
||||||
|
" <th>C</th>\n",
|
||||||
|
" <th>D</th>\n",
|
||||||
|
" <th>0_encoded_a</th>\n",
|
||||||
|
" <th>0_encoded_b</th>\n",
|
||||||
|
" <th>0_encoded_c</th>\n",
|
||||||
|
" <th>0_encoded_d</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1</th>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>2</th>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>3</th>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>4</th>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>5</th>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>6</th>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>7</th>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>8</th>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>9</th>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>10</th>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>11</th>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>12</th>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>13</th>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>14</th>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" A B C D 0_encoded_a 0_encoded_b 0_encoded_c 0_encoded_d\n",
|
||||||
|
"0 0.0 2.0 2.0 2.0 0.0 1.0 0.0 0.0\n",
|
||||||
|
"1 1.0 2.0 1.0 1.0 0.0 0.0 1.0 0.0\n",
|
||||||
|
"2 1.0 0.0 1.0 1.0 0.0 0.0 0.0 1.0\n",
|
||||||
|
"3 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0\n",
|
||||||
|
"4 2.0 1.0 2.0 2.0 0.0 0.0 1.0 0.0\n",
|
||||||
|
"5 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0\n",
|
||||||
|
"6 0.0 2.0 2.0 2.0 1.0 0.0 0.0 0.0\n",
|
||||||
|
"7 0.0 2.0 0.0 0.0 1.0 0.0 0.0 0.0\n",
|
||||||
|
"8 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0\n",
|
||||||
|
"9 0.0 2.0 2.0 1.0 0.0 1.0 0.0 0.0\n",
|
||||||
|
"10 2.0 2.0 0.0 1.0 0.0 0.0 1.0 0.0\n",
|
||||||
|
"11 2.0 1.0 1.0 1.0 1.0 0.0 0.0 0.0\n",
|
||||||
|
"12 0.0 1.0 0.0 2.0 1.0 0.0 0.0 0.0\n",
|
||||||
|
"13 2.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0\n",
|
||||||
|
"14 1.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"new_df, trans = dp.df_to_numeric(df)\n",
|
||||||
|
"new_df"
|
||||||
|
]
|
||||||
|
},
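The call above shows df_to_numeric's default behaviour: the numeric columns A–D appear to pass through unchanged (cast to float), while the categorical column 0 is expanded into the one-hot indicator columns 0_encoded_a through 0_encoded_d. A minimal sketch of that expansion using scikit-learn directly (the helper name one_hot_column and the exact pipeline are assumptions for illustration, not microwave's actual implementation):

import pandas as pd
from sklearn.preprocessing import OneHotEncoder

def one_hot_column(df: pd.DataFrame, col) -> pd.DataFrame:
    # Dense output, matching the OneHotEncoder(sparse_output=False)
    # reported by trans.transforms in the next cell.
    enc = OneHotEncoder(sparse_output=False)
    encoded = enc.fit_transform(df[[col]])
    names = [f"{col}_encoded_{cat}" for cat in enc.categories_[0]]
    onehot = pd.DataFrame(encoded, columns=names, index=df.index)
    return pd.concat([df.drop(columns=[col]).astype(float), onehot], axis=1)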
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[(0,\n",
"  OneHotEncoder(sparse_output=False),\n",
"  ['0_encoded_a', '0_encoded_b', '0_encoded_c', '0_encoded_d']),\n",
" ('A',\n",
"  FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),\n",
"  ['column']),\n",
" ('B',\n",
"  FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),\n",
"  ['column']),\n",
" ('C',\n",
"  FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),\n",
"  ['column']),\n",
" ('D',\n",
"  FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),\n",
"  ['column']),\n",
" ('0_encoded_a',\n",
"  FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),\n",
"  ['column']),\n",
" ('0_encoded_b',\n",
"  FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),\n",
"  ['column']),\n",
" ('0_encoded_c',\n",
"  FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),\n",
"  ['column']),\n",
" ('0_encoded_d',\n",
"  FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),\n",
"  ['column'])]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"trans.transforms"
]
},
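Judging from the repr above, trans.transforms is a list of (source column, fitted transformer, output column names) tuples; the ['column'] entries attached to the pass-through FunctionTransformers look like placeholder names. Assuming that layout, the fitted pieces can be inspected with a short loop:

# Assumes the (column, transformer, output_columns) tuple layout shown above.
for col, transformer, out_cols in trans.transforms:
    print(f"{col!r}: {type(transformer).__name__} -> {out_cols}")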
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\Edouard\\Documents\\Git\\microwave\\.venv_microwave\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:110: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n",
"  y = column_or_1d(y, warn=True)\n"
]
},
{
"data": {
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style scoped>\n",
|
||||||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||||||
|
" vertical-align: middle;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>A</th>\n",
|
||||||
|
" <th>B</th>\n",
|
||||||
|
" <th>C</th>\n",
|
||||||
|
" <th>D</th>\n",
|
||||||
|
" <th>0_encoded</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1</th>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>2</th>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>3.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>3</th>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>3.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>4</th>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>5</th>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>3.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>6</th>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>7</th>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>8</th>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>9</th>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>10</th>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>11</th>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>12</th>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>13</th>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>2.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>14</th>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" <td>1.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" A B C D 0_encoded\n",
|
||||||
|
"0 0.0 2.0 2.0 2.0 1.0\n",
|
||||||
|
"1 1.0 2.0 1.0 1.0 2.0\n",
|
||||||
|
"2 1.0 0.0 1.0 1.0 3.0\n",
|
||||||
|
"3 0.0 0.0 1.0 0.0 3.0\n",
|
||||||
|
"4 2.0 1.0 2.0 2.0 2.0\n",
|
||||||
|
"5 0.0 0.0 0.0 0.0 3.0\n",
|
||||||
|
"6 0.0 2.0 2.0 2.0 0.0\n",
|
||||||
|
"7 0.0 2.0 0.0 0.0 0.0\n",
|
||||||
|
"8 0.0 1.0 0.0 0.0 0.0\n",
|
||||||
|
"9 0.0 2.0 2.0 1.0 1.0\n",
|
||||||
|
"10 2.0 2.0 0.0 1.0 2.0\n",
|
||||||
|
"11 2.0 1.0 1.0 1.0 0.0\n",
|
||||||
|
"12 0.0 1.0 0.0 2.0 0.0\n",
|
||||||
|
"13 2.0 1.0 0.0 1.0 2.0\n",
|
||||||
|
"14 1.0 0.0 0.0 1.0 1.0"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"new_df, trans = dp.df_to_numeric(df, encoding=\"label\")\n",
|
||||||
|
"new_df"
|
||||||
|
]
|
||||||
|
},
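With encoding="label" the categorical column collapses into a single integer-coded 0_encoded column instead of four indicator columns; comparing the two outputs, the codes follow alphabetical category order (a -> 0.0, b -> 1.0, c -> 2.0, d -> 3.0), consistent with sklearn's LabelEncoder. The DataConversionWarning in stderr comes from handing the encoder a one-column frame where a 1-D array is expected; ravel() (or passing a Series) silences it. A hedged sketch of that path, assuming df is the demo frame built earlier in the notebook and that LabelEncoder is indeed what df_to_numeric uses internally:

from sklearn.preprocessing import LabelEncoder

enc = LabelEncoder()
# 1-D input avoids the "column-vector y" DataConversionWarning seen above.
codes = enc.fit_transform(df[0].to_numpy().ravel())
labeled = df.drop(columns=[0]).astype(float)
labeled["0_encoded"] = codes.astype(float)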
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv_microwave (3.13.2)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
1234
notebooks/demo_univariate_aggregates.ipynb
Normal file
File diff suppressed because it is too large
BIN
requirements.txt
Normal file
Binary file not shown.