Compare commits
1 commit: main...timing_fun

| Author | SHA1 | Date |
|---|---|---|
|  | 911a6c6f06 |  |
__init__.py (1 line, normal file)
__init__.py (1 line, normal file)
@@ -0,0 +1 @@
import utils

@@ -1,3 +0,0 @@
from . import data_analysis
from . import utils
from . import math

@@ -1,2 +0,0 @@
from . import ppscore
from . import univariate

@@ -1 +0,0 @@
from .ppscore import *
@@ -1,276 +0,0 @@
from typing import Union, Callable, Optional, Any

from sklearn.metrics import make_scorer
from sklearn.tree import DecisionTreeRegressor
from sklearn.base import is_classifier, is_regressor
from sklearn.model_selection import cross_val_score
from joblib import Parallel, delayed

from ...utils import _verify_tabular_data_shape, sample_rows, nan_rows_mask, _sample, _to_series

import pandas as pd
import numpy as np


def _identify_case(model) -> str:
    """
    Identifies whether the given model is a classifier or a regressor.

    Args:
        model: Must be sklearn-compatible and either a regressor or a classifier.

    Returns:
        "classification" or "regression"

    Raises:
        ValueError: If the model cannot be determined to be either a classifier or a regressor.
    """
    if is_classifier(model):
        return "classification"
    elif is_regressor(model):
        return "regression"
    else:
        raise ValueError("The model cannot be determined to be either a classifier or a regressor")


def _get_baseline_score(y: Union[np.ndarray, pd.DataFrame], case: str, metric: Callable) -> float:
    """
    Calculates the expected metric result of a naive model against y.

    Args:
        y: shape[n, 1]; True values.
        case: "classification" or "regression"
        metric: Metric to use to score the prediction. Must take in y_true, y_pred.

    Returns:
        A baseline score according to the metric: the score of a model always predicting
        the median value for regression, or the most frequent value for classification.

    Raises:
        Nothing
    """
    y = _sample(y, ~nan_rows_mask(y))
    if case == "regression":
        base = np.full_like(y, np.median(y))
    elif case == "classification":
        values, counts = np.unique(y, return_counts=True)
        ind = np.argmax(counts)
        base = np.full_like(y, values[ind])
    return metric(y, base)


def _prepare_df(x: Any, y: Any, metric: Callable, model) -> pd.DataFrame:
    """
    Calculates the base information depending on the model, metric and true values.

    Args:
        x: shape[n, m]; Predictors (features).
        y: shape[n, 1]; True values (targets).
        metric: Metric to use to score the prediction. Must take in y_true, y_pred.
        model: Model to use. Must be sklearn-compatible and either a regressor or a classifier.

    Returns:
        A DataFrame containing the pps information, including:
        - ppscore: Placeholder for the predictive power score, initialized to NaN for each feature.
        - case: The kind of problem ("classification" or "regression").
        - metric: The name of the metric used.
        - perfect_score: The score when the model's predictions are perfect.
        - naive_score: The score of a naive model predicting the most frequent value
          (classification) or the median value (regression).
        - model_score: Placeholder for the model score, initialized to NaN.
        - model: The type of the model.

    Raises:
        Nothing
    """
    y = _sample(y, ~nan_rows_mask(y))
    case = _identify_case(model)
    baseline_score = _get_baseline_score(y, case, metric)
    perfect_score = metric(y, y)
    return pd.DataFrame({
        "ppscore": [np.nan] * (x.shape[1] if len(x.shape) > 1 else 1),
        "case": case,
        "metric": metric.__name__,
        "perfect_score": perfect_score,
        "naive_score": baseline_score,
        "model_score": np.nan,
        "model": type(model).__name__
    })


def _score(x: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.DataFrame], metric: Callable, model, metric_params: Optional[dict] = None, crossvals: int = 5):
    """
    Returns the cross-validated score of the model according to the given metric.

    Args:
        x: shape[n, m]; Predictors (features).
        y: shape[n, 1]; True values (targets).
        metric: Metric to use to score the prediction. Must take in y_true, y_pred.
        model: Model to use. Defaults to DecisionTreeRegressor. Must be sklearn-compatible.
        metric_params (optional): Additional parameters to pass to the metric function.
        crossvals (optional) [default=5]: Number of cross-validations to perform.

    Returns:
        The evaluation metric on the prediction of the model.

    Raises:
        Nothing
    """
    if model is None:
        model = DecisionTreeRegressor()
    if metric_params is None:  # avoid a mutable default argument
        metric_params = {}
    nan_mask = nan_rows_mask(x, y)
    scores = cross_val_score(
        model,
        np.array(_sample(x, ~nan_mask)).reshape(-1, 1),
        np.array(_sample(y, ~nan_mask)).ravel(),  # 1-D targets, as sklearn expects
        cv=crossvals,
        scoring=make_scorer(metric, **metric_params)
    )
    return scores.mean()


def _calc_ppscore(score: Union[int, float, np.ndarray, pd.Series],
                  naive_score: Union[int, float, np.ndarray, pd.Series],
                  perfect_score: Union[int, float, np.ndarray, pd.Series]) -> Union[float, np.ndarray, pd.Series]:
    """
    Calculates the predictive power score (pps) for given scores, naive scores, and perfect scores.

    Args:
        score: The actual score(s).
        naive_score: The naive score(s).
        perfect_score: The perfect score(s).

    Returns:
        The predictive power score(s).
    """
    # Remember the index before the array cast; the old isinstance check ran after
    # np.asarray and therefore never fired.
    index = score.index if isinstance(score, pd.Series) else None
    score = np.asarray(score)
    naive_score = np.asarray(naive_score)
    perfect_score = np.asarray(perfect_score)
    pps = (score - naive_score) / (perfect_score - naive_score)
    pps = np.where(pps <= 0, 0, pps)
    if index is not None:
        return pd.Series(pps, index=index)
    return pps
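The normalization in `_calc_ppscore` is worth a quick sanity check: a model score halfway between the naive baseline and the perfect score gives a pps of 0.5, and anything at or below the baseline is clipped to 0. A minimal sketch:

```python
import numpy as np

# Suppose a metric where higher is better: naive baseline 0.4, perfect score 1.0.
# A model score of 0.7 sits halfway between them.
pps = (0.7 - 0.4) / (1.0 - 0.4)
assert np.isclose(pps, 0.5)

# A model score below the baseline is clipped to 0 rather than going negative.
pps = max(0.0, (0.3 - 0.4) / (1.0 - 0.4))
assert pps == 0.0
```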
def score(x: Any, y: Any, metric: Callable, model: Optional[object] = None, sample: Optional[int] = None, shuffle: bool = True, crossvals: int = 5) -> pd.DataFrame:
    """
    Calculates the predictive power score (pps) of x against y using a given model.
    The score is baselined between 0 and 1 depending on the kind of problem.

    Args:
        x: shape[n, 1] or [n,]; Predictors (features).
        y: shape[n, 1] or [n,]; True values (targets).
        metric: Metric to use to score the prediction. Must take in y_true, y_pred, both array-likes.
        model (optional) [default=DecisionTreeRegressor]: Model to use. Must be sklearn-compatible and either a regressor or a classifier.
        sample (optional) [default=None]: Number of rows to sample from x and y to make calculations faster. None means no sampling.
        shuffle (optional) [default=True]: Whether to shuffle the rows of x and y.
        crossvals (optional) [default=5]: Number of cross-validations to perform when fitting and evaluating the model.

    Returns:
        The pps of x against y. The pps is (score - baseline) / (perfect_score - baseline), where the baseline is a naive model predicting the median for regression or the most common class for classification.

    Raises:
        Nothing
    """
    _verify_tabular_data_shape(x, y, is_column=True)
    x, y = _to_series(x), _to_series(y)
    x, y = sample_rows(x, y, sample=sample, shuffle=shuffle)
    if model is None:
        model = DecisionTreeRegressor()
    res_df = _prepare_df(x, y, metric, model)
    res_df["model_score"] = _score(x, y, metric, model, crossvals=crossvals)
    res_df["ppscore"] = _calc_ppscore(res_df["model_score"], res_df["naive_score"], res_df["perfect_score"])
    return res_df
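A minimal usage sketch, assuming `score` from the module above is in scope. A lower-is-better metric such as `mean_absolute_error` works because the pps formula normalizes it the same way (perfect score 0, naive baseline above it):

```python
import numpy as np
from sklearn.metrics import mean_absolute_error

rng = np.random.default_rng(0)
x = rng.normal(size=500)
y = 2 * x + rng.normal(scale=0.1, size=500)  # y is almost perfectly predictable from x

res = score(x, y, metric=mean_absolute_error)
print(res[["ppscore", "naive_score", "model_score"]])  # ppscore should be close to 1
```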
def predictors(df: pd.DataFrame, y: Any, metric: Callable, model=None, crossvals: int = 5, njobs=1, sample: int = 5000, sort=True, shuffle=True):
    """
    Calculates the predictive power score (pps) of every column in df against y using a given model.
    The score is baselined between 0 and 1 depending on the kind of problem.

    Args:
        df: shape[n, m]; Predictors (features).
        y: shape[n, 1]; True values (targets).
        metric: Metric to use to score the prediction. Must take in y_true, y_pred.
        model (optional) [default=DecisionTreeRegressor]: Model to use. Must be sklearn-compatible and either a regressor or classifier.
        crossvals (optional) [default=5]: Number of cross-validations to perform.
        njobs (optional) [default=1]: Number of jobs to run in parallel.
        sample (optional) [default=5000]: Number of rows to sample from df and y for faster calculations. None means no sampling.
        sort (optional) [default=True]: Whether to sort the results by ppscore in descending order.
        shuffle (optional) [default=True]: Whether to shuffle the rows of df and y before processing.

    Returns:
        A DataFrame containing the pps of each column in df against y. The pps is (score - baseline) / (perfect_score - baseline), where the baseline is a naive model predicting the median for regression or the most common class for classification.

    Raises:
        Nothing
    """
    _verify_tabular_data_shape(df, is_column=False)
    _verify_tabular_data_shape(y, is_column=True)
    y = _to_series(y)
    df, y = sample_rows(df, y, sample=sample, shuffle=shuffle)
    if model is None:
        model = DecisionTreeRegressor()
    res_df = _prepare_df(df, y, metric, model)
    # Pass crossvals through to _score; it was previously dropped, silently ignoring the argument.
    res_df["model_score"] = Parallel(n_jobs=njobs)(delayed(_score)(df.iloc[:, [i]], y, metric, model, crossvals=crossvals) for i in range(df.shape[1]))
    res_df["ppscore"] = _calc_ppscore(res_df["model_score"], res_df["naive_score"], res_df["perfect_score"])
    res_df.insert(0, "x", df.columns)
    if sort:
        res_df = res_df.sort_values("ppscore", ascending=False)
    return res_df.reset_index(drop=True)
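The column-wise counterpart, under the same assumption that the module's functions are in scope:

```python
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error

rng = np.random.default_rng(1)
df = pd.DataFrame({
    "signal": rng.normal(size=300),  # informative feature
    "noise": rng.normal(size=300),   # uninformative feature
})
y = df["signal"] * 3 + rng.normal(scale=0.1, size=300)

ranking = predictors(df, y, metric=mean_absolute_error)
print(ranking[["x", "ppscore"]])  # "signal" should rank far above "noise"
```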
def _predictors_of_col(df: pd.DataFrame, col: str, metric: Callable, model, **kwargs) -> pd.DataFrame:
    """
    Calculates the predictive power score (pps) of all columns in df against the specified column using a given model and metric.

    Args:
        df: Input DataFrame containing predictors and the target column.
        col: Target column name.
        metric: Metric to use to score the prediction. Must take in y_true, y_pred.
        model: Model to use. Must be sklearn-compatible and either a regressor or classifier.

    Returns:
        A DataFrame containing the pps of each predictor in df against the specified target column. The pps is (score - baseline) / (perfect_score - baseline), where the baseline is a naive model predicting the median for regression or the most common class for classification.

    Raises:
        Nothing
    """
    y = df[[col]]
    df_pred = df.loc[:, df.columns != col]
    res = predictors(df_pred, y, metric, model=model, **kwargs)
    res.insert(1, "y", col)
    return res


def matrix(df: pd.DataFrame, metric: Callable, model=None, crossvals: int = 5, njobs: int = 1, sample: int = 5000, sort: bool = True, shuffle: bool = True) -> pd.DataFrame:
    """
    Calculates the predictive power score (pps) of every column in df against every other column in df using a given model.
    Scores are baselined between 0 and 1 depending on the nature of the problem.

    Args:
        df: shape[n, m]
        metric: Metric to use to score the prediction. Must take in y_true, y_pred.
        model (optional) [default=DecisionTreeRegressor]: Model to use. Must be sklearn-compatible and either a regressor or classifier.
        crossvals (optional) [default=5]: Number of cross-validations to perform.
        njobs (optional) [default=1]: Number of jobs to run in parallel.
        sample (optional) [default=5000]: Number of rows to sample from df for faster calculations. None means no sampling.
        sort (optional) [default=True]: Whether to sort the results by ppscore in descending order.
        shuffle (optional) [default=True]: Whether to shuffle the rows of df before processing.

    Returns:
        A DataFrame containing the pps of each predictor in df against every target column.
        The pps is (score - baseline) / (perfect_score - baseline), where the baseline is a naive model predicting the median for regression or the most common class for classification.

    Raises:
        Nothing
    """
    _verify_tabular_data_shape(df, is_column=False)
    df = sample_rows(df, sample=sample, shuffle=shuffle)[0]
    if model is None:
        model = DecisionTreeRegressor()
    res = Parallel(n_jobs=njobs)(delayed(_predictors_of_col)(df, col, metric, model, crossvals=crossvals, sample=None, sort=False, shuffle=False) for col in df.columns)
    res_df = pd.concat(res, axis=0)
    if sort:
        res_df = res_df.sort_values("ppscore", ascending=False)
    return res_df.reset_index(drop=True)


def mutual_predictors(matrix: pd.DataFrame, threshold: float = 0.9):
    """
    Returns the list of features that are the most predicted by others, above a given threshold.
    Intended for dimensionality reduction.

    On each pass, every pps above the threshold is summed per target feature; the most predicted
    feature is recorded and ignored in subsequent passes. Continues until no remaining pps is
    over the threshold.

    Args:
        matrix: A DataFrame containing the pps of each feature against every other.
        threshold (optional) [default=0.9]

    Returns:
        A list of features that are the most predicted by others.

    Raises:
        Nothing
    """
    features = matrix["x"].unique()
    cols_predict_count = dict(zip(features, [0] * len(features)))
    pred_mut = []
    while True:
        # Reset the per-pass totals: previously the scores accumulated across passes,
        # so the termination check below could never fire once anything had been added.
        cols_predict_count = {feature: 0 for feature in cols_predict_count}
        for _, predict in matrix.iterrows():
            if predict.y in cols_predict_count and predict.ppscore > threshold:
                cols_predict_count[predict.y] += predict.ppscore
        if sum(cols_predict_count.values()) == 0:
            break
        else:
            best_predictor = max(cols_predict_count, key=cols_predict_count.get)
            pred_mut.append(best_predictor)
            del cols_predict_count[best_predictor]
            matrix = matrix[matrix["x"] != best_predictor]
            matrix = matrix[matrix["y"] != best_predictor]
    return pred_mut
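Putting `matrix` and `mutual_predictors` together, a sketch of the intended dimensionality-reduction loop (`df` here is any numeric DataFrame, such as the one from the `predictors` example above):

```python
from sklearn.metrics import mean_absolute_error

m = matrix(df, metric=mean_absolute_error, sample=None)  # pairwise pps of every column pair
redundant = mutual_predictors(m, threshold=0.9)          # features other columns already explain
df_reduced = df.drop(columns=redundant)
```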
@@ -1 +0,0 @@
from .aggregates import *
@@ -1,236 +0,0 @@
from typing import Any, Union, Optional, Callable

from joblib import Parallel, delayed

import numpy as np
import pandas as pd
import scipy.stats as stats


def inter_percentile_mean(x: np.ndarray, N1: float = 0.1, N2: float = 0.9) -> float:
    """
    Calculates the mean within a certain percentile range.

    Args:
        x: The considered ndarray.
        N1: Lower percentile (between 0 and 1)
        N2: Upper percentile (between 0 and 1)

    Returns:
        The IP-mean

    Raises:
        Nothing
    """
    # N1/N2 are fractions, while np.percentile expects values in [0, 100].
    p1, p2 = np.percentile(x[~np.isnan(x)], [N1 * 100, N2 * 100])
    return np.mean(x[(x >= p1) & (x <= p2)])


def median_absolute_deviation(x: np.ndarray) -> float:
    """
    Calculates the median of the deviations from the median.

    Args:
        x: The considered ndarray.

    Returns:
        The MAD

    Raises:
        Nothing
    """
    return np.nanmedian(np.abs(x - np.nanmedian(x)))


def inter_percentile_range(x: np.ndarray, N1: float = 0.1, N2: float = 0.9) -> float:
    """
    Calculates the range within a certain percentile range.

    Args:
        x: The considered ndarray.
        N1: Lower percentile (between 0 and 1)
        N2: Upper percentile (between 0 and 1)

    Returns:
        The IP-range

    Raises:
        Nothing
    """
    # Same convention as inter_percentile_mean: fractions in, percent for np.percentile.
    return np.percentile(x[~np.isnan(x)], N2 * 100) - np.percentile(x[~np.isnan(x)], N1 * 100)


def mode(x: np.ndarray) -> Any:
    """
    Calculates the mode of numeric and categorical variables.

    Args:
        x: The considered ndarray.

    Returns:
        The mode

    Raises:
        Nothing
    """
    if pd.api.types.is_numeric_dtype(x):
        return stats.mode(x, nan_policy='omit').mode
    else:
        return pd.Series(x).mode().iat[0]


def geothmetic_meandian(x: np.ndarray, iter: int = 100) -> float:
    """
    Recursively averages the arithmetic mean, geometric mean, and median (https://xkcd.com/2435/).

    Args:
        x: The considered ndarray.
        iter: Number of remaining recursion steps; the three averages converge long before 100.

    Returns:
        The geothmetic meandian

    Raises:
        Nothing
    """
    if iter == 0:
        return x[0]
    return geothmetic_meandian(np.array([AGGFUNCCODES["mean"](x), AGGFUNCCODES["gmean"](x), AGGFUNCCODES["median"](x)]), iter=iter - 1)


def get_n_outliers(x: np.ndarray, n_sig: Union[float, int] = 3) -> int:
    """
    Calculates the number of outliers with the z-score method.

    Args:
        x: The considered ndarray.
        n_sig: Number of standard deviations beyond which a value is considered an outlier.

    Returns:
        The number of outliers

    Raises:
        Nothing
    """
    return np.sum(np.abs(stats.zscore(x, nan_policy='omit')) > n_sig)


AGGFUNCCODES = {
    # Counts
    "size": len,
    "non-null": lambda x: len(x) - pd.isna(x).sum(),
    "nunique": lambda x: pd.Series(x).nunique(dropna=True),

    # Basic
    "sum": lambda x: np.nansum(x) if np.issubdtype((x.to_numpy() if isinstance(x, pd.Series) else x).dtype, np.number) else np.nan,
    "min": np.nanmin,
    "max": np.nanmax,
    "first": lambda x: x[~pd.isna(x)][0],
    "last": lambda x: x[~pd.isna(x)][-1],

    # Centricity
    "mean": np.nanmean,
    "median": np.nanmedian,
    "mode": mode,
    "gmean": lambda x: stats.gmean(x[~pd.isna(x)]),
    "hmean": lambda x: stats.hmean(x[~pd.isna(x)]),
    "Pmean": inter_percentile_mean,
    "geothmetic meandian": geothmetic_meandian,

    # Dispersion
    "variance": np.nanvar,
    "std": np.nanstd,
    "mad": median_absolute_deviation,
    "skewness": lambda x: stats.skew(x, nan_policy='omit'),
    "excesskurtosis": lambda x: stats.kurtosis(x, fisher=False, nan_policy='omit'),
    "range": lambda x: np.nanmax(x) - np.nanmin(x),
    "Prange": inter_percentile_range,
    "n_outliers": get_n_outliers,

    # Percentiles
    "P75": lambda x: np.percentile(x[~pd.isna(x)], 75),
    "P25": lambda x: np.percentile(x[~pd.isna(x)], 25),
    "P10": lambda x: np.percentile(x[~pd.isna(x)], 10),
    "P90": lambda x: np.percentile(x[~pd.isna(x)], 90),
    "PN": lambda x, N: np.percentile(x[~pd.isna(x)], N),

    # Distribution
    "skewtest": lambda x, **kwargs: stats.skewtest(x, nan_policy='omit', **kwargs),
    "kurtosistest": lambda x, **kwargs: stats.kurtosistest(x, nan_policy='omit', **kwargs),
    "normaltest": lambda x, **kwargs: stats.normaltest(x, nan_policy='omit', **kwargs),
    "jarque_bera": lambda x, **kwargs: stats.jarque_bera(x, nan_policy='omit', **kwargs),
    "shapiro": lambda x, **kwargs: stats.shapiro(x, nan_policy='omit', **kwargs),
    "anderson": lambda x, **kwargs: stats.anderson(x, **kwargs),

    # Other
    "energy": lambda x: np.nansum(x**2),
    "rms": lambda x: np.sqrt(np.nanmean(x**2)),
    "entropy": lambda x: stats.entropy(pd.Series(x).value_counts(normalize=True), base=2),
    "autocorrelation": lambda x, lag=1: pd.Series(x).autocorr(lag=lag)
}


def execute_agg_func(x: Any, agg: Union[Callable, str, dict]) -> Any:
    """
    Executes a given aggregation function on given data.

    Accepts a callable, a str corresponding to an AGGFUNCCODES key, or a dict with either of
    those at key 'func' (plus optional 'kwargs' and 'ret_names' keys).
    If the function returns multiple values, returns a dict with a name for each value,
    defaulting to a, b, c, ...

    Args:
        x: The data to execute the function on.
        agg: The function to execute.

    Returns:
        Either the raw return value, or a dict of named values if the function returned a tuple.
        The names can be overridden by passing a dict with key 'ret_names'.

    Raises:
        Nothing
    """
    ret_names = None
    if callable(agg):
        ret = agg(x)
    elif isinstance(agg, str):
        ret = AGGFUNCCODES[agg](x)
    elif isinstance(agg, dict):
        kwargs = {} if 'kwargs' not in agg else agg['kwargs']
        if callable(agg['func']):
            ret = agg['func'](x, **kwargs)
        elif isinstance(agg['func'], str):
            ret = AGGFUNCCODES[agg['func']](x, **kwargs)
        if 'ret_names' in agg:
            ret_names = agg['ret_names']
    if isinstance(ret, tuple):
        if ret_names is None:
            ret_names = [chr(c) for c in range(ord('a'), ord('a') + len(ret))]
        return dict(zip(ret_names, ret))
    else:
        return ret
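The three spec forms `execute_agg_func` accepts, side by side (a sketch; the array is long enough for `skewtest`, which needs at least eight non-NaN values):

```python
import numpy as np

x = np.array([1.0, 2.0, np.nan, 4.0, 2.5, 3.0, 1.5, 2.2, 3.3, 2.8])

execute_agg_func(x, np.nanmax)                             # plain callable -> 4.0
execute_agg_func(x, "median")                              # AGGFUNCCODES key -> 2.5
execute_agg_func(x, {"func": "PN", "kwargs": {"N": 90}})   # dict form with kwargs
# A tuple-returning function comes back as a dict of named values:
execute_agg_func(x, {"func": "skewtest", "ret_names": ["statistic", "pvalue"]})
```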
def calculate_aggregates(x: Any, aggs: list[Union[Callable, str, dict[str, Union[str, Callable]]]]) -> dict[str, Any]:
    """
    Executes a given list of aggregation functions on given data.

    Args:
        x: The data to execute the functions on.
        aggs: The functions to execute.

    Returns:
        A dict of named values. The names can be specified per agg function through the
        'name' and 'ret_names' keys of the dict form.

    Raises:
        Nothing
    """
    results = {}
    for i, func in enumerate(aggs):
        funcname = f"func_{i}"
        try:
            ret = execute_agg_func(x, func)
        except Exception as e:
            print(f"agg_{i}", func, e)
            ret = np.nan
        if isinstance(func, dict):
            if 'name' in func:
                funcname = func['name']
            elif isinstance(func['func'], str):
                funcname = func['func']
        elif isinstance(func, str):
            funcname = func if func not in results.keys() else f"{func}_{i}"
        if isinstance(ret, dict):
            results.update(dict(zip([f"{funcname}_{key}" for key in ret.keys()], ret.values())))
        else:
            results[funcname] = ret
    return results


def build_univariate_statistics(df: pd.DataFrame, agg: Optional[Union[str, list[Union[str, dict[str, dict]]]]] = "all", n_jobs: int = 1) -> pd.DataFrame:
    """
    Calculates the specified univariate statistics for each column in the DataFrame.

    Args:
        df: The input DataFrame.
        agg: List of aggregation functions to apply, or "all" for every AGGFUNCCODES entry.
            Each element can be a function name (str) or a dict with the function name as the key and args as another dict.
        n_jobs: Number of parallel processes to open. -1 means as many as possible.

    Returns:
        pd.DataFrame: DataFrame with one row per column of df and one column per aggregation.

    Raises:
        Nothing
    """
    if agg == "all":
        agg = list(AGGFUNCCODES.keys())
    results = Parallel(n_jobs=n_jobs)(delayed(calculate_aggregates)(df[col].values, agg) for col in df.columns)
    return pd.DataFrame(results, index=df.columns)
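A short `build_univariate_statistics` sketch, mixing plain codes, a parameterized percentile, and a named custom callable (the column names are illustrative):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "height": np.random.normal(170, 10, size=200),
    "weight": np.random.normal(70, 12, size=200),
})

stats_df = build_univariate_statistics(df, agg=[
    "mean", "std", "n_outliers",
    {"func": "PN", "kwargs": {"N": 99}, "name": "P99"},
    {"func": lambda v: float(np.nanmax(v) - np.nanmin(v)), "name": "spread"},
])
print(stats_df)  # one row per column, one column per statistic
```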
@@ -1,2 +0,0 @@
from .df_preprocessing import *
from .dfTransformer import dfTransformer
@@ -1,75 +0,0 @@
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from typing import Optional


class dfTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.transforms = []

    def add_transform(self, column_name: str, transformer: TransformerMixin, result_columns: Optional[list[str]] = None):
        """
        Adds a transform specific to a column, with optional result column names.

        Args:
            column_name (str): The name of the column to transform. For a nested dfTransformer, use None.
            transformer (TransformerMixin): The transformation object to apply to the column. Must have a .transform method.
            result_columns (list of str, optional): List of names for the resulting columns. Default is None.
        """
        if not hasattr(transformer, 'transform'):
            raise ValueError("The transformer must have a 'transform' method.")
        self.transforms.append((column_name, transformer, result_columns))

    def fit(self, X: pd.DataFrame, y=None):
        """
        Fit method to conform with TransformerMixin. Fits the transformers one by one on their specified columns.

        Args:
            X (pd.DataFrame): The DataFrame to fit.
            y: Ignored.

        Returns:
            self: Fitted transformer.
        """
        for column_name, transformer, _ in self.transforms:
            if isinstance(transformer, dfTransformer):
                transformer.fit(X, y)
            elif column_name in X:
                transformer.fit(X[[column_name]], y)  # Fit the transformer on the specific column
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Applies all stored transformations to the DataFrame, drops the original columns,
        and returns the transformed DataFrame.

        Args:
            X (pd.DataFrame): The DataFrame to transform.

        Returns:
            pd.DataFrame: The transformed DataFrame.
        """
        X_transformed = X.copy()

        for column_name, transformer, result_columns in self.transforms:
            if isinstance(transformer, dfTransformer):
                X_transformed = transformer.transform(X_transformed)
            if column_name in X_transformed:
                transformed_data = transformer.transform(X_transformed[[column_name]])

                # Check if the transformed data is a DataFrame; if not, convert it
                if isinstance(transformed_data, pd.DataFrame):
                    transformed_cols = transformed_data
                else:
                    transformed_cols = pd.DataFrame(transformed_data, index=X_transformed.index)
                if result_columns:
                    transformed_cols.columns = result_columns
                else:
                    transformed_cols.columns = [
                        f"{column_name}_transformed_{i}" for i in range(transformed_cols.shape[1])
                    ]

                X_transformed.drop(columns=[column_name], inplace=True)
                X_transformed = pd.concat([X_transformed, transformed_cols], axis=1)

        return X_transformed
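A minimal sketch of the intended workflow: register per-column transformers, then fit and transform whole frames. `StandardScaler` is just an illustrative choice:

```python
import pandas as pd
from sklearn.preprocessing import StandardScaler

df = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [10.0, 20.0, 30.0]})

t = dfTransformer()
t.add_transform("a", StandardScaler(), result_columns=["a_scaled"])

t.fit(df)
out = t.transform(df)        # column "a" is replaced by "a_scaled"; "b" passes through
print(out.columns.tolist())  # ['b', 'a_scaled']
```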
@@ -1,75 +0,0 @@
import pandas as pd
from typing import Optional
from .dfTransformer import dfTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder, FunctionTransformer


def _get_encoder(encoding):
    if encoding == 'onehot':
        return OneHotEncoder(sparse_output=False)
    elif encoding == 'label':
        return LabelEncoder()
    elif encoding == 'ordinal':
        return OrdinalEncoder()
    else:
        raise ValueError("Unsupported encoding type.")


def _get_encoded_col_names(encoder, prefix="", suffix="", ret_shape: Optional[list] = None):
    # Avoid a mutable default argument: ret_shape was previously `= [1]` and mutated in place.
    if ret_shape is None:
        ret_shape = [1]
    if len(ret_shape) == 1:
        ret_shape.append(1)
    if hasattr(encoder, "categories_"):
        colnames = ["_".join([str(x) for x in (prefix, suffix, cat) if len(str(x)) > 0]) for cat in encoder.categories_[0]]
    elif ret_shape[1] == 1:
        colnames = ["_".join([str(x) for x in (prefix, suffix) if len(str(x)) > 0])]
    elif ret_shape[1] > 1:
        colnames = [
            "_".join([str(x) for x in (prefix, suffix, i) if len(str(x)) > 0]) for i in range(ret_shape[1])
        ]
    return colnames


def df_to_numeric(df, encoding='onehot'):
    """
    Processes a DataFrame by converting numeric columns to float and applying categorical encoding to non-numeric columns.

    Args:
        df (pd.DataFrame): The input DataFrame to process.
        encoding (str or TransformerMixin): The encoding method to apply to categorical columns. Can be a string selecting one of the predefined sklearn encoders ('onehot', 'label', 'ordinal') or a callable.

    Returns:
        tuple: A tuple containing the transformed DataFrame and the fitted dfTransformer object.
    """
    transformer = dfTransformer()
    X_transformed = df.copy()

    if isinstance(encoding, str):
        encoder = _get_encoder(encoding)
    elif callable(encoding):
        encoder = encoding
    else:
        raise ValueError("Encoding must be either a string or a callable transformer.")

    for column in X_transformed.columns:
        if not pd.api.types.is_numeric_dtype(df[column]):
            transformed_data = encoder.fit_transform(X_transformed[[column]])
            result_columns = _get_encoded_col_names(encoder, prefix=column, suffix="encoded", ret_shape=list(transformed_data.shape))
            if isinstance(transformed_data, pd.DataFrame):
                transformed_cols = transformed_data
            else:
                transformed_cols = pd.DataFrame(transformed_data, index=X_transformed.index)
            transformed_cols.columns = result_columns

            transformer.add_transform(column, encoder, result_columns=result_columns)

            X_transformed.drop(columns=[column], inplace=True)
            X_transformed = pd.concat([X_transformed, transformed_cols], axis=1)

    encoder = FunctionTransformer(lambda x: x.astype(float), validate=False)
    for column in X_transformed.columns:
        transformed_data = encoder.fit_transform(X_transformed[[column]])
        # The result column keeps the original name; this previously passed the
        # literal string "column", as the notebook output below shows.
        transformer.add_transform(column, encoder, result_columns=[column])

        X_transformed.drop(columns=[column], inplace=True)
        X_transformed = pd.concat([X_transformed, transformed_data], axis=1)

    return X_transformed, transformer
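The returned `dfTransformer` is what makes this useful beyond a one-off conversion: it can replay the fitted encoding on new data. A sketch:

```python
import pandas as pd

train = pd.DataFrame({"n": [1, 2, 3], "cat": ["a", "b", "a"]})
train_num, trans = df_to_numeric(train)  # one-hot encodes "cat", casts "n" to float

new = pd.DataFrame({"n": [4, 5], "cat": ["b", "a"]})
new_num = trans.transform(new)           # same columns as train_num, no refitting
```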
@@ -1,39 +0,0 @@
import math
from typing import Union

import numpy as np


def gaussian(x: Union[np.ndarray, float], mu: float, sig: float) -> Union[np.ndarray, float]:
    """
    Calculates the height of the specified gaussian at point x.

    Args:
        x: Point(s) at which to calculate the height.
        mu: The gaussian's mean.
        sig: The gaussian's standard deviation.

    Returns:
        The height(s), as a single number, or an ndarray if x is an ndarray.

    Raises:
        Nothing
    """
    return np.exp(-(x - mu)**2 / (2 * sig**2)) / (sig * np.sqrt(2 * np.pi))


def gauss_integral(mu: float, sig: float, a: float = -np.inf, b: float = np.inf) -> float:
    """
    Calculates the definite gaussian integral between a and b. If omitted, a and b default to -inf and inf.

    Args:
        mu: The gaussian's mean.
        sig: The gaussian's standard deviation.
        a: Lower bound, -inf by default.
        b: Upper bound, inf by default.

    Returns:
        The definite integral.

    Raises:
        Nothing
    """
    if sig == 0:
        # Degenerate gaussian: all the mass sits at mu.
        if a <= mu < b:
            return 1
        else:
            return 0
    return (math.erf((b - mu) / (sig * np.sqrt(2))) - math.erf((a - mu) / (sig * np.sqrt(2)))) / 2
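A quick numeric check against the usual normal-distribution facts, assuming the two functions above are in scope:

```python
import numpy as np

# The density at the mean of a standard gaussian is 1/sqrt(2*pi), about 0.3989.
assert np.isclose(gaussian(0.0, mu=0.0, sig=1.0), 1 / np.sqrt(2 * np.pi))

# About 68.27% of the mass lies within one standard deviation of the mean.
assert np.isclose(gauss_integral(mu=0.0, sig=1.0, a=-1.0, b=1.0), 0.6827, atol=1e-4)

# The full integral is 1.
assert np.isclose(gauss_integral(mu=0.0, sig=1.0), 1.0)
```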
@@ -1,9 +0,0 @@
from .arrayutils import _get_shape
from .arrayutils import _verify_tabular_data_shape
from .arrayutils import _verify_same_number_of_rows
from .arrayutils import _sample
from .arrayutils import sample_rows
from .arrayutils import nan_rows_mask
from .arrayutils import _to_series
from .arrayutils import _is_convertible_to_numpy_array
from .arrayutils import split_rows
@@ -1,224 +0,0 @@
from typing import Any, Union, Optional

import numpy as np
import pandas as pd


def _get_shape(x: Any) -> tuple:
    """
    Returns the shape of a given object.

    Args:
        x

    Returns:
        The shape of x, or None if it has none.

    Raises:
        Nothing
    """
    if hasattr(x, "shape"):
        return x.shape
    elif _is_convertible_to_numpy_array(x):
        return np.array(x).shape
    else:
        return None


def _verify_tabular_data_shape(*args: Any, is_column: bool = False):
    """
    Verifies that the shapes of the given objects are coherent for tabular data.

    Args:
        *args: shape[n, m] or [n,]; Tabular data.
        is_column (optional) [default=False]: Set to True to raise an error if an object contains multiple columns.

    Returns:
        Nothing

    Raises:
        ValueError: If one or more of the given objects is not coherent with tabular data.
        ValueError: If is_column is True and one or more of the given objects has multiple columns.
    """
    for arg in args:
        shape = _get_shape(arg)
        if shape is None:
            raise ValueError(f"Input data has no shape: {arg}.")
        if len(shape) < 1 or len(shape) > 2:
            raise ValueError(f"Input data must be a tabular object. Has shape {shape}.")
        # `1 not in shape` would also let a single-row 2D object through; check the
        # column count explicitly.
        if is_column and len(shape) == 2 and shape[1] != 1:
            raise ValueError(f"Input data must be a single column. Has shape {shape}.")


def _verify_same_number_of_rows(*args):
    """
    Verifies that the given objects have the same number of rows.

    Args:
        *args: shape[n, m] or [n,]

    Returns:
        Nothing

    Raises:
        ValueError: If one or more of the given objects has no shape.
        ValueError: If two objects have different numbers of rows.
    """
    n_rows = _get_shape(args[0])[0]
    for arg in args[1:]:
        shape = _get_shape(arg)
        if shape is None:  # check before indexing, so a shapeless object raises cleanly
            raise ValueError(f"Input data has no shape: {arg}.")
        elif n_rows != shape[0]:
            raise ValueError(f"Input objects must have the same number of rows: {n_rows}, {shape[0]}.")


def _sample(x: Union[np.ndarray, pd.DataFrame, pd.Series], ind_list: Any) -> Union[np.ndarray, pd.DataFrame]:
    """
    Samples the rows of a numpy array, pandas DataFrame, or pandas Series based on a list of indices.

    Args:
        x: The array, DataFrame, or Series to be sampled.
        ind_list: The list or array of indices that defines the new order of the rows.

    Returns:
        The sampled array, DataFrame, or Series.

    Raises:
        TypeError: If the input is none of the supported types.
    """
    if isinstance(x, np.ndarray):
        return x[ind_list]
    elif isinstance(x, (pd.DataFrame, pd.Series)):
        return x.iloc[ind_list]
    else:
        raise TypeError("Input must be a numpy array, pandas DataFrame, or pandas Series")


def sample_rows(*args: Any, sample: Optional[int] = None, shuffle: bool = True) -> tuple[Any]:
    """
    Samples rows of the provided objects in the same way and optionally shuffles them.
    Tries to minimize the number of rows containing NaN: NaN-free rows are drawn first,
    and NaN rows are only used to top up the sample.

    Args:
        *args: Input tabular data objects.
        sample: Number of samples to draw from each object. If None, no sampling is done.
        shuffle: If True, shuffles the rows; if False, sampled rows keep their original order.

    Returns:
        Tuple of shuffled and/or sampled objects.

    Raises:
        ValueError: If input objects don't have the same number of rows.
    """
    _verify_same_number_of_rows(*args)
    n_rows = _get_shape(args[0])[0]
    nan_mask = nan_rows_mask(*args)
    full_indices = np.where(~nan_mask)[0]
    n_full_rows = len(full_indices)
    if sample is not None and sample < n_full_rows:
        # Enough NaN-free rows: draw the whole sample from them.
        indices = np.random.choice(full_indices, size=sample, replace=False)
        if not shuffle:
            indices.sort()
    elif sample is not None and sample < n_rows:
        # Not enough NaN-free rows: take all of them, topped up with NaN rows.
        indices = np.concatenate((
            np.random.choice(full_indices, size=n_full_rows, replace=False),
            np.random.choice(np.where(nan_mask)[0], size=sample - n_full_rows, replace=False)
        ))
        if not shuffle:
            indices.sort()
    else:
        indices = np.arange(n_rows)
        if shuffle:
            indices = np.random.choice(indices, size=n_rows, replace=False)

    results = tuple(_sample(arg, indices) for arg in args)
    return results
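A short sketch of `sample_rows` keeping two objects aligned while preferring NaN-free rows:

```python
import numpy as np
import pandas as pd

x = pd.Series([1.0, np.nan, 3.0, 4.0, 5.0])
y = np.array([10.0, 20.0, 30.0, np.nan, 50.0])

# Rows 1 and 3 contain a NaN in one of the two objects; a sample of 2 fits in the
# three clean rows, so both draws come from rows 0, 2 and 4, and stay aligned.
xs, ys = sample_rows(x, y, sample=2, shuffle=True)
```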
def nan_rows_mask(*args: Any) -> np.ndarray:
    """
    Given 2D numpy arrays or DataFrames with the same number of rows, returns a boolean mask that is True for every row where at least one of the objects has a NaN value.

    Args:
        *args: 2D numpy arrays or DataFrames with the same number of rows.

    Returns:
        Boolean mask indicating rows with at least one NaN.

    Raises:
        Nothing
    """
    _verify_same_number_of_rows(*args)
    n_rows = _get_shape(args[0])[0]
    # Initialize the mask with False values, then OR in each object's NaN rows.
    mask = np.zeros(n_rows, dtype=bool)
    for data in args:
        if isinstance(data, pd.DataFrame):
            mask |= data.isna().to_numpy().any(axis=1)
        elif isinstance(data, pd.Series):
            mask |= data.isna().to_numpy()
        else:
            # ndarrays and anything convertible to one share the same path.
            if not isinstance(data, np.ndarray):
                data = np.array(data)
            if data.ndim == 1:
                data = data.reshape(-1, 1)
            mask |= np.isnan(data).any(axis=1)
    return mask


def _to_series(data: Union[pd.Series, pd.DataFrame, np.ndarray]):
    """
    Converts single-column data to a pandas Series.

    Args:
        data: A Series, a single-column DataFrame, or a 1-dimensional or single-column 2-dimensional ndarray.

    Returns:
        The data as a pd.Series.

    Raises:
        ValueError: If the data has more than one column.
    """
    if isinstance(data, pd.Series):
        return data
    elif isinstance(data, pd.DataFrame):
        if data.shape[1] != 1:
            raise ValueError("DataFrame must have exactly one column to convert to Series")
        return data.iloc[:, 0]
    elif isinstance(data, np.ndarray):
        if data.ndim == 1:
            return pd.Series(data)
        elif data.ndim == 2 and data.shape[1] == 1:
            return pd.Series(data.ravel())
        else:
            raise ValueError("ndarray must be 1-dimensional or a 2-dimensional single-column array")


def _is_convertible_to_numpy_array(obj: Any) -> bool:
    """
    Verifies that a given object is convertible to a numpy array without error.

    Args:
        obj: Object to check.

    Returns:
        bool

    Raises:
        Nothing
    """
    if isinstance(obj, (list, tuple, dict, set)):
        return True
    if np.isscalar(obj):
        return True
    if hasattr(obj, '__array__'):
        return True
    return False


def split_rows(data: Union[pd.DataFrame, np.ndarray], bool_array: np.ndarray, drop_index: bool = True) -> list[Union[pd.DataFrame, np.ndarray]]:
    """
    Splits a pandas DataFrame or a numpy array based on a boolean array indicator.

    Args:
        data: The input data to split. Can be a pandas DataFrame or a numpy ndarray.
        bool_array: A 1D boolean array indicating where splits should occur. Must be the same length as `data`.
        drop_index: Whether to reset the index in the resulting DataFrame splits. Default is True.

    Returns:
        A list of the resulting split pd.DataFrames or np.ndarrays.

    Raises:
        ValueError: If the length of `bool_array` does not match the length of `data`.
    """
    if len(bool_array) != len(data):
        raise ValueError("The length of bool_array must match the length of data.")

    indices = np.where(bool_array)[0]
    indices = np.concatenate(([0], indices, [len(data)]))

    if isinstance(data, pd.DataFrame):
        return [data.iloc[start:end].reset_index(drop=drop_index)
                for start, end in zip(indices[:-1], indices[1:])
                if start != end]
    elif isinstance(data, np.ndarray):
        return [data[start:end]
                for start, end in zip(indices[:-1], indices[1:])
                if start != end]
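For instance, marking split points with True slices the data into segments that each start at a marked row:

```python
import numpy as np

data = np.array([0, 1, 2, 3, 4, 5])
marks = np.array([False, False, True, False, True, False])

segments = split_rows(data, marks)
# -> [array([0, 1]), array([2, 3]), array([4, 5])]
```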
File diff suppressed because one or more lines are too long
@@ -1,725 +0,0 @@
|
|||||||
{
|
|
||||||
"cells": [
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 1,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"import pandas as pd\n",
|
|
||||||
"import numpy as np\n",
|
|
||||||
"import string"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 2,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/html": [
|
|
||||||
"<div>\n",
|
|
||||||
"<style scoped>\n",
|
|
||||||
" .dataframe tbody tr th:only-of-type {\n",
|
|
||||||
" vertical-align: middle;\n",
|
|
||||||
" }\n",
|
|
||||||
"\n",
|
|
||||||
" .dataframe tbody tr th {\n",
|
|
||||||
" vertical-align: top;\n",
|
|
||||||
" }\n",
|
|
||||||
"\n",
|
|
||||||
" .dataframe thead th {\n",
|
|
||||||
" text-align: right;\n",
|
|
||||||
" }\n",
|
|
||||||
"</style>\n",
|
|
||||||
"<table border=\"1\" class=\"dataframe\">\n",
|
|
||||||
" <thead>\n",
|
|
||||||
" <tr style=\"text-align: right;\">\n",
|
|
||||||
" <th></th>\n",
|
|
||||||
" <th>A</th>\n",
|
|
||||||
" <th>B</th>\n",
|
|
||||||
" <th>C</th>\n",
|
|
||||||
" <th>D</th>\n",
|
|
||||||
" <th>0</th>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" </thead>\n",
|
|
||||||
" <tbody>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>0</th>\n",
|
|
||||||
" <td>0</td>\n",
|
|
||||||
" <td>2</td>\n",
|
|
||||||
" <td>2</td>\n",
|
|
||||||
" <td>2</td>\n",
|
|
||||||
" <td>b</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>1</th>\n",
|
|
||||||
" <td>1</td>\n",
|
|
||||||
" <td>2</td>\n",
|
|
||||||
" <td>1</td>\n",
|
|
||||||
" <td>1</td>\n",
|
|
||||||
" <td>c</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>2</th>\n",
|
|
||||||
" <td>1</td>\n",
|
|
||||||
" <td>0</td>\n",
|
|
||||||
" <td>1</td>\n",
|
|
||||||
" <td>1</td>\n",
|
|
||||||
" <td>d</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>3</th>\n",
|
|
||||||
" <td>0</td>\n",
|
|
||||||
" <td>0</td>\n",
|
|
||||||
" <td>1</td>\n",
|
|
||||||
" <td>0</td>\n",
|
|
||||||
" <td>d</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>4</th>\n",
|
|
||||||
" <td>2</td>\n",
|
|
||||||
" <td>1</td>\n",
|
|
||||||
" <td>2</td>\n",
|
|
||||||
" <td>2</td>\n",
|
|
||||||
" <td>c</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>5</th>\n",
|
|
||||||
" <td>0</td>\n",
|
|
||||||
" <td>0</td>\n",
|
|
||||||
" <td>0</td>\n",
|
|
||||||
" <td>0</td>\n",
|
|
||||||
" <td>d</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>6</th>\n",
|
|
||||||
" <td>0</td>\n",
|
|
||||||
" <td>2</td>\n",
|
|
||||||
" <td>2</td>\n",
|
|
||||||
" <td>2</td>\n",
|
|
||||||
" <td>a</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>7</th>\n",
|
|
||||||
" <td>0</td>\n",
|
|
||||||
" <td>2</td>\n",
|
|
||||||
" <td>0</td>\n",
|
|
||||||
" <td>0</td>\n",
|
|
||||||
" <td>a</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>8</th>\n",
|
|
||||||
" <td>0</td>\n",
|
|
||||||
" <td>1</td>\n",
|
|
||||||
" <td>0</td>\n",
|
|
||||||
" <td>0</td>\n",
|
|
||||||
" <td>a</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>9</th>\n",
|
|
||||||
" <td>0</td>\n",
|
|
||||||
" <td>2</td>\n",
|
|
||||||
" <td>2</td>\n",
|
|
||||||
" <td>1</td>\n",
|
|
||||||
" <td>b</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>10</th>\n",
|
|
||||||
" <td>2</td>\n",
|
|
||||||
" <td>2</td>\n",
|
|
||||||
" <td>0</td>\n",
|
|
||||||
" <td>1</td>\n",
|
|
||||||
" <td>c</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>11</th>\n",
|
|
||||||
" <td>2</td>\n",
|
|
||||||
" <td>1</td>\n",
|
|
||||||
" <td>1</td>\n",
|
|
||||||
" <td>1</td>\n",
|
|
||||||
" <td>a</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>12</th>\n",
|
|
||||||
" <td>0</td>\n",
|
|
||||||
" <td>1</td>\n",
|
|
||||||
" <td>0</td>\n",
|
|
||||||
" <td>2</td>\n",
|
|
||||||
" <td>a</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>13</th>\n",
|
|
||||||
" <td>2</td>\n",
|
|
||||||
" <td>1</td>\n",
|
|
||||||
" <td>0</td>\n",
|
|
||||||
" <td>1</td>\n",
|
|
||||||
" <td>c</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>14</th>\n",
|
|
||||||
" <td>1</td>\n",
|
|
||||||
" <td>0</td>\n",
|
|
||||||
" <td>0</td>\n",
|
|
||||||
" <td>1</td>\n",
|
|
||||||
" <td>b</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" </tbody>\n",
|
|
||||||
"</table>\n",
|
|
||||||
"</div>"
|
|
||||||
],
|
|
||||||
"text/plain": [
|
|
||||||
" A B C D 0\n",
|
|
||||||
"0 0 2 2 2 b\n",
|
|
||||||
"1 1 2 1 1 c\n",
|
|
||||||
"2 1 0 1 1 d\n",
|
|
||||||
"3 0 0 1 0 d\n",
|
|
||||||
"4 2 1 2 2 c\n",
|
|
||||||
"5 0 0 0 0 d\n",
|
|
||||||
"6 0 2 2 2 a\n",
|
|
||||||
"7 0 2 0 0 a\n",
|
|
||||||
"8 0 1 0 0 a\n",
|
|
||||||
"9 0 2 2 1 b\n",
|
|
||||||
"10 2 2 0 1 c\n",
|
|
||||||
"11 2 1 1 1 a\n",
|
|
||||||
"12 0 1 0 2 a\n",
|
|
||||||
"13 2 1 0 1 c\n",
|
|
||||||
"14 1 0 0 1 b"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 2,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"df = pd.DataFrame(np.random.randint(0,3,size=(15, 4)), columns=list('ABCD'))\n",
|
|
||||||
"df = pd.concat([df, pd.DataFrame(np.random.choice(list(string.ascii_letters)[:4], size=15, replace=True))], axis=1)\n",
|
|
||||||
"df"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 3,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"c:\\Users\\Edouard\\Documents\\Git\\microwave\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"%cd ..\n",
|
|
||||||
"import microwave.data_processing as dp"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 4,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/html": [
|
|
||||||
"<div>\n",
|
|
||||||
"<style scoped>\n",
|
|
||||||
" .dataframe tbody tr th:only-of-type {\n",
|
|
||||||
" vertical-align: middle;\n",
|
|
||||||
" }\n",
|
|
||||||
"\n",
|
|
||||||
" .dataframe tbody tr th {\n",
|
|
||||||
" vertical-align: top;\n",
|
|
||||||
" }\n",
|
|
||||||
"\n",
|
|
||||||
" .dataframe thead th {\n",
|
|
||||||
" text-align: right;\n",
|
|
||||||
" }\n",
|
|
||||||
"</style>\n",
|
|
||||||
"<table border=\"1\" class=\"dataframe\">\n",
|
|
||||||
" <thead>\n",
|
|
||||||
" <tr style=\"text-align: right;\">\n",
|
|
||||||
" <th></th>\n",
|
|
||||||
" <th>A</th>\n",
|
|
||||||
" <th>B</th>\n",
|
|
||||||
" <th>C</th>\n",
|
|
||||||
" <th>D</th>\n",
|
|
||||||
" <th>0_encoded_a</th>\n",
|
|
||||||
" <th>0_encoded_b</th>\n",
|
|
||||||
" <th>0_encoded_c</th>\n",
|
|
||||||
" <th>0_encoded_d</th>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" </thead>\n",
|
|
||||||
" <tbody>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>0</th>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>2.0</td>\n",
|
|
||||||
" <td>2.0</td>\n",
|
|
||||||
" <td>2.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>1</th>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>2.0</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>2</th>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>3</th>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>4</th>\n",
|
|
||||||
" <td>2.0</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>2.0</td>\n",
|
|
||||||
" <td>2.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>5</th>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>6</th>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>2.0</td>\n",
|
|
||||||
" <td>2.0</td>\n",
|
|
||||||
" <td>2.0</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>7</th>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>2.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>8</th>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>9</th>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>2.0</td>\n",
|
|
||||||
" <td>2.0</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>10</th>\n",
|
|
||||||
" <td>2.0</td>\n",
|
|
||||||
" <td>2.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>11</th>\n",
|
|
||||||
" <td>2.0</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>12</th>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>2.0</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>13</th>\n",
|
|
||||||
" <td>2.0</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>14</th>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" </tbody>\n",
|
|
||||||
"</table>\n",
|
|
||||||
"</div>"
|
|
||||||
],
|
|
||||||
"text/plain": [
|
|
||||||
" A B C D 0_encoded_a 0_encoded_b 0_encoded_c 0_encoded_d\n",
|
|
||||||
"0 0.0 2.0 2.0 2.0 0.0 1.0 0.0 0.0\n",
|
|
||||||
"1 1.0 2.0 1.0 1.0 0.0 0.0 1.0 0.0\n",
|
|
||||||
"2 1.0 0.0 1.0 1.0 0.0 0.0 0.0 1.0\n",
|
|
||||||
"3 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0\n",
|
|
||||||
"4 2.0 1.0 2.0 2.0 0.0 0.0 1.0 0.0\n",
|
|
||||||
"5 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0\n",
|
|
||||||
"6 0.0 2.0 2.0 2.0 1.0 0.0 0.0 0.0\n",
|
|
||||||
"7 0.0 2.0 0.0 0.0 1.0 0.0 0.0 0.0\n",
|
|
||||||
"8 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0\n",
|
|
||||||
"9 0.0 2.0 2.0 1.0 0.0 1.0 0.0 0.0\n",
|
|
||||||
"10 2.0 2.0 0.0 1.0 0.0 0.0 1.0 0.0\n",
|
|
||||||
"11 2.0 1.0 1.0 1.0 1.0 0.0 0.0 0.0\n",
|
|
||||||
"12 0.0 1.0 0.0 2.0 1.0 0.0 0.0 0.0\n",
|
|
||||||
"13 2.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0\n",
|
|
||||||
"14 1.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 4,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"new_df, trans = dp.df_to_numeric(df)\n",
|
|
||||||
"new_df"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 5,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"[(0,\n",
|
|
||||||
" OneHotEncoder(sparse_output=False),\n",
|
|
||||||
" ['0_encoded_a', '0_encoded_b', '0_encoded_c', '0_encoded_d']),\n",
|
|
||||||
" ('A',\n",
|
|
||||||
" FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),\n",
|
|
||||||
" ['column']),\n",
|
|
||||||
" ('B',\n",
|
|
||||||
" FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),\n",
|
|
||||||
" ['column']),\n",
|
|
||||||
" ('C',\n",
|
|
||||||
" FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),\n",
|
|
||||||
" ['column']),\n",
|
|
||||||
" ('D',\n",
|
|
||||||
" FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),\n",
|
|
||||||
" ['column']),\n",
|
|
||||||
" ('0_encoded_a',\n",
|
|
||||||
" FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),\n",
|
|
||||||
" ['column']),\n",
|
|
||||||
" ('0_encoded_b',\n",
|
|
||||||
" FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),\n",
|
|
||||||
" ['column']),\n",
|
|
||||||
" ('0_encoded_c',\n",
|
|
||||||
" FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),\n",
|
|
||||||
" ['column']),\n",
|
|
||||||
" ('0_encoded_d',\n",
|
|
||||||
" FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),\n",
|
|
||||||
" ['column'])]"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 5,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"trans.transforms"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 6,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stderr",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"c:\\Users\\Edouard\\Documents\\Git\\microwave\\.venv_microwave\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:110: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n",
|
|
||||||
" y = column_or_1d(y, warn=True)\n"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/html": [
|
|
||||||
"<div>\n",
|
|
||||||
"<style scoped>\n",
|
|
||||||
" .dataframe tbody tr th:only-of-type {\n",
|
|
||||||
" vertical-align: middle;\n",
|
|
||||||
" }\n",
|
|
||||||
"\n",
|
|
||||||
" .dataframe tbody tr th {\n",
|
|
||||||
" vertical-align: top;\n",
|
|
||||||
" }\n",
|
|
||||||
"\n",
|
|
||||||
" .dataframe thead th {\n",
|
|
||||||
" text-align: right;\n",
|
|
||||||
" }\n",
|
|
||||||
"</style>\n",
|
|
||||||
"<table border=\"1\" class=\"dataframe\">\n",
|
|
||||||
" <thead>\n",
|
|
||||||
" <tr style=\"text-align: right;\">\n",
|
|
||||||
" <th></th>\n",
|
|
||||||
" <th>A</th>\n",
|
|
||||||
" <th>B</th>\n",
|
|
||||||
" <th>C</th>\n",
|
|
||||||
" <th>D</th>\n",
|
|
||||||
" <th>0_encoded</th>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" </thead>\n",
|
|
||||||
" <tbody>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>0</th>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>2.0</td>\n",
|
|
||||||
" <td>2.0</td>\n",
|
|
||||||
" <td>2.0</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>1</th>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>2.0</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>2.0</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>2</th>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>3.0</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>3</th>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>3.0</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>4</th>\n",
|
|
||||||
" <td>2.0</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>2.0</td>\n",
|
|
||||||
" <td>2.0</td>\n",
|
|
||||||
" <td>2.0</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>5</th>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>3.0</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>6</th>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>2.0</td>\n",
|
|
||||||
" <td>2.0</td>\n",
|
|
||||||
" <td>2.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>7</th>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>2.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>8</th>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>9</th>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>2.0</td>\n",
|
|
||||||
" <td>2.0</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>10</th>\n",
|
|
||||||
" <td>2.0</td>\n",
|
|
||||||
" <td>2.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>2.0</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>11</th>\n",
|
|
||||||
" <td>2.0</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>12</th>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>2.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>13</th>\n",
|
|
||||||
" <td>2.0</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>2.0</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>14</th>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>0.0</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" </tbody>\n",
|
|
||||||
"</table>\n",
|
|
||||||
"</div>"
|
|
||||||
],
|
|
||||||
"text/plain": [
|
|
||||||
" A B C D 0_encoded\n",
|
|
||||||
"0 0.0 2.0 2.0 2.0 1.0\n",
|
|
||||||
"1 1.0 2.0 1.0 1.0 2.0\n",
|
|
||||||
"2 1.0 0.0 1.0 1.0 3.0\n",
|
|
||||||
"3 0.0 0.0 1.0 0.0 3.0\n",
|
|
||||||
"4 2.0 1.0 2.0 2.0 2.0\n",
|
|
||||||
"5 0.0 0.0 0.0 0.0 3.0\n",
|
|
||||||
"6 0.0 2.0 2.0 2.0 0.0\n",
|
|
||||||
"7 0.0 2.0 0.0 0.0 0.0\n",
|
|
||||||
"8 0.0 1.0 0.0 0.0 0.0\n",
|
|
||||||
"9 0.0 2.0 2.0 1.0 1.0\n",
|
|
||||||
"10 2.0 2.0 0.0 1.0 2.0\n",
|
|
||||||
"11 2.0 1.0 1.0 1.0 0.0\n",
|
|
||||||
"12 0.0 1.0 0.0 2.0 0.0\n",
|
|
||||||
"13 2.0 1.0 0.0 1.0 2.0\n",
|
|
||||||
"14 1.0 0.0 0.0 1.0 1.0"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 6,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"new_df, trans = dp.df_to_numeric(df, encoding=\"label\")\n",
|
|
||||||
"new_df"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": []
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"kernelspec": {
|
|
||||||
"display_name": ".venv_microwave (3.13.2)",
|
|
||||||
"language": "python",
|
|
||||||
"name": "python3"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"codemirror_mode": {
|
|
||||||
"name": "ipython",
|
|
||||||
"version": 3
|
|
||||||
},
|
|
||||||
"file_extension": ".py",
|
|
||||||
"mimetype": "text/x-python",
|
|
||||||
"name": "python",
|
|
||||||
"nbconvert_exporter": "python",
|
|
||||||
"pygments_lexer": "ipython3",
|
|
||||||
"version": "3.13.2"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 2
|
|
||||||
}
|
|
||||||
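From the outputs above, dp.df_to_numeric appears to one-hot encode non-numeric columns by default (producing 0_encoded_a through 0_encoded_d) and to integer-code them when called with encoding="label"; the DataConversionWarning from sklearn's _label.py suggests a LabelEncoder being handed a column vector. Below is a minimal sketch of the equivalent sklearn calls under that assumption, not the actual dp implementation:

import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

df = pd.DataFrame({0: ["b", "c", "d", "d", "c"]})  # one categorical column

# Default mode: one 0/1 indicator column per category.
ohe = OneHotEncoder(sparse_output=False)
onehot = pd.DataFrame(
    ohe.fit_transform(df[[0]]),
    columns=[f"0_encoded_{c}" for c in ohe.categories_[0]],
)

# encoding="label" mode: a single integer-coded column; passing a 1d array
# (via ravel) avoids the DataConversionWarning seen in the notebook output.
le = LabelEncoder()
label = pd.Series(le.fit_transform(df[0].to_numpy().ravel()), name="0_encoded")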
File diff suppressed because it is too large
BIN
requirements.txt
Binary file not shown.
1
utils/__init__.py
Normal file
@@ -0,0 +1 @@
from . import code
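Note: under Python 3's absolute-import rules, a bare "import code" inside the utils package binds the standard-library code module (the interactive-interpreter utilities), not utils/code.py; the explicit relative form resolves to the sibling module.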
67
utils/code.py
Normal file
@@ -0,0 +1,67 @@
from typing import Callable, Any
from time import sleep, perf_counter, process_time


def time_real(func: Callable, *args, **kwargs) -> tuple[float, Any]:
    """
    Measure the elapsed wall-clock (real) time of a function call.

    Parameters
    ----------
    func : Callable
        The function to be measured.
    *args : tuple
        Positional arguments to be passed to the function.
    **kwargs : dict
        Keyword arguments to be passed to the function.

    Returns
    -------
    tuple[float, Any]
        A tuple containing the elapsed time in seconds and the return value of the function.

    Examples
    --------
    >>> def my_func(x):
    ...     sleep(1)
    ...     return x ** 2
    >>> time_real(my_func, 5)  # doctest: +SKIP
    (1.0, 25)
    """
    start = perf_counter()
    ret = func(*args, **kwargs)
    elapsed = perf_counter() - start
    return elapsed, ret


def time_process(func: Callable, *args, **kwargs) -> tuple[float, Any]:
    """
    Measure the elapsed CPU (process) time of a function call.

    Parameters
    ----------
    func : Callable
        The function to be measured.
    *args : tuple
        Positional arguments to be passed to the function.
    **kwargs : dict
        Keyword arguments to be passed to the function.

    Returns
    -------
    tuple[float, Any]
        A tuple containing the elapsed time in seconds and the return value of the function.

    Examples
    --------
    >>> def my_func(x):
    ...     sleep(1)
    ...     return x ** 2
    >>> time_process(my_func, 5)  # doctest: +SKIP
    (0.0, 25)
    """
    start = process_time()
    ret = func(*args, **kwargs)
    elapsed = process_time() - start
    return elapsed, ret
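A quick usage sketch for the two helpers above (assuming they are importable as utils.code when run from the repository root): wall-clock time includes the sleep, while CPU process time does not, so the two measurements diverge for sleep- or I/O-bound work.

from time import sleep
from utils.code import time_real, time_process

def my_func(x):
    sleep(1)
    return x ** 2

elapsed_real, result = time_real(my_func, 5)   # roughly (1.0, 25)
elapsed_cpu, _ = time_process(my_func, 5)      # roughly (0.0, 25)
print(f"real: {elapsed_real:.3f}s, cpu: {elapsed_cpu:.3f}s, result: {result}")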