Compare commits: timing_fun...main (1 commit, SHA1 29936cb347)
@@ -1 +0,0 @@
import utils
microwave/__init__.py (Normal file, +3)
@@ -0,0 +1,3 @@
from . import data_analysis
from . import utils
from . import math
microwave/data_analysis/__init__.py (Normal file, +2)
@@ -0,0 +1,2 @@
from . import ppscore
from . import univariate
microwave/data_analysis/ppscore/__init__.py (Normal file, +1)
@@ -0,0 +1 @@
from .ppscore import *
microwave/data_analysis/ppscore/ppscore.py (Normal file, +276)
@@ -0,0 +1,276 @@
from typing import Union, Callable, Optional, Any
from sklearn.metrics import make_scorer
from sklearn.tree import DecisionTreeRegressor
from sklearn.base import is_classifier, is_regressor
from sklearn.model_selection import cross_val_score
from joblib import Parallel, delayed
from ...utils import _verify_tabular_data_shape, sample_rows, nan_rows_mask, _sample, _to_series
import pandas as pd
import numpy as np

def _identify_case(model) -> str:
    """
    Identifies whether the given model is a classifier or a regressor.

    Args:
        model: Must be sklearn-compatible and either a regressor or a classifier.
    Returns:
        "classification" or "regression"
    Raises:
        ValueError: If the model cannot be determined to be either a classifier or a regressor
    """
    if is_classifier(model):
        return "classification"
    elif is_regressor(model):
        return "regression"
    else:
        raise ValueError("The model cannot be determined to be either a classifier or a regressor")

def _get_baseline_score(y: Union[np.ndarray, pd.DataFrame], case: str, metric: Callable) -> float:
    """
    Calculates the expected metric result of a naive model against y.

    Args:
        y: shape[n, 1]; True values
        case: "classification" or "regression"
        metric: Metric to use to score the prediction. Must take in y_true, y_pred.
    Returns:
        A baseline score according to the metric: the score of a model predicting the median value for a regression or the most frequent value for a classification.
    Raises:
        Nothing
    """
    y = _sample(y, ~nan_rows_mask(y))
    if case == "regression":
        base = np.full_like(y, np.median(y))
    elif case == "classification":
        values, counts = np.unique(y, return_counts=True)
        ind = np.argmax(counts)
        base = np.full_like(y, values[ind])
    return metric(y, base)

def _prepare_df(x: Any, y: Any, metric: Callable, model) -> pd.DataFrame:
    """
    Calculates the base information depending on the model, metric and true values.

    Args:
        x: shape[n, m]; Predictors (features).
        y: shape[n, 1]; True values (targets).
        metric: Metric to use to score the prediction. Must take in y_true, y_pred.
        model: Model to use. Must be sklearn-compatible and either a regressor or a classifier.
    Returns:
        A DataFrame containing the pps information, including:
        - ppscore: Placeholder for the predictive power score, initialized to NaN for each feature.
        - case: The type of problem ("classification" or "regression").
        - metric: The name of the metric used.
        - perfect_score: The score when the model's predictions are perfect.
        - naive_score: The score of a naive model predicting the most frequent value (classification) or the median value (regression).
        - model_score: Placeholder for the model score, initialized to NaN.
        - model: The type of the model.
    Raises:
        Nothing
    """
    y = _sample(y, ~nan_rows_mask(y))
    case = _identify_case(model)
    baseline_score = _get_baseline_score(y, case, metric)
    perfect_score = metric(y, y)
    return pd.DataFrame({
        "ppscore": [np.nan] * (x.shape[1] if len(x.shape) > 1 else 1),
        "case": case,
        "metric": metric.__name__,
        "perfect_score": perfect_score,
        "naive_score": baseline_score,
        "model_score": np.nan,
        "model": type(model).__name__
    })

def _score(x: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.DataFrame], metric: Callable, model, metric_params: Optional[dict] = None, crossvals: int = 5):
    """
    Returns the cross-validated score of the model according to the given metric.

    Args:
        x: shape[n, m]; Predictors (features).
        y: shape[n, 1]; True values (targets).
        metric: Metric to use to score the prediction. Must take in y_true, y_pred.
        model: Model to use. Defaults to DecisionTreeRegressor. Must be sklearn-compatible.
        metric_params (optional): Additional parameters to pass to the metric function.
        crossvals (optional) [default=5]: Number of cross-validations to perform.
    Returns:
        The evaluation metric on the prediction of the model
    Raises:
        Nothing
    """
    if model is None:
        model = DecisionTreeRegressor()
    if metric_params is None:  # avoid a mutable default argument
        metric_params = {}
    nan_mask = nan_rows_mask(x, y)
    scores = cross_val_score(
        model,
        np.array(_sample(x, ~nan_mask)).reshape(-1, 1),
        np.array(_sample(y, ~nan_mask)).ravel(),  # 1-D y avoids sklearn's column-vector warning
        cv=crossvals,
        scoring=make_scorer(metric, **metric_params)
    )
    return scores.mean()

def _calc_ppscore(score: Union[int, float, np.ndarray, pd.Series],
                  naive_score: Union[int, float, np.ndarray, pd.Series],
                  perfect_score: Union[int, float, np.ndarray, pd.Series]) -> Union[float, np.ndarray, pd.Series]:
    """
    Calculates the predictive power score (pps) for given scores, naive scores, and perfect scores.

    Args:
        score: The actual score(s).
        naive_score: The naive score(s).
        perfect_score: The perfect score(s).
    Returns:
        The predictive power score(s).
    """
    # Remember the index before score is rebound to an ndarray, so a Series input returns a Series
    index = score.index if isinstance(score, pd.Series) else None
    score = np.asarray(score)
    naive_score = np.asarray(naive_score)
    perfect_score = np.asarray(perfect_score)
    pps = (score - naive_score) / (perfect_score - naive_score)
    pps = np.where(pps <= 0, 0, pps)
    if index is not None:
        return pd.Series(pps, index=index)
    return pps

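A quick sanity check of the formula, with made-up error-style scores (lower is better), evaluated with the module's own names in scope:

    # naive model scores 1.0, a perfect model 0.0, the fitted model 0.2:
    # pps = (0.2 - 1.0) / (0.0 - 1.0) = 0.8, i.e. 80% of the naive-to-perfect gap is closed
    print(_calc_ppscore(0.2, 1.0, 0.0))  # 0.8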
def score(x: Any, y: Any, metric: Callable, model: Optional[object] = None, sample: Optional[int] = None, shuffle: bool = True, crossvals: int = 5) -> pd.DataFrame:
    """
    Calculates the predictive power score (pps) of x against y using a given model. The score is baselined between 0 and 1 depending on the kind of problem.

    Args:
        x: shape[n, 1] or [n,]; Predictors (features).
        y: shape[n, 1] or [n,]; True values (targets).
        metric: Metric to use to score the prediction. Must take in y_true, y_pred, both array_likes.
        model (optional) [default=DecisionTreeRegressor]: Model to use. Must be sklearn-compatible and either a regressor or a classifier.
        sample (optional) [default=None]: Number of rows to sample from x and y to make calculations faster. None means no sampling.
        shuffle (optional) [default=True]: Whether to shuffle the rows of x and y.
        crossvals (optional) [default=5]: Number of cross-validations to make when fitting and evaluating the model.
    Returns:
        The pps of x against y. The pps is (score - baseline) / (perfect_score - baseline), where the baseline is a naive model predicting the median for regression or the most common class for classification.
    Raises:
        Nothing
    """
    _verify_tabular_data_shape(x, y, is_column=True)
    x, y = _to_series(x), _to_series(y)
    x, y = sample_rows(x, y, sample=sample, shuffle=shuffle)
    if model is None:
        model = DecisionTreeRegressor()
    res_df = _prepare_df(x, y, metric, model)
    res_df["model_score"] = _score(x, y, metric, model, crossvals=crossvals)
    res_df["ppscore"] = _calc_ppscore(res_df["model_score"], res_df["naive_score"], res_df["perfect_score"])
    return res_df

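A minimal usage sketch (the data is made up; scikit-learn's mean_absolute_error serves as the metric):

    import numpy as np
    import pandas as pd
    from sklearn.metrics import mean_absolute_error
    from microwave.data_analysis.ppscore import score

    rng = np.random.default_rng(0)
    x = pd.Series(rng.normal(size=500), name="feature")
    y = 2 * x + rng.normal(scale=0.1, size=500)  # y is almost a deterministic function of x
    print(score(x, y, mean_absolute_error)["ppscore"])  # expected to be close to 1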
def predictors(df: pd.DataFrame, y: Any, metric: Callable, model=None, crossvals: int = 5, njobs: int = 1, sample: Optional[int] = 5000, sort: bool = True, shuffle: bool = True):
    """
    Calculates the predictive power score (pps) of every column in df against y using a given model. Scores are baselined between 0 and 1 depending on the kind of problem.

    Args:
        df: shape[n, m]; Predictors (features).
        y: shape[n, 1]; True values (targets).
        metric: Metric to use to score the prediction. Must take in y_true, y_pred.
        model (optional) [default=DecisionTreeRegressor]: Model to use. Must be sklearn-compatible and either a regressor or classifier.
        crossvals (optional) [default=5]: Number of cross-validations to perform.
        njobs (optional) [default=1]: Number of jobs to run in parallel.
        sample (optional) [default=5000]: Number of rows to sample from df and y for faster calculations. None means no sampling.
        sort (optional) [default=True]: Whether to sort the results by ppscore in descending order.
        shuffle (optional) [default=True]: Whether to shuffle the rows of df and y before processing.
    Returns:
        A DataFrame containing the pps of each column in df against y. The pps is (score - baseline) / (perfect_score - baseline), where the baseline is a naive model predicting the median for regression or the most common class for classification.
    Raises:
        Nothing
    """
    _verify_tabular_data_shape(df, is_column=False)
    _verify_tabular_data_shape(y, is_column=True)
    y = _to_series(y)
    df, y = sample_rows(df, y, sample=sample, shuffle=shuffle)
    if model is None:
        model = DecisionTreeRegressor()
    res_df = _prepare_df(df, y, metric, model)
    # Forward crossvals so the per-column scores honour the requested number of folds
    res_df["model_score"] = Parallel(n_jobs=njobs)(delayed(_score)(df.iloc[:, [i]], y, metric, model, crossvals=crossvals) for i in range(df.shape[1]))
    res_df["ppscore"] = _calc_ppscore(res_df["model_score"], res_df["naive_score"], res_df["perfect_score"])
    res_df.insert(0, "x", df.columns)
    if sort:
        res_df = res_df.sort_values("ppscore", ascending=False)
    return res_df.reset_index(drop=True)

def _predictors_of_col(df: pd.DataFrame, col: str, metric: Callable, model, **kwargs) -> pd.DataFrame:
    """
    Calculates the predictive power score (pps) of all columns in df against the specified column using a given model and metric.

    Args:
        df: Input DataFrame containing predictors and the target column.
        col: Target column name.
        metric: Metric to use to score the prediction. Must take in y_true, y_pred.
        model: Model to use. Must be sklearn-compatible and either a regressor or classifier.
    Returns:
        A DataFrame containing the pps of each predictor in df against the specified target column. The pps is (score - baseline) / (perfect_score - baseline), where the baseline is a naive model predicting the median for regression or the most common class for classification.
    Raises:
        Nothing
    """
    y = df[[col]]
    df_pred = df.loc[:, df.columns != col]
    res = predictors(df_pred, y, metric, model=model, **kwargs)
    res.insert(1, "y", col)
    return res

def matrix(df: pd.DataFrame, metric: Callable, model=None, crossvals: int = 5, njobs: int = 1, sample: Optional[int] = 5000, sort: bool = True, shuffle: bool = True) -> pd.DataFrame:
    """
    Calculates the predictive power score (pps) of every column in df against every other column in df using a given model.
    Scores are baselined between 0 and 1 depending on the nature of the problem.

    Args:
        df: shape[n, m]
        metric: Metric to use to score the prediction. Must take in y_true, y_pred.
        model (optional) [default=DecisionTreeRegressor]: Model to use. Must be sklearn-compatible and either a regressor or classifier.
        crossvals (optional) [default=5]: Number of cross-validations to perform.
        njobs (optional) [default=1]: Number of jobs to run in parallel.
        sample (optional) [default=5000]: Number of rows to sample from df for faster calculations. None means no sampling.
        sort (optional) [default=True]: Whether to sort the results by ppscore in descending order.
        shuffle (optional) [default=True]: Whether to shuffle the rows of df before processing.
    Returns:
        A DataFrame containing the pps of each predictor in df against every target column.
        The pps is (score - baseline) / (perfect_score - baseline), where the baseline is a naive model predicting the median for regression or the most common class for classification.
    Raises:
        Nothing
    """
    _verify_tabular_data_shape(df, is_column=False)
    df = sample_rows(df, sample=sample, shuffle=shuffle)[0]
    if model is None:
        model = DecisionTreeRegressor()
    res = Parallel(n_jobs=njobs)(delayed(_predictors_of_col)(df, col, metric, model, crossvals=crossvals, sample=None, sort=False, shuffle=False) for col in df.columns)
    res_df = pd.concat(res, axis=0)
    if sort:
        res_df = res_df.sort_values("ppscore", ascending=False)
    return res_df.reset_index(drop=True)

def mutual_predictors(matrix: pd.DataFrame, threshold: float = 0.9):
    """
    Returns the list of features that are the most predicted by others, above a given threshold. Intended for dimensionality reduction.
    For every feature, every pps above the threshold is summed, and the most predicted feature is identified and ignored in subsequent iterations.
    Continues until no considered pps is over the threshold.

    Args:
        matrix: A DataFrame containing the pps of each feature against every other.
        threshold (optional) [default=0.9]
    Returns:
        A list of features that are the most predicted by others
    Raises:
        Nothing
    """
    features = matrix["x"].unique()
    cols_predict_count = dict(zip(features, [0] * len(features)))
    pred_mut = []
    while True:
        # Recompute the totals from scratch on every pass, so the loop terminates
        # as soon as no remaining pps exceeds the threshold
        cols_predict_count = dict.fromkeys(cols_predict_count, 0)
        for _, predict in matrix.iterrows():
            if predict.y in cols_predict_count and predict.ppscore > threshold:
                cols_predict_count[predict.y] += predict.ppscore
        if sum(cols_predict_count.values()) == 0:
            break
        best_predictor = max(cols_predict_count, key=cols_predict_count.get)
        pred_mut.append(best_predictor)
        del cols_predict_count[best_predictor]
        matrix = matrix[matrix["x"] != best_predictor]
        matrix = matrix[matrix["y"] != best_predictor]
    return pred_mut

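A sketch of the intended matrix-plus-pruning workflow on an all-numeric frame (the data and threshold are made up; mean_absolute_error is assumed as the metric):

    import numpy as np
    import pandas as pd
    from sklearn.metrics import mean_absolute_error
    from microwave.data_analysis.ppscore import matrix, mutual_predictors

    rng = np.random.default_rng(0)
    a = rng.normal(size=300)
    df = pd.DataFrame({"a": a, "b": a + rng.normal(scale=0.05, size=300), "c": rng.normal(size=300)})
    pps = matrix(df, mean_absolute_error)         # one row per (x, y) pair
    print(mutual_predictors(pps, threshold=0.8))  # expect one of the near-duplicates "a"/"b" flagged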
microwave/data_analysis/univariate/__init__.py (Normal file, +1)
@@ -0,0 +1 @@
from .aggregates import *
microwave/data_analysis/univariate/aggregates.py (Normal file, +236)
@@ -0,0 +1,236 @@
from typing import Any, Union, Optional, Callable
from joblib import Parallel, delayed
import numpy as np
import pandas as pd
import scipy.stats as stats

def inter_percentile_mean(x: np.ndarray, N1: Optional[float] = 0.1, N2: Optional[float] = 0.9) -> float:
    """
    Calculates the mean within a certain percentile range

    Args:
        x: The considered ndarray.
        N1: Lower percentile (between 0 and 1)
        N2: Upper percentile (between 0 and 1)
    Returns:
        The IP-mean
    Raises:
        Nothing
    """
    # N1/N2 are fractions of 1, while np.percentile expects values in [0, 100]
    p1, p2 = np.percentile(x[~np.isnan(x)], [N1 * 100, N2 * 100])
    return np.mean(x[(x >= p1) & (x <= p2)])

def median_absolute_deviation(x: np.ndarray) -> float:
    """
    Calculates the median of the absolute deviations from the median

    Args:
        x: The considered ndarray.
    Returns:
        The MAD
    Raises:
        Nothing
    """
    return np.nanmedian(np.abs(x - np.nanmedian(x)))

def inter_percentile_range(x: np.ndarray, N1: Optional[float] = 0.1, N2: Optional[float] = 0.9) -> float:
    """
    Calculates the range within a certain percentile range

    Args:
        x: The considered ndarray.
        N1: Lower percentile (between 0 and 1)
        N2: Upper percentile (between 0 and 1)
    Returns:
        The IP-range
    Raises:
        Nothing
    """
    # N1/N2 are fractions of 1, while np.percentile expects values in [0, 100]
    values = x[~np.isnan(x)]
    return np.percentile(values, N2 * 100) - np.percentile(values, N1 * 100)

def mode(x: np.ndarray) -> Any:
    """
    Calculates the mode of numeric and categorical variables

    Args:
        x: The considered ndarray.
    Returns:
        The mode
    Raises:
        Nothing
    """
    if pd.api.types.is_numeric_dtype(x):
        return stats.mode(x, nan_policy='omit').mode
    else:
        return pd.Series(x).mode().iat[0]

def geothmetic_meandian(x: np.ndarray, iter: Optional[int] = 100) -> float:
    """
    https://xkcd.com/2435/

    Args:
        x: The considered ndarray.
        iter: Number of recursive iterations before returning the (by then converged) value.
    Returns:
        The geothmetic meandian
    Raises:
        Nothing
    """
    if iter == 0:
        return x[0]
    return geothmetic_meandian(np.array([AGGFUNCCODES["mean"](x), AGGFUNCCODES["gmean"](x), AGGFUNCCODES["median"](x)]), iter=iter - 1)

def get_n_outliers(x: np.ndarray, n_sig: Union[float, int] = 3) -> int:
    """
    Calculates the number of outliers with the z-score method.

    Args:
        x: The considered ndarray.
        n_sig: Number of standard deviations beyond which a value is considered an outlier
    Returns:
        The number of outliers
    Raises:
        Nothing
    """
    return np.sum(np.abs(stats.zscore(x, nan_policy='omit')) > n_sig)

AGGFUNCCODES = {
    # Counts
    "size": len,
    "non-null": lambda x: len(x) - pd.isna(x).sum(),
    "nunique": lambda x: pd.Series(x).nunique(dropna=True),

    # Basic
    "sum": lambda x: np.nansum(x) if np.issubdtype((x.to_numpy() if isinstance(x, pd.Series) else x).dtype, np.number) else np.nan,
    "min": np.nanmin,
    "max": np.nanmax,
    "first": lambda x: x[~pd.isna(x)][0],
    "last": lambda x: x[~pd.isna(x)][-1],

    # Centricity
    "mean": np.nanmean,
    "median": np.nanmedian,
    "mode": mode,
    "gmean": lambda x: stats.gmean(x[~pd.isna(x)]),
    "hmean": lambda x: stats.hmean(x[~pd.isna(x)]),
    "Pmean": inter_percentile_mean,
    "geothmetic meandian": geothmetic_meandian,

    # Dispersion
    "variance": np.nanvar,
    "std": np.nanstd,
    "mad": median_absolute_deviation,
    "skewness": lambda x: stats.skew(x, nan_policy='omit'),
    # note: fisher=False yields Pearson kurtosis (3 for a normal), not excess kurtosis
    "excesskurtosis": lambda x: stats.kurtosis(x, fisher=False, nan_policy='omit'),
    "range": lambda x: np.nanmax(x) - np.nanmin(x),
    "Prange": inter_percentile_range,
    "n_outliers": get_n_outliers,

    # Percentiles
    "P75": lambda x: np.percentile(x[~pd.isna(x)], 75),
    "P25": lambda x: np.percentile(x[~pd.isna(x)], 25),
    "P10": lambda x: np.percentile(x[~pd.isna(x)], 10),
    "P90": lambda x: np.percentile(x[~pd.isna(x)], 90),
    "PN": lambda x, N: np.percentile(x[~pd.isna(x)], N),

    # Distribution
    "skewtest": lambda x, **kwargs: stats.skewtest(x, nan_policy='omit', **kwargs),
    "kurtosistest": lambda x, **kwargs: stats.kurtosistest(x, nan_policy='omit', **kwargs),
    "normaltest": lambda x, **kwargs: stats.normaltest(x, nan_policy='omit', **kwargs),
    "jarque_bera": lambda x, **kwargs: stats.jarque_bera(x, nan_policy='omit', **kwargs),
    "shapiro": lambda x, **kwargs: stats.shapiro(x, nan_policy='omit', **kwargs),
    "anderson": lambda x, **kwargs: stats.anderson(x, **kwargs),

    # Other
    "energy": lambda x: np.nansum(x**2),
    "rms": lambda x: np.sqrt(np.nanmean(x**2)),
    "entropy": lambda x: stats.entropy(pd.Series(x).value_counts(normalize=True), base=2),
    "autocorrelation": lambda x, lag=1: pd.Series(x).autocorr(lag=lag)
}

def execute_agg_func(x: Any, agg: Union[Callable, str, dict]) -> Any:
    """
    Executes a given aggregation function on the given data.
    If the function returns multiple values, returns a dict with a name for each value, defaulting to a, b, c, ...
    Accepts a callable, a str corresponding to an AGGFUNCCODES key, or a dict with either a callable or an AGGFUNCCODES key at key 'func'.

    Args:
        x: The data to execute the function on.
        agg: The function to execute.
    Returns:
        Either the raw return value if it is a single value, or a dict with named return values if the function returned a tuple. The names can be overridden by passing a dict with key 'ret_names'.
    Raises:
        Nothing
    """
    ret_names = None
    if callable(agg):
        ret = agg(x)
    elif isinstance(agg, str):
        ret = AGGFUNCCODES[agg](x)
    elif isinstance(agg, dict):
        kwargs = agg.get('kwargs', {})
        if callable(agg['func']):
            ret = agg['func'](x, **kwargs)
        elif isinstance(agg['func'], str):
            ret = AGGFUNCCODES[agg['func']](x, **kwargs)
        if 'ret_names' in agg:
            ret_names = agg['ret_names']
    if isinstance(ret, tuple):
        if ret_names is None:
            ret_names = [chr(c) for c in range(ord('a'), ord('a') + len(ret))]
        return dict(zip(ret_names, ret))
    else:
        return ret

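For instance (a made-up spec; scipy's normaltest returns a (statistic, pvalue) tuple, which the dict form can name):

    import numpy as np

    x = np.random.default_rng(0).normal(size=1000)
    spec = {"func": "normaltest", "ret_names": ["stat", "pvalue"]}
    print(execute_agg_func(x, spec))      # e.g. {'stat': ..., 'pvalue': ...}
    print(execute_agg_func(x, "median"))  # a single value passes through unchanged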
def calculate_aggregates(x: Any, aggs: list[Union[Callable, str, dict[str, Union[str, Callable]]]]) -> dict[str, Any]:
    """
    Executes a given list of aggregation functions on the given data.

    Args:
        x: The data to execute the functions on.
        aggs: The functions to execute.
    Returns:
        A dict of named values. The names can be specified for each agg function by providing 'name' and 'ret_names' keys in its dict.
    Raises:
        Nothing
    """
    results = {}
    for i, func in enumerate(aggs):
        funcname = f"func_{i}"
        try:
            ret = execute_agg_func(x, func)
        except Exception as e:
            print(f"agg_{i}", func, e)
            ret = np.nan
        if isinstance(func, dict):
            if 'name' in func:
                funcname = func['name']
            elif isinstance(func['func'], str):
                funcname = func['func']
        elif isinstance(func, str):
            funcname = func if func not in results.keys() else f"{func}_{i}"
        if isinstance(ret, dict):
            # Prefix each named return value with the aggregation's name
            results.update({f"{funcname}_{k}": v for k, v in ret.items()})
        else:
            results[funcname] = ret
    return results

def build_univariate_statistics(df: pd.DataFrame, agg: Optional[Union[str, list[Union[str, dict[str, dict]]]]] = "all", n_jobs: int = 1) -> pd.DataFrame:
    """
    Calculates the specified univariate statistics for each column of the DataFrame.

    Args:
        df: The input DataFrame.
        agg: List of aggregation functions to apply, or "all" for every AGGFUNCCODES entry.
            Each element can be a function name (str) or a dict with the function name as the key and args as another dict.
        n_jobs: Number of parallel processes to open. -1 means as many as possible.
    Returns:
        pd.DataFrame: DataFrame with one row per column of df and one column per computed aggregate.
    Raises:
        Nothing
    """
    if agg == "all":
        agg = list(AGGFUNCCODES.keys())
    results = Parallel(n_jobs=n_jobs)(delayed(calculate_aggregates)(df[col].values, agg) for col in df.columns)
    return pd.DataFrame(results, index=df.columns)

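A short sketch of the expected call pattern (column name invented; "PN" takes its percentile through the dict form's kwargs):

    import numpy as np
    import pandas as pd
    from microwave.data_analysis.univariate import build_univariate_statistics

    df = pd.DataFrame({"height": np.random.default_rng(0).normal(170, 10, size=500)})
    aggs = ["mean", "std", "n_outliers",
            {"func": "PN", "kwargs": {"N": 5}, "name": "P05"}]
    print(build_univariate_statistics(df, agg=aggs))  # one row ("height"); columns mean, std, n_outliers, P05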
microwave/data_processing/__init__.py (Normal file, +2)
@@ -0,0 +1,2 @@
from .df_preprocessing import *
from .dfTransformer import dfTransformer
microwave/data_processing/dfTransformer.py (Normal file, +75)
@@ -0,0 +1,75 @@
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from typing import Optional


class dfTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.transforms = []

    def add_transform(self, column_name: str, transformer: TransformerMixin, result_columns: Optional[list[str]] = None):
        """
        Adds a transform specific to a column, with optional result column names.

        Args:
            column_name (str): The name of the column to transform. For a nested dfTransformer, use None.
            transformer (TransformerMixin): The transformation object to apply to the column. Must have a .transform method.
            result_columns (list of str, optional): List of names for the resulting columns. Default is None.
        """
        if not hasattr(transformer, 'transform'):
            raise ValueError("The transformer must have a 'transform' method.")
        self.transforms.append((column_name, transformer, result_columns))

    def fit(self, X: pd.DataFrame, y=None):
        """
        Fit method to conform with TransformerMixin. Fits transformers one by one on the specified columns.

        Args:
            X (pd.DataFrame): The DataFrame to fit.
            y: Ignored.

        Returns:
            self: Fitted transformer.
        """
        for column_name, transformer, _ in self.transforms:
            if isinstance(transformer, dfTransformer):
                transformer.fit(X, y)
            elif column_name in X:
                transformer.fit(X[[column_name]], y)  # Fit the transformer on the specific column
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Applies all stored transformations to the DataFrame, drops the original columns,
        and returns the transformed DataFrame.

        Args:
            X (pd.DataFrame): The DataFrame to transform.

        Returns:
            pd.DataFrame: The transformed DataFrame.
        """
        X_transformed = X.copy()

        for column_name, transformer, result_columns in self.transforms:
            if isinstance(transformer, dfTransformer):
                # Nested transformers handle the whole frame themselves, mirroring fit()
                X_transformed = transformer.transform(X_transformed)
            elif column_name in X_transformed:
                transformed_data = transformer.transform(X_transformed[[column_name]])

                # Check if the transformed data is a DataFrame; if not, convert it
                if isinstance(transformed_data, pd.DataFrame):
                    transformed_cols = transformed_data
                else:
                    transformed_cols = pd.DataFrame(transformed_data, index=X_transformed.index)
                if result_columns:
                    transformed_cols.columns = result_columns
                else:
                    transformed_cols.columns = [
                        f"{column_name}_transformed_{i}" for i in range(transformed_data.shape[1])
                    ]

                X_transformed.drop(columns=[column_name], inplace=True)
                X_transformed = pd.concat([X_transformed, transformed_cols], axis=1)

        return X_transformed

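A minimal usage sketch (a hypothetical two-column frame; StandardScaler from scikit-learn):

    import pandas as pd
    from sklearn.preprocessing import StandardScaler
    from microwave.data_processing import dfTransformer

    df = pd.DataFrame({"age": [20.0, 30.0, 40.0], "city": ["a", "b", "a"]})
    t = dfTransformer()
    t.add_transform("age", StandardScaler(), result_columns=["age_scaled"])
    print(t.fit(df).transform(df))  # "age" replaced by "age_scaled"; "city" left untouched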
microwave/data_processing/df_preprocessing.py (Normal file, +75)
@@ -0,0 +1,75 @@
import pandas as pd
from typing import Optional
from .dfTransformer import dfTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder, FunctionTransformer


def _get_encoder(encoding):
    if encoding == 'onehot':
        return OneHotEncoder(sparse_output=False)
    elif encoding == 'label':
        return LabelEncoder()
    elif encoding == 'ordinal':
        return OrdinalEncoder()
    else:
        raise ValueError("Unsupported encoding type.")


def _get_encoded_col_names(encoder, prefix="", suffix="", ret_shape: Optional[list] = None):
    # Copy/normalize ret_shape instead of mutating a shared mutable default
    ret_shape = list(ret_shape) if ret_shape else [1]
    if len(ret_shape) == 1:
        ret_shape.append(1)
    if hasattr(encoder, "categories_"):
        colnames = ["_".join([str(x) for x in (prefix, suffix, cat) if len(str(x)) > 0]) for cat in encoder.categories_[0]]
    elif ret_shape[1] == 1:
        colnames = ["_".join([str(x) for x in (prefix, suffix) if len(str(x)) > 0])]
    elif ret_shape[1] > 1:
        colnames = [
            "_".join([str(x) for x in (prefix, suffix, i) if len(str(x)) > 0]) for i in range(ret_shape[1])
        ]
    return colnames


def df_to_numeric(df, encoding='onehot'):
    """
    Processes a DataFrame by converting numeric columns to float and applying categorical encoding to non-numeric columns.

    Args:
        df (pd.DataFrame): The input DataFrame to process.
        encoding (str or TransformerMixin): The encoding method to apply to categorical columns. Can be a string naming one of the predefined sklearn options or a callable.

    Returns:
        tuple: A tuple containing the transformed DataFrame and the fitted dfTransformer object.
    """
    transformer = dfTransformer()
    X_transformed = df.copy()

    if not (isinstance(encoding, str) or callable(encoding)):
        raise ValueError("Encoding must be either a string or a callable transformer.")

    for column in X_transformed.columns:
        if not pd.api.types.is_numeric_dtype(df[column]):
            # A fresh encoder per column, so each stored transformer stays fitted to its own column
            encoder = _get_encoder(encoding) if isinstance(encoding, str) else encoding
            transformed_data = encoder.fit_transform(X_transformed[[column]])
            result_columns = _get_encoded_col_names(encoder, prefix=column, suffix="encoded", ret_shape=list(transformed_data.shape))
            if isinstance(transformed_data, pd.DataFrame):
                transformed_cols = transformed_data
            else:
                transformed_cols = pd.DataFrame(transformed_data, index=X_transformed.index)
            transformed_cols.columns = result_columns

            transformer.add_transform(column, encoder, result_columns=result_columns)

            X_transformed.drop(columns=[column], inplace=True)
            X_transformed = pd.concat([X_transformed, transformed_cols], axis=1)

    encoder = FunctionTransformer(lambda x: x.astype(float), validate=False)
    for column in X_transformed.columns:
        transformed_data = encoder.fit_transform(X_transformed[[column]])
        transformer.add_transform(column, encoder, result_columns=[column])  # the column keeps its own name

        X_transformed.drop(columns=[column], inplace=True)
        X_transformed = pd.concat([X_transformed, transformed_data], axis=1)

    return X_transformed, transformer

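The returned transformer is meant to replay the same encoding on unseen rows; a sketch with a made-up frame:

    import pandas as pd
    from microwave.data_processing import df_to_numeric

    train = pd.DataFrame({"size": [1, 2, 3], "color": ["red", "blue", "red"]})
    train_num, trans = df_to_numeric(train, encoding="onehot")
    new = pd.DataFrame({"size": [5], "color": ["blue"]})
    print(trans.transform(new))  # same one-hot columns and float casts as train_num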
microwave/math/__init__.py (Normal file, +39)
@@ -0,0 +1,39 @@
import math
from typing import Union
import numpy as np


def gaussian(x: Union[np.ndarray, float], mu: float, sig: float) -> Union[np.ndarray, float]:
    """
    Calculates the height of the specified gaussian at point x

    Args:
        x: point(s) at which to calculate the height
        mu: The gaussian's mean
        sig: The gaussian's standard deviation
    Returns:
        The height(s), as a single number, or an ndarray if x is an ndarray
    Raises:
        Nothing
    """
    return np.exp(-(x - mu)**2 / (2 * sig**2)) / (sig * np.sqrt(2 * np.pi))


def gauss_integral(mu: float, sig: float, a: float = -np.inf, b: float = np.inf) -> float:
    """
    Calculates the definite gaussian integral between a and b. If not given, a and b default to -inf and inf respectively.

    Args:
        mu: The gaussian's mean
        sig: The gaussian's standard deviation
        a: lower bound; defaults to -inf
        b: upper bound; defaults to inf
    Returns:
        The definite integral
    Raises:
        Nothing
    """
    if sig == 0:
        # A zero-width gaussian is a point mass at mu
        if a <= mu < b:
            return 1
        else:
            return 0
    return (math.erf((b - mu) / (sig * np.sqrt(2))) - math.erf((a - mu) / (sig * np.sqrt(2)))) / 2

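A quick numeric cross-check of the two functions against scipy (assumed available), integrating the density over a finite window:

    import numpy as np
    from scipy.integrate import quad
    from microwave.math import gaussian, gauss_integral

    numeric, _ = quad(gaussian, -1, 2, args=(0.0, 1.0))
    print(numeric, gauss_integral(0.0, 1.0, a=-1, b=2))  # both ~0.8186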
microwave/utils/__init__.py (Normal file, +9)
@@ -0,0 +1,9 @@
from .arrayutils import _get_shape
from .arrayutils import _verify_tabular_data_shape
from .arrayutils import _verify_same_number_of_rows
from .arrayutils import _sample
from .arrayutils import sample_rows
from .arrayutils import nan_rows_mask
from .arrayutils import _to_series
from .arrayutils import _is_convertible_to_numpy_array
from .arrayutils import split_rows
microwave/utils/arrayutils.py (Normal file, +224)
@@ -0,0 +1,224 @@
from typing import Any, Union, Optional
import numpy as np
import pandas as pd

def _get_shape(x: Any) -> tuple:
    """
    Returns the shape of a given object

    Args:
        x
    Returns:
        The shape of x, or None if x has no shape and cannot be converted to a numpy array
    Raises:
        Nothing
    """
    if hasattr(x, "shape"):
        return x.shape
    elif _is_convertible_to_numpy_array(x):
        return np.array(x).shape
    else:
        return None

def _verify_tabular_data_shape(*args: Any, is_column: bool = False):
    """
    Verifies that the shapes of the given objects are coherent for tabular data.

    Args:
        *args: shape[n, m] or [n,]; Tabular data.
        is_column (optional) [default=False]: Set to True to raise an error if an object contains multiple columns.
    Returns:
        Nothing
    Raises:
        ValueError: If one or more of the given objects is not coherent with tabular data.
        ValueError: If is_column is True and one or more of the given objects has multiple columns.
    """
    for arg in args:
        shape = _get_shape(arg)
        if shape is None:
            raise ValueError(f"Input data has no shape: {arg}.")
        if len(shape) < 1 or len(shape) > 2:
            raise ValueError(f"Input data must be a tabular object. Has shape {shape}.")
        if is_column and len(shape) == 2 and 1 not in shape:
            raise ValueError(f"Input data must be a single column. Has shape {shape}.")

def _verify_same_number_of_rows(*args):
    """
    Verifies that the given objects have the same number of rows.

    Args:
        *args: shape[n, m] or [n,]
    Returns:
        Nothing
    Raises:
        ValueError: If one or more of the given objects has no shape.
        ValueError: If two objects have different numbers of rows.
    """
    n_rows = _get_shape(args[0])[0]
    for arg in args[1:]:
        shape = _get_shape(arg)
        if shape is None:  # check before indexing, so shapeless inputs raise the intended error
            raise ValueError(f"Input data has no shape: {arg}.")
        elem_rows = shape[0]
        if n_rows != elem_rows:
            raise ValueError(f"Input objects must have the same number of rows: {n_rows}, {elem_rows}.")

def _sample(x: Union[np.ndarray, pd.DataFrame, pd.Series], ind_list: Any) -> Union[np.ndarray, pd.DataFrame]:
    """
    Samples the rows of a numpy array, pandas DataFrame or Series based on a list of indices.

    Args:
        x: The array, DataFrame or Series to be sampled.
        ind_list: The list or array of indices (or a boolean mask) defining the rows to keep and their order.
    Returns:
        The sampled array, DataFrame or Series.
    Raises:
        TypeError: If the input is neither a numpy array, a pandas DataFrame, nor a pandas Series
    """
    if isinstance(x, np.ndarray):
        return x[ind_list]
    elif isinstance(x, (pd.DataFrame, pd.Series)):
        return x.iloc[ind_list]
    else:
        raise TypeError("Input must be a numpy array, pandas DataFrame, or pandas Series")

def sample_rows(*args: Any, sample: Optional[int] = None, shuffle: bool = True) -> tuple[Any]:
    """
    Samples rows of the provided objects in the same way and optionally shuffles them.
    Tries to minimize the number of rows containing NaN: complete rows are drawn first, NaN rows only if needed.

    Args:
        *args: Input tabular data objects.
        sample: Number of rows to draw from each object. If None, no sampling is done.
        shuffle: If True, the returned rows are in random order; if False, they keep their original order.

    Returns:
        Tuple of shuffled and/or sampled objects.

    Raises:
        ValueError: If the input objects don't have the same number of rows.
    """
    _verify_same_number_of_rows(*args)
    n_rows = _get_shape(args[0])[0]
    nan_mask = nan_rows_mask(*args)
    full_indices = np.where(~nan_mask)[0]
    n_full_rows = len(full_indices)
    if sample is not None and sample < n_full_rows:
        # Enough complete rows: draw the whole sample from them
        indices = np.random.choice(full_indices, size=sample, replace=False)
        if not shuffle:
            indices.sort()
    elif sample is not None and sample < n_rows:
        # Not enough complete rows: take them all and top up with NaN rows
        indices = np.concatenate((
            np.random.choice(full_indices, size=n_full_rows, replace=False),
            np.random.choice(np.where(nan_mask)[0], size=sample - n_full_rows, replace=False)
        ))
        if not shuffle:
            indices.sort()
    else:
        indices = np.arange(n_rows)
        if shuffle:
            indices = np.random.choice(indices, size=n_rows, replace=False)

    results = tuple(_sample(arg, indices) for arg in args)
    return results

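A sketch of the NaN-avoiding sampling on made-up arrays:

    import numpy as np
    from microwave.utils import sample_rows

    x = np.array([1.0, np.nan, 3.0, 4.0, 5.0])
    y = np.array([10.0, 20.0, 30.0, np.nan, 50.0])
    xs, ys = sample_rows(x, y, sample=3, shuffle=False)
    print(xs, ys)  # [1. 3. 5.] [10. 30. 50.]: the only rows complete in both arrays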
def nan_rows_mask(*args: Any) -> np.ndarray:
    """
    Given numpy arrays, DataFrames or Series with the same number of rows, returns a boolean mask that is True for every row where at least one of the objects has a NaN value.

    Args:
        *args: Numpy arrays, DataFrames or Series with the same number of rows.
    Returns:
        Boolean mask indicating rows with at least one NaN.
    Raises:
        Nothing
    """
    _verify_same_number_of_rows(*args)
    n_rows = _get_shape(args[0])[0]
    # Start from an all-False mask and OR in each object's NaN rows
    mask = np.zeros(n_rows, dtype=bool)
    for data in args:
        if isinstance(data, np.ndarray):
            if data.ndim == 1:
                data = data.reshape(-1, 1)
            mask |= np.isnan(data).any(axis=1)
        elif isinstance(data, pd.DataFrame):
            mask |= data.isna().to_numpy().any(axis=1)
        elif isinstance(data, pd.Series):
            mask |= data.isna().to_numpy()
        else:
            data = np.array(data)
            if data.ndim == 1:
                data = data.reshape(-1, 1)
            mask |= np.isnan(data).any(axis=1)
    return mask

def _to_series(data: Union[pd.Series, pd.DataFrame, np.ndarray]):
    """
    Converts a single-column object to a pandas Series.

    Args:
        data: A pd.Series, a single-column pd.DataFrame, or a 1-dimensional or single-column 2-dimensional np.ndarray.
    Returns:
        The data as a pd.Series.
    Raises:
        ValueError: If the input has more than one column.
    """
    if isinstance(data, pd.Series):
        return data
    elif isinstance(data, pd.DataFrame):
        if data.shape[1] != 1:
            raise ValueError("DataFrame must have exactly one column to convert to Series")
        return data.iloc[:, 0]
    elif isinstance(data, np.ndarray):
        if data.ndim == 1:
            return pd.Series(data)
        elif data.ndim == 2 and data.shape[1] == 1:
            return pd.Series(data.ravel())
        else:
            raise ValueError("ndarray must be 1-dimensional or a 2-dimensional single column array")

def _is_convertible_to_numpy_array(obj: Any) -> bool:
    """
    Verifies that a given object is convertible to a numpy array without error

    Args:
        obj: object to check
    Returns:
        bool
    Raises:
        Nothing
    """
    if isinstance(obj, (list, tuple, dict, set)):
        return True
    if np.isscalar(obj):
        return True
    if hasattr(obj, '__array__'):
        return True
    return False

def split_rows(data: Union[pd.DataFrame, np.ndarray], bool_array: np.ndarray, drop_index: bool = True) -> list[Union[pd.DataFrame, np.ndarray]]:
    """
    Splits a pandas DataFrame or a numpy array based on a boolean array indicator.

    Args:
        data: The input data to split. Can be a pandas DataFrame or a numpy ndarray.
        bool_array: A 1D boolean array indicating where splits should occur. Must be the same length as `data`.
        drop_index: Whether to reset the index in the resulting DataFrame splits. Default is True.
    Returns:
        A list of the resulting split pd.DataFrames or np.ndarrays.
    Raises:
        ValueError: If the length of `bool_array` does not match the length of `data`.
    """
    if len(bool_array) != len(data):
        raise ValueError("The length of bool_array must match the length of data.")

    # Split boundaries: every True position starts a new chunk
    indices = np.where(bool_array)[0]
    indices = np.concatenate(([0], indices, [len(data)]))

    if isinstance(data, pd.DataFrame):
        return [data.iloc[start:end].reset_index(drop=drop_index)
                for start, end in zip(indices[:-1], indices[1:])
                if start != end]
    elif isinstance(data, np.ndarray):
        return [data[start:end]
                for start, end in zip(indices[:-1], indices[1:])
                if start != end]

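For example (a made-up array; True marks the start of a new chunk):

    import numpy as np
    from microwave.utils import split_rows

    data = np.array([1, 2, 3, 4, 5, 6])
    marks = np.array([False, False, True, False, True, False])
    print(split_rows(data, marks))  # [array([1, 2]), array([3, 4]), array([5, 6])]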
notebooks/demo_ppscore.ipynb (Normal file, +818)
File diff suppressed because one or more lines are too long
notebooks/demo_processing.ipynb (Normal file, +725)
@@ -0,0 +1,725 @@
Jupyter notebook, kernel .venv_microwave (Python 3.13.2), six executed code cells plus one empty cell. The HTML renderings of the DataFrame outputs duplicate the text/plain versions kept below and are omitted.

In [1]:
import pandas as pd
import numpy as np
import string

In [2]:
df = pd.DataFrame(np.random.randint(0,3,size=(15, 4)), columns=list('ABCD'))
df = pd.concat([df, pd.DataFrame(np.random.choice(list(string.ascii_letters)[:4], size=15, replace=True))], axis=1)
df
Out[2]:
    A  B  C  D  0
0   0  2  2  2  b
1   1  2  1  1  c
2   1  0  1  1  d
3   0  0  1  0  d
4   2  1  2  2  c
5   0  0  0  0  d
6   0  2  2  2  a
7   0  2  0  0  a
8   0  1  0  0  a
9   0  2  2  1  b
10  2  2  0  1  c
11  2  1  1  1  a
12  0  1  0  2  a
13  2  1  0  1  c
14  1  0  0  1  b

In [3]:
%cd ..
import microwave.data_processing as dp
(stdout)
c:\Users\Edouard\Documents\Git\microwave

In [4]:
new_df, trans = dp.df_to_numeric(df)
new_df
Out[4]:
    A    B    C    D    0_encoded_a  0_encoded_b  0_encoded_c  0_encoded_d
0   0.0  2.0  2.0  2.0          0.0          1.0          0.0          0.0
1   1.0  2.0  1.0  1.0          0.0          0.0          1.0          0.0
2   1.0  0.0  1.0  1.0          0.0          0.0          0.0          1.0
3   0.0  0.0  1.0  0.0          0.0          0.0          0.0          1.0
4   2.0  1.0  2.0  2.0          0.0          0.0          1.0          0.0
5   0.0  0.0  0.0  0.0          0.0          0.0          0.0          1.0
6   0.0  2.0  2.0  2.0          1.0          0.0          0.0          0.0
7   0.0  2.0  0.0  0.0          1.0          0.0          0.0          0.0
8   0.0  1.0  0.0  0.0          1.0          0.0          0.0          0.0
9   0.0  2.0  2.0  1.0          0.0          1.0          0.0          0.0
10  2.0  2.0  0.0  1.0          0.0          0.0          1.0          0.0
11  2.0  1.0  1.0  1.0          1.0          0.0          0.0          0.0
12  0.0  1.0  0.0  2.0          1.0          0.0          0.0          0.0
13  2.0  1.0  0.0  1.0          0.0          0.0          1.0          0.0
14  1.0  0.0  0.0  1.0          0.0          1.0          0.0          0.0

In [5]:
trans.transforms
Out[5]:
[(0,
  OneHotEncoder(sparse_output=False),
  ['0_encoded_a', '0_encoded_b', '0_encoded_c', '0_encoded_d']),
 ('A',
  FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),
  ['column']),
 ('B',
  FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),
  ['column']),
 ('C',
  FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),
  ['column']),
 ('D',
  FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),
  ['column']),
 ('0_encoded_a',
  FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),
  ['column']),
 ('0_encoded_b',
  FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),
  ['column']),
 ('0_encoded_c',
  FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),
  ['column']),
 ('0_encoded_d',
  FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),
  ['column'])]

In [6]:
new_df, trans = dp.df_to_numeric(df, encoding="label")
new_df
(stderr)
c:\Users\Edouard\Documents\Git\microwave\.venv_microwave\Lib\site-packages\sklearn\preprocessing\_label.py:110: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
Out[6]:
    A    B    C    D    0_encoded
0   0.0  2.0  2.0  2.0        1.0
1   1.0  2.0  1.0  1.0        2.0
2   1.0  0.0  1.0  1.0        3.0
3   0.0  0.0  1.0  0.0        3.0
4   2.0  1.0  2.0  2.0        2.0
5   0.0  0.0  0.0  0.0        3.0
6   0.0  2.0  2.0  2.0        0.0
7   0.0  2.0  0.0  0.0        0.0
8   0.0  1.0  0.0  0.0        0.0
9   0.0  2.0  2.0  1.0        1.0
10  2.0  2.0  0.0  1.0        2.0
11  2.0  1.0  1.0  1.0        0.0
12  0.0  1.0  0.0  2.0        0.0
13  2.0  1.0  0.0  1.0        2.0
14  1.0  0.0  0.0  1.0        1.0

In [ ]: (empty cell)
notebooks/demo_univariate_aggregates.ipynb (Normal file, +1234)
File diff suppressed because it is too large
requirements.txt (Normal file, BIN)
Binary file not shown.
@@ -1 +0,0 @@
import code
@@ -1,67 +0,0 @@
from typing import Callable, Any
from time import sleep, perf_counter, process_time


def time_real(func: Callable, *args, **kwargs) -> tuple[float, Any]:
    """
    Measure the elapsed time for a given function in real time.

    Parameters
    ----------
    func : Callable
        The function to be measured.
    *args : tuple
        Positional arguments to be passed to the function.
    **kwargs : dict
        Keyword arguments to be passed to the function.

    Returns
    -------
    tuple[float, Any]
        A tuple containing the elapsed time in seconds and the return value of the function.

    Examples
    --------
    >>> def my_func(x):
    ...     sleep(1)
    ...     return x ** 2
    >>> time_real(my_func, 5)
    (1, 25)
    """
    start = perf_counter()
    ret = func(*args, **kwargs)
    elapsed = perf_counter() - start
    return elapsed, ret


def time_process(func: Callable, *args, **kwargs) -> tuple[float, Any]:
    """
    Measure the elapsed time for a given function in CPU process time.

    Parameters
    ----------
    func : Callable
        The function to be measured.
    *args : tuple
        Positional arguments to be passed to the function.
    **kwargs : dict
        Keyword arguments to be passed to the function.

    Returns
    -------
    tuple[float, Any]
        A tuple containing the elapsed time in seconds and the return value of the function.

    Examples
    --------
    >>> def my_func(x):
    ...     sleep(1)
    ...     return x ** 2
    >>> time_process(my_func, 5)
    (0, 25)
    """
    start = process_time()
    ret = func(*args, **kwargs)
    elapsed = process_time() - start
    return elapsed, ret