2025-02-27 13:46:56 +01:00
parent 84caa01612
commit 29936cb347
16 changed files with 3720 additions and 0 deletions

microwave/__init__.py Normal file

@@ -0,0 +1,3 @@
from . import data_analysis
from . import utils
from . import math


@@ -0,0 +1,2 @@
from . import ppscore
from . import univariate


@@ -0,0 +1 @@
from .ppscore import *


@@ -0,0 +1,276 @@
from typing import Union, Callable, Optional, Any
from sklearn.metrics import make_scorer
from sklearn.tree import DecisionTreeRegressor
from sklearn.base import is_classifier, is_regressor
from sklearn.model_selection import cross_val_score
from joblib import Parallel, delayed
from ...utils import _verify_tabular_data_shape, sample_rows, nan_rows_mask, _sample, _to_series
import pandas as pd
import numpy as np
def _identify_case(model) -> str:
"""
Identifies if the given model is a classifier or regressor.
Args:
model: Must be sklearn-compatible and either a regressor or classifier.
Returns:
"classification" or "regression"
Raises:
ValueError: If the model cannot be determined to be either a classifier or a regressor
"""
if is_classifier(model):
return "classification"
elif is_regressor(model):
return "regression"
else:
raise ValueError("The model cannot be determined to be either a classifier or a regressor")
def _get_baseline_score(y: Union[np.ndarray, pd.DataFrame], case: str, metric: Callable) -> float:
"""
Calculates the expected metric result of a naive model against y.
Args:
y: shape[n,1]; True values
case: "classification" or "regression"
metric: Metric to use to score the prediction. Must take in y_true, y_pred.
Returns:
A baseline score according to the metric. Will be the score of a model predicting the median value for a regression or the most frequent value for a classification.
Raises:
Nothing
"""
y = _sample(y, ~nan_rows_mask(y))
if case == "regression":
base = np.full_like(y, np.median(y))
elif case == "classification":
values, counts = np.unique(y, return_counts=True)
ind = np.argmax(counts)
base = np.full_like(y, values[ind])
return metric(y, base)
def _prepare_df(x: Any, y: Any, metric: Callable, model) -> pd.DataFrame:
"""
Calculates the base information depending on the model, metric and true values.
Args:
x: shape[n, m]; Predictors (features).
y: shape[n, 1]; True values (targets).
metric: Metric to use to score the prediction. Must take in y_true, y_pred.
model: Model to use. Must be sklearn-compatible and either a regressor or classifier.
Returns:
A DataFrame containing the pps information, including:
- ppscore: Placeholder for the predictive power score, initialized to NaN for each feature.
- case: The kind of problem ("classification" or "regression").
- metric: The name of the metric used.
- perfect_score: The score when the model's predictions are perfect.
- naive_score: The score of a naive model predicting the most frequent value (classification) or the median value (regression).
- model_score: Placeholder for the model score, initialized to NaN.
- model: The class name of the model.
Raises:
Nothing
"""
y = _sample(y, ~nan_rows_mask(y))
case = _identify_case(model)
baseline_score = _get_baseline_score(y, case, metric)
perfect_score = metric(y, y)
return pd.DataFrame({
"ppscore": [np.nan]*(x.shape[1] if len(x.shape) > 1 else 1),
"case": case,
"metric": metric.__name__,
"perfect_score": perfect_score,
"naive_score": baseline_score,
"model_score": np.nan,
"model": type(model).__name__
})
def _score(x: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.DataFrame], metric: Callable, model, metric_params: Optional[dict]={}, crossvals: int=5):
"""
Returns the cross-validated score of the model according to the given metric.
Args:
x: shape[n, m]; Predictors (features).
y: shape[n,1]; True values (targets).
metric: Metric to use to score the prediction. Must take in y_true, y_pred.
model: Model to use. Defaults to DecisionTreeRegressor. Must be sklearn-compatible.
metric_params (optional): Additional parameters to pass to the metric function.
crossvals (optional) [default=5]: Number of cross-validations to perform.
Returns:
The evaluation metric on the prediction of the model
Raises:
Nothing
"""
if model is None:
model = DecisionTreeRegressor()
nan_mask = nan_rows_mask(x, y)
scores = cross_val_score(
model,
np.array(_sample(x, ~nan_mask)).reshape(-1, 1),
np.array(_sample(y, ~nan_mask)).ravel(),  # 1-D targets avoid sklearn's column-vector warning
cv=crossvals,
scoring=make_scorer(metric, **metric_params)
)
return scores.mean()
def _calc_ppscore(score: Union[int, float, np.ndarray, pd.Series],
naive_score: Union[int, float, np.ndarray, pd.Series],
perfect_score: Union[int, float, np.ndarray, pd.Series]) -> Union[float, np.ndarray, pd.Series]:
"""
Calculates the predictive power score (pps) for given scores, naive scores, and perfect scores.
Args:
score: The actual score(s).
naive_score: The naive score(s).
perfect_score: The perfect score(s).
Returns:
The predictive power score(s).
"""
# Remember whether the input was a Series before converting, so a Series comes back out
score_index = score.index if isinstance(score, pd.Series) else None
score = np.asarray(score)
naive_score = np.asarray(naive_score)
perfect_score = np.asarray(perfect_score)
pps = (score - naive_score) / (perfect_score - naive_score)
pps = np.where(pps <= 0, 0, pps)
if score_index is not None:
return pd.Series(pps, index=score_index)
return pps
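As a worked illustration of the normalization: with an accuracy-style metric where the naive baseline scores 0.60, a perfect prediction scores 1.0 and the evaluated model scores 0.85, the resulting pps is (0.85 - 0.60) / (1.0 - 0.60) = 0.625. The numbers below are only an example:

_calc_ppscore(0.85, naive_score=0.60, perfect_score=1.0)  # -> 0.625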
def score(x: Any, y: Any, metric: Callable, model: Optional[object]=None, sample: Optional[int]=None, shuffle: bool=True, crossvals: int=5) -> pd.DataFrame:
"""
Calculates the predictive power score (pps) of x against y using a given model. Score will be baselined between 0 and 1 depending on the kind of problem.
Args:
x: shape[n,1] or [n,]; Predictors (features).
y: shape[n,1] or [n,]; True values (targets).
metric: Metric to use to score the prediction. Must take in y_true, y_pred, both array_likes.
model (optional) [default=DecisionTreeRegressor]: Model to use. Must be sklearn-compatible and either a regressor or classifier.
sample (optional) [default=None]: Number of rows to sample from x and y to make calculations faster. None means no sampling.
shuffle (optional) [default=True]: Whether to shuffle the rows of x and y.
crossvals (optional) [default=5]: Number of cross-validations to perform when fitting and evaluating the model.
Returns:
The pps of x against y. The pps is (score - baseline)/(perfect_score - baseline) where the baseline is a naive model predicting the median for regression or the most common class for classification.
Raises:
Nothing
"""
_verify_tabular_data_shape(x, y, is_column=True)
x, y = _to_series(x), _to_series(y)
x, y = sample_rows(x, y, sample=sample, shuffle=shuffle)
if model is None:
model = DecisionTreeRegressor()
res_df = _prepare_df(x, y, metric, model)
res_df["model_score"] = _score(x, y, metric, model, crossvals=crossvals)
res_df["ppscore"] = _calc_ppscore(res_df["model_score"], res_df["naive_score"], res_df["perfect_score"])
return res_df
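A minimal usage sketch of score on synthetic data, assuming the package layout from the __init__ files above (the data, metric choice and printed columns are illustrative):

import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from microwave.data_analysis import ppscore as pps

rng = np.random.default_rng(0)
x = pd.Series(rng.normal(size=200), name="x")
y = 2 * x + rng.normal(scale=0.1, size=200)

# With an error metric the perfect score is 0, so pps = 1 - model_error / naive_error;
# a strong linear relation should therefore yield a high ppscore.
result = pps.score(x, y, metric=mean_absolute_error)
print(result[["ppscore", "model_score", "naive_score"]])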
def predictors(df: pd.DataFrame, y: Any, metric: Callable, model=None, crossvals: int=5, njobs=1, sample: int=5000, sort=True, shuffle=True):
"""
Calculates the predictive power score (pps) of every column in df against y using a given model. Score will be baselined between 0 and 1 depending on the kind of problem.
Args:
df: shape[n, m]; Predictors (features).
y: shape[n, 1]; True values (targets).
metric: Metric to use to score the prediction. Must take in y_true, y_pred.
model (optional) [default=DecisionTreeRegressor]: Model to use. Must be sklearn-compatible and either a regressor or classifier.
crossvals (optional) [default=5]: Number of cross-validations to perform.
njobs (optional) [default=1]: Number of jobs to run in parallel.
sample (optional) [default=5000]: Number of rows to sample from df and y for faster calculations. None means no sampling.
sort (optional) [default=True]: Whether to sort the results by ppscore in descending order.
shuffle (optional) [default=True]: Whether to shuffle the rows of df and y before processing.
Returns:
A DataFrame containing the pps of each column in df against y. The pps is (score - baseline)/(perfect_score - baseline), where the baseline is a naive model predicting the median for regression or the most common class for classification.
Raises:
Nothing
"""
_verify_tabular_data_shape(df, is_column=False)
_verify_tabular_data_shape(y, is_column=True)
y = _to_series(y)
df, y = sample_rows(df, y, sample=sample, shuffle=shuffle)
if model is None:
model = DecisionTreeRegressor()
res_df = _prepare_df(df, y, metric, model)
res_df["model_score"] = Parallel(n_jobs=njobs)(delayed(_score)(df.iloc[:, [i]], y, metric, model, crossvals=crossvals) for i in range(df.shape[1]))
res_df["ppscore"] = _calc_ppscore(res_df["model_score"], res_df["naive_score"], res_df["perfect_score"])
res_df.insert(0, "x", df.columns)
if sort:
res_df = res_df.sort_values("ppscore", ascending=False)
return res_df.reset_index(drop=True)
def _predictors_of_col(df: pd.DataFrame, col: str, metric: Callable, model, **kwargs) -> pd.DataFrame:
"""
Calculates the predictive power score (pps) of all columns in df against the specified column using a given model and metric.
Args:
df: Input DataFrame containing predictors and the target column.
col: Target column name.
metric: Metric to use to score the prediction. Must take in y_true, y_pred.
model: Model to use. Must be sklearn-compatible and either a regressor or classifier.
Returns:
A DataFrame containing the pps of each predictor in df against the specified target column. The pps is (score - baseline)/(perfect_score - baseline), where the baseline is a naive model predicting the median for regression or the most common class for classification.
Raises:
Nothing
"""
y = df[[col]]
df_pred = df.loc[:, df.columns != col]
res = predictors(df_pred, y, metric, model=model, **kwargs)
res.insert(1, "y", col)
return res
def matrix(df: pd.DataFrame, metric: Callable, model=None, crossvals: int=5, njobs: int=1, sample: int=5000, sort: bool=True, shuffle: bool=True) -> pd.DataFrame:
"""
Calculates the predictive power score (pps) of every column in df against every other column in df using a given model.
Scores will be baselined between 0 and 1 depending on the nature of the problem.
Args:
df: shape[n, m]; Data whose columns are scored against each other.
metric: Metric to use to score the prediction. Must take in y_true, y_pred.
model (optional) [default=DecisionTreeRegressor]: Model to use. Must be sklearn-compatible and either a regressor or classifier.
crossvals (optional) [default=5]: Number of cross-validations to perform.
njobs (optional) [default=1]: Number of jobs to run in parallel.
sample (optional) [default=5000]: Number of rows to sample from df for faster calculations. None means no sampling.
sort (optional) [default=True]: Whether to sort the results by ppscore in descending order.
shuffle (optional) [default=True]: Whether to shuffle the rows of df before processing.
Returns:
A DataFrame containing the pps of each predictor in df against every target column.
The pps is (score - baseline) / (perfect_score - baseline), where the baseline is a naive model predicting the median for regression or the most common class for classification.
Raises:
Nothing
"""
_verify_tabular_data_shape(df, is_column=False)
df = sample_rows(df, sample=sample, shuffle=shuffle)[0]
if model is None:
model = DecisionTreeRegressor()
res = Parallel(n_jobs=njobs)(delayed(_predictors_of_col)(df, col, metric, model, crossvals=crossvals, sample=None, sort=False, shuffle=False) for col in df.columns)
res_df = pd.concat(res, axis=0)
if sort:
res_df = res_df.sort_values("ppscore", ascending=False)
return res_df.reset_index(drop=True)
def mutual_predictors(matrix: pd.DataFrame, threshold: float=0.9):
"""
Returns the list of features that are most predicted by the others, above a given threshold. It is intended for dimensionality reduction.
On each pass, every pps above the threshold is summed per target feature; the most-predicted feature is recorded and ignored in subsequent passes.
This continues until no remaining pps is over the threshold.
Args:
matrix: A DataFrame containing the pps of each feature against every other.
threshold (optional) [default=0.9]: Minimum pps for a prediction to be counted.
Returns:
A list of features that are the most predicted by others
Raises:
Nothing
"""
features = list(matrix["x"].unique())
pred_mut = []
while True:
# Recompute the above-threshold pps sums for the remaining features on every pass
cols_predict_count = dict(zip(features, [0]*len(features)))
for _, predict in matrix.iterrows():
if predict.y in cols_predict_count and predict.ppscore > threshold:
cols_predict_count[predict.y] += predict.ppscore
if sum(cols_predict_count.values()) == 0:
break
best_predictor = max(cols_predict_count, key=cols_predict_count.get)
pred_mut.append(best_predictor)
features.remove(best_predictor)
matrix = matrix[matrix["x"] != best_predictor]
matrix = matrix[matrix["y"] != best_predictor]
return pred_mut
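A hedged end-to-end sketch of the matrix / mutual_predictors workflow for dimensionality reduction; the DataFrame is illustrative and, with the default DecisionTreeRegressor, assumed to be fully numeric:

import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from microwave.data_analysis import ppscore as pps

rng = np.random.default_rng(1)
df = pd.DataFrame({"a": rng.normal(size=300)})
df["b"] = df["a"] * 3 + rng.normal(scale=0.01, size=300)  # near-duplicate of "a"
df["c"] = rng.normal(size=300)                            # independent column

m = pps.matrix(df, metric=mean_absolute_error, njobs=1)
redundant = pps.mutual_predictors(m, threshold=0.8)
df_reduced = df.drop(columns=redundant)  # drops the mutually predictable column(s)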


@@ -0,0 +1 @@
from .aggregates import *


@@ -0,0 +1,236 @@
from typing import Any, Union, Optional, Callable
from joblib import Parallel, delayed
import numpy as np
import pandas as pd
import scipy.stats as stats
def inter_percentile_mean(x: np.ndarray, N1: Optional[float] = 0.1, N2: Optional[float] = 0.9) -> float:
"""
Calculates the mean within a certain percentile range
Args:
x: The considered ndarray.
N1: Lower percentile (between 0 and 1)
N2: Upper percentile (between 0 and 1)
Returns:
The IP-mean
Raises:
Nothing
"""
# N1 and N2 are fractions between 0 and 1; np.percentile expects values in 0-100
p1, p2 = np.percentile(x[~np.isnan(x)], [N1 * 100, N2 * 100])
return np.mean(x[(x >= p1) & (x <= p2)])
def median_absolute_deviation(x: np.ndarray) -> float:
"""
Calculates the median of the deviations from the median
Args:
x: The considered ndarray.
Returns:
The MAD
Raises:
Nothing
"""
return np.nanmedian(np.abs(x - np.nanmedian(x)))
def inter_percentile_range(x: np.ndarray, N1: Optional[float] = 0.1, N2: Optional[float] = 0.9) -> float:
"""
Calculates the range within a certain percentile range
Args:
x: The considered ndarray.
N1: Lower percentile (between 0 and 1)
N2: Upper percentile (between 0 and 1)
Returns:
The IP-range
Raises:
Nothing
"""
# N1 and N2 are fractions between 0 and 1; np.percentile expects values in 0-100
return np.percentile(x[~np.isnan(x)], N2 * 100) - np.percentile(x[~np.isnan(x)], N1 * 100)
def mode(x: np.ndarray) -> Any:
"""
Calculates the mode of numeric and categorical variables
Args:
x: The considered ndarray.
Returns:
The mode
Raises:
Nothing
"""
if pd.api.types.is_numeric_dtype(x):
return stats.mode(x, nan_policy='omit').mode
else:
return pd.Series(x).mode().iat[0]
def geothmetic_meandian(x: np.ndarray, iter: Optional[int] = 100) -> float:
"""
https://xkcd.com/2435/
Args:
x: The considered ndarray.
iter: Number of averaging iterations to apply before returning.
Returns:
The geothmetic meandian
Raises:
Nothing
"""
if iter == 0:
return x[0]
return geothmetic_meandian(np.array([AGGFUNCCODES["mean"](x), AGGFUNCCODES["gmean"](x), AGGFUNCCODES["median"](x)]), iter = iter - 1)
def get_n_outliers(x: np.ndarray, n_sig: Union[float, int] = 3) -> int:
"""
Calculates the number of outliers using the z-score method.
Args:
x: The considered ndarray.
n_sig: Number of standard deviations before being considered an outlier
Returns:
The number of outliers
Raises:
Nothing
"""
return np.sum(np.abs(stats.zscore(x, nan_policy='omit')) > n_sig)
AGGFUNCCODES = {
# Counts
"size": len,
"non-null": lambda x: len(x) - pd.isna(x).sum(),
"nunique": lambda x: pd.Series(x).nunique(dropna=True),
# Basic
"sum": lambda x: np.nansum(x) if np.issubdtype((x.to_numpy() if isinstance(x, pd.Series) else x).dtype, np.number) else np.nan,
"min": np.nanmin,
"max": np.nanmax,
"first": lambda x: x[~pd.isna(x)][0],
"last": lambda x: x[~pd.isna(x)][-1],
# Centricity
"mean": np.nanmean,
"median": np.nanmedian,
"mode": mode,
"gmean": lambda x: stats.gmean(x[~pd.isna(x)]),
"hmean": lambda x: stats.hmean(x[~pd.isna(x)]),
"Pmean": inter_percentile_mean,
"geothmetic meandian": geothmetic_meandian,
# Dispersion
"variance": np.nanvar,
"std": np.nanstd,
"mad": median_absolute_deviation,
"skewness": lambda x: stats.skew(x, nan_policy='omit'),
"excesskurtosis": lambda x: stats.kurtosis(x, fisher=False, nan_policy='omit'),
"range": lambda x: np.nanmax(x) - np.nanmin(x),
"Prange": inter_percentile_range,
"n_outliers": get_n_outliers,
# Percentiles
"P75": lambda x: np.percentile(x[~pd.isna(x)], 75),
"P25": lambda x: np.percentile(x[~pd.isna(x)], 25),
"P10": lambda x: np.percentile(x[~pd.isna(x)], 10),
"P90": lambda x: np.percentile(x[~pd.isna(x)], 90),
"PN": lambda x, N: np.percentile(x[~pd.isna(x)], N),
# Distribution
"skewtest": lambda x, **kwargs: stats.skewtest(x, nan_policy='omit', **kwargs),
"kurtosistest": lambda x, **kwargs: stats.kurtosistest(x, nan_policy='omit', **kwargs),
"normaltest": lambda x, **kwargs: stats.normaltest(x, nan_policy='omit', **kwargs),
"jarque_bera": lambda x, **kwargs: stats.jarque_bera(x, nan_policy='omit', **kwargs),
"shapiro": lambda x, **kwargs: stats.shapiro(x, nan_policy='omit', **kwargs),
"anderson": lambda x, **kwargs: stats.anderson(x, **kwargs),
# Other
"energy": lambda x: np.nansum(x**2),
"rms": lambda x: np.sqrt(np.nanmean(x**2)),
"entropy": lambda x: stats.entropy(pd.Series(x).value_counts(normalize=True), base=2),
"autocorrelation": lambda x, lag=1: pd.Series(x).autocorr(lag=lag)
}
def execute_agg_func(x: Any, agg: Union[Callable, str, dict]) -> Any:
"""
Executes a given aggregation function on a given data.
If the function returns multiple values, a dict is returned with a name for each value, defaulting to a, b, c, ...
Accepts a callable, a str corresponding to an AGGFUNCCODES key, or a dictionary with either a callable or an AGGFUNCCODES key at key 'func'.
Args:
x: The data to execute the function on.
agg: The function to execute.
Returns:
Either the raw return value if it is a single value, or a dict of named return values if the function returned a tuple. The values can be renamed by passing a dict with key 'ret_names'.
Raises:
Nothing
"""
ret_names = None
if callable(agg):
ret = agg(x)
elif isinstance(agg, str):
ret = AGGFUNCCODES[agg](x)
elif isinstance(agg, dict):
kwargs = {} if 'kwargs' not in agg else agg['kwargs']
if callable(agg['func']):
ret = agg['func'](x, **kwargs)
elif isinstance(agg['func'], str):
ret = AGGFUNCCODES[agg['func']](x, **kwargs)
if 'ret_names' in agg:
ret_names = agg['ret_names']
if isinstance(ret, tuple):
if ret_names is None:
ret_names = [chr(c) for c in range(ord('a'), ord('a')+len(ret))]
return dict(zip(ret_names, ret))
else:
return ret
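An illustrative sketch of the three accepted spec forms (the data and the lambda are only examples):

import numpy as np

x = np.array([1.0, 2.0, 2.0, 3.0, np.nan])
execute_agg_func(x, "median")   # AGGFUNCCODES key -> 2.0
execute_agg_func(x, np.nanmax)  # plain callable -> 3.0
execute_agg_func(x, {"func": lambda v: (np.nanmin(v), np.nanmax(v)), "ret_names": ["lo", "hi"]})  # -> {'lo': 1.0, 'hi': 3.0}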
def calculate_aggregates(x: Any, aggs: list[Union[Callable, str, dict[str, Union[str, Callable]]]]) -> dict[str, Any]:
"""
Executes a given list of aggregation functions on a given data.
Args:
x: The data to execute the functions on.
aggs: The functions to execute.
Returns:
A dict containing named values. The names can be specified for each agg function by setting 'name' or 'ret_names' in the dict.
Raises:
Nothing
"""
results = {}
for i, func in enumerate(aggs):
funcname = f"func_{i}"
try:
ret = execute_agg_func(x, func)
except Exception as e:
print(f"agg_{i}", func, e)
ret = np.nan
if isinstance(func, dict):
if 'name' in func:
funcname = func['name']
elif isinstance(func['func'], str):
funcname = func['func']
elif isinstance(func, str):
funcname = func if func not in results.keys() else f"{func}_{i}"
if isinstance(ret, dict):
results.update(dict(zip([f"{funcname}_{x}" for x in ret.keys()], ret.values())))
else:
results[funcname] = ret
return results
def build_univariate_statistics(df: pd.DataFrame, agg: Optional[Union[str, list[Union[str, dict[str, dict]]]]] = "all", n_jobs: int = 1) -> pd.DataFrame:
"""
Calculates specified univariate statistics for each column in the DataFrame.
Args:
df: The input DataFrame.
agg: List of aggregation functions to apply, or "all" to apply every function in AGGFUNCCODES.
Each element can be a function name (str), a callable, or a dict with key 'func' and optionally 'kwargs', 'name' and 'ret_names'.
n_jobs: Number of parallel processes to open. -1 means as many as possible.
Returns:
pd.DataFrame: DataFrame with one row per column of df and one column per specified aggregation.
Raises:
Nothing
"""
if agg == "all":
agg = list(AGGFUNCCODES.keys())
results = Parallel(n_jobs=n_jobs)(delayed(calculate_aggregates)(df[col].values, agg) for col in df.columns)
return pd.DataFrame(results, index=df.columns)
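A minimal usage sketch, assuming the package layout above and a purely numeric DataFrame (the column names and the P95 spec are illustrative):

import numpy as np
import pandas as pd
from microwave.data_analysis import univariate

rng = np.random.default_rng(2)
df = pd.DataFrame(rng.normal(size=(200, 3)), columns=["a", "b", "c"])

# One row per column, one column per aggregate; dict specs forward kwargs and set the name.
stats_df = univariate.build_univariate_statistics(df, agg=["mean", "std", "n_outliers", {"func": "PN", "kwargs": {"N": 95}, "name": "P95"}])
print(stats_df)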


@@ -0,0 +1,2 @@
from .df_preprocessing import *
from .dfTransformer import dfTransformer


@@ -0,0 +1,75 @@
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from typing import Optional
class dfTransformer(BaseEstimator, TransformerMixin):
def __init__(self):
self.transforms = []
def add_transform(self, column_name: str, transformer: TransformerMixin, result_columns: Optional[list[str]] = None):
"""
Adds a transform specific to a column with optional result column names.
Args:
column_name (str): The name of the column to transform. Use None when the transformer is itself a dfTransformer applied to the whole DataFrame.
transformer (TransformerMixin): The transformation object to apply to the column. Must have a .transform method.
result_columns (list of str, optional): List of names for the resulting columns. Default is None.
"""
if not hasattr(transformer, 'transform'):
raise ValueError("The transformer must have a 'transform' method.")
self.transforms.append((column_name, transformer, result_columns))
def fit(self, X: pd.DataFrame, y=None):
"""
Fit method to conform with TransformerMixin. Fits transformers one by one on specified columns.
Args:
X (pd.DataFrame): The DataFrame to fit.
y: Ignored.
Returns:
self: Fitted transformer.
"""
for column_name, transformer, _ in self.transforms:
if isinstance(transformer, dfTransformer):
transformer.fit(X, y)
elif column_name in X:
transformer.fit(X[[column_name]], y) # Fit the transformer on the specific column
return self
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
"""
Applies all stored transformations to the DataFrame, drops the original columns,
and returns the transformed DataFrame.
Args:
X (pd.DataFrame): The DataFrame to transform.
Returns:
pd.DataFrame: The transformed DataFrame.
"""
X_transformed = X.copy()
for column_name, transformer, result_columns in self.transforms:
if isinstance(transformer, dfTransformer):
X_transformed = transformer.transform(X_transformed)
if column_name in X_transformed:
transformed_data = transformer.transform(X_transformed[[column_name]])
# Check if the transformed data is a DataFrame; if not, convert it
if isinstance(transformed_data, pd.DataFrame):
transformed_cols = transformed_data
else:
transformed_cols = pd.DataFrame(transformed_data, index=X_transformed.index)
if result_columns:
transformed_cols.columns = result_columns
else:
transformed_cols.columns = [
f"{column_name}_transformed_{i}" for i in range(transformed_cols.shape[1])
]
X_transformed.drop(columns=[column_name], inplace=True)
X_transformed = pd.concat([X_transformed, transformed_cols], axis=1)
return X_transformed
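A small usage sketch of the class above with a single scaled column (StandardScaler and the column names are illustrative):

import pandas as pd
from sklearn.preprocessing import StandardScaler

df = pd.DataFrame({"a": [1.0, 2.0, 3.0, 4.0], "b": ["x", "y", "x", "y"]})

t = dfTransformer()
t.add_transform("a", StandardScaler(), result_columns=["a_scaled"])
t.fit(df)
out = t.transform(df)  # "a" is dropped and replaced by "a_scaled"; "b" is left untouched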


@@ -0,0 +1,75 @@
import pandas as pd
from .dfTransformer import dfTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder, FunctionTransformer
def _get_encoder(encoding):
if encoding == 'onehot':
return OneHotEncoder(sparse_output=False)
elif encoding == 'label':
return LabelEncoder()
elif encoding == 'ordinal':
return OrdinalEncoder()
else:
raise ValueError("Unsupported encoding type.")
def _get_encoded_col_names(encoder, prefix="", suffix="", ret_shape=(1, 1)):
# Copy to avoid mutating the caller's list (and avoid a mutable default argument)
ret_shape = list(ret_shape)
if len(ret_shape) == 1:
ret_shape.append(1)
if hasattr(encoder, "categories_"):
colnames = ["_".join([str(x) for x in (prefix, suffix, cat) if len(str(x)) > 0]) for cat in encoder.categories_[0]]
elif ret_shape[1] == 1:
colnames = ["_".join([str(x) for x in (prefix, suffix) if len(str(x)) > 0])]
elif ret_shape[1] > 1:
colnames = [
"_".join([str(x) for x in (prefix, suffix, i) if len(str(x)) > 0]) for i in range(ret_shape[1])
]
return colnames
def df_to_numeric(df, encoding='onehot'):
"""
Processes a DataFrame by converting numeric columns to float and applying categorical encoding to non-numeric columns.
Args:
df (pd.DataFrame): The input DataFrame to process.
encoding (str or TransformerMixin): The encoding method to apply to categorical columns. Can be 'onehot', 'label' or 'ordinal', or a transformer instance with a fit_transform method.
Returns:
tuple: A tuple containing the transformed DataFrame and the fitted dfTransformer object.
"""
transformer = dfTransformer()
X_transformed = df.copy()
if isinstance(encoding, str):
encoder = _get_encoder(encoding)
elif hasattr(encoding, "fit_transform"):
# Accept a pre-built transformer instance, as described in the docstring
encoder = encoding
else:
raise ValueError("Encoding must be either a string or a transformer with a 'fit_transform' method.")
for column in X_transformed.columns:
if not pd.api.types.is_numeric_dtype(df[column]):
transformed_data = encoder.fit_transform(X_transformed[[column]])
result_columns = _get_encoded_col_names(encoder, prefix=column, suffix="encoded", ret_shape=list(transformed_data.shape))
if isinstance(transformed_data, pd.DataFrame):
transformed_cols = transformed_data
else:
transformed_cols = pd.DataFrame(transformed_data, index=X_transformed.index)
transformed_cols.columns = result_columns
transformer.add_transform(column, encoder, result_columns=result_columns)
X_transformed.drop(columns=[column], inplace=True)
X_transformed = pd.concat([X_transformed, transformed_cols], axis=1)
encoder = FunctionTransformer(lambda x: x.astype(float), validate=False)
for column in X_transformed.columns:
transformed_data = encoder.fit_transform(X_transformed[[column]])
transformer.add_transform(column, encoder, result_columns=[column])
X_transformed.drop(columns=[column], inplace=True)
X_transformed = pd.concat([X_transformed, transformed_data], axis=1)
return X_transformed, transformer
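A hedged sketch of replaying the returned transformer on new rows with the same schema (the toy data is illustrative):

import pandas as pd

train = pd.DataFrame({"age": [10, 20, 30], "colour": ["red", "blue", "red"]})
train_num, trans = df_to_numeric(train, encoding="onehot")

# The fitted dfTransformer applies the same one-hot categories and float casts to unseen data.
new = pd.DataFrame({"age": [40], "colour": ["blue"]})
new_num = trans.transform(new)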


@@ -0,0 +1,39 @@
import math
from typing import Optional, Union
import numpy as np
def gaussian(x:Union[np.ndarray, float], mu:float, sig:float) -> Union[np.ndarray, float]:
"""
Calculates the height of a specified gaussian at point x
Args:
x: point(s) at which to calculate the height
mu: The gaussian's mean
sig: The gaussian's standard deviation
Returns:
The height(s), as unique number or ndarray if x is ndarray
Raises:
Nothing
"""
return np.exp(-(x-mu)**2/(2*sig**2))/(sig*np.sqrt(2*np.pi))
def gauss_integral(mu: float, sig: float, a: float=-np.inf, b: float=np.inf) -> float:
"""
Calculates the definite gaussian integral between a and b. If not given, a and b default to -inf and +inf respectively.
Args:
mu: The gaussian's mean
sig: The gaussian's standard deviation
a: lower bound, defaults to -inf
b: upper bound, defaults to +inf
Returns:
The definite integral
Raises:
Nothing
"""
if sig == 0:
if mu >= a and mu < b:
return 1
else:
return 0
return (math.erf((b-mu)/(sig*np.sqrt(2)))-math.erf((a-mu)/(sig*np.sqrt(2))))/2
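A quick sanity check of the two helpers against known values of the standard normal (the tolerances are illustrative):

# Peak height of the standard normal is 1/sqrt(2*pi) ~= 0.3989
assert abs(gaussian(0.0, mu=0.0, sig=1.0) - 0.3989) < 1e-3
# About 68.27% of the mass lies within one standard deviation of the mean
assert abs(gauss_integral(mu=0.0, sig=1.0, a=-1.0, b=1.0) - 0.6827) < 1e-3
# The full integral is 1
assert abs(gauss_integral(mu=0.0, sig=1.0) - 1.0) < 1e-12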


@@ -0,0 +1,9 @@
from .arrayutils import _get_shape
from .arrayutils import _verify_tabular_data_shape
from .arrayutils import _verify_same_number_of_rows
from .arrayutils import _sample
from .arrayutils import sample_rows
from .arrayutils import nan_rows_mask
from .arrayutils import _to_series
from .arrayutils import _is_convertible_to_numpy_array
from .arrayutils import split_rows


@@ -0,0 +1,224 @@
from typing import Any, Union, Optional
import numpy as np
import pandas as pd
def _get_shape(x: Any) -> tuple:
"""
Returns the shape of a given object
Args:
x: The object whose shape to determine.
Returns:
The shape of x, or None if it cannot be determined.
Raises:
Nothing
"""
if hasattr(x, "shape"):
return x.shape
elif _is_convertible_to_numpy_array(x):
return np.array(x).shape
else:
return None
def _verify_tabular_data_shape(*args: Any, is_column: bool = False):
"""
Verifies that the shape of the given objects are coherent for tabular data.
Args:
*args: shape[n,m] or [n,]; Tabular data.
is_column (optional) [default=False]: Set to True to raise an error if an object contains multiple columns.
Returns:
Nothing
Raises:
ValueError: If one or more of the given objects is not coherent with tabular data.
ValueError: If is_column is true and one or more of the given objects have multiple columns.
"""
for arg in args:
shape = _get_shape(arg)
if shape is None:
raise ValueError(f"Input data has no shape: {arg}.")
if len(shape) < 1 or len(shape) > 2:
raise ValueError(f"Input data must be a tabular object. Has shape {shape}.")
if is_column and len(shape) == 2 and 1 not in shape:
raise ValueError(f"Input data must be a single column. Has shape {shape}.")
def _verify_same_number_of_rows(*args):
"""
Verifies that the given objects have the same number of rows.
Args:
*args: shape[n,m] or [n,]
Returns:
Nothing
Raises:
ValueError: If one or more of the given objects has no rows.
ValueError: If two objects have different amounts of rows.
"""
n_rows = _get_shape(args[0])[0]
for arg in args[1:]:
# Check for a missing shape before indexing into it
shape = _get_shape(arg)
if shape is None:
raise ValueError(f"Input data has no shape: {arg}.")
elif n_rows != shape[0]:
raise ValueError(f"Input objects must have the same number of rows: {n_rows}, {shape[0]}.")
def _sample(x: Union[np.ndarray, pd.DataFrame, pd.Series], ind_list: Any) -> Union[np.ndarray, pd.DataFrame]:
"""
Samples the rows of a numpy array or pandas DataFrame based on a list of indices.
Args:
x: The array, DataFrame, or Series to sample.
ind_list: The list or array of indices that defines which rows to keep and in which order.
Returns:
The sampled object, of the same type as x.
Raises:
TypeError: If the input is not a numpy array, pandas DataFrame, or pandas Series.
"""
if isinstance(x, np.ndarray):
return x[ind_list]
elif isinstance(x, pd.DataFrame):
return x.iloc[ind_list]
elif isinstance(x, pd.Series):
return x.iloc[ind_list]
else:
raise TypeError("Input must be a numpy array, pandas DataFrame, or pandas Series")
def sample_rows(*args: Any, sample: Optional[int] = None, shuffle: bool = True) -> tuple[Any]:
"""
Samples rows of the provided objects in the same way and optionally shuffles them.
Tries to minimize the number of rows containing NaN.
Args:
*args: Input tabular data objects.
sample: Number of rows to draw from each object. If None, no sampling is done.
shuffle: If True, rows are returned in a random order; if False, the original row order is preserved.
Returns:
Tuple of shuffled and/or sampled objects.
Raises:
ValueError: If input objects don't have the same number of rows.
"""
_verify_same_number_of_rows(*args)
n_rows = _get_shape(args[0])[0]
nan_mask = nan_rows_mask(*args)
full_indices = np.where(~nan_mask)[0]
n_full_rows = len(full_indices)
if sample is not None and sample < n_full_rows:
indices = np.random.choice(full_indices, size=sample, replace=False)
if not shuffle:
indices.sort()
elif sample is not None and sample < n_rows:
indices = np.concatenate((np.random.choice(full_indices, size=n_full_rows, replace=False), np.random.choice(np.where(nan_mask)[0], size=sample - n_full_rows, replace=False)))
if not shuffle:
indices.sort()
else:
indices = np.arange(n_rows)
if shuffle:
indices = np.random.choice(indices, size=n_rows, replace=False)
results = tuple(_sample(arg, indices) for arg in args)
return results
def nan_rows_mask(*args: Any) -> np.ndarray:
"""
Given a list of 2D numpy arrays or DataFrames with the same number of rows, return a boolean mask that is True for every row where at least one of the objects has a NaN value.
Args:
*args: Tabular objects (2D numpy arrays, DataFrames, or Series) with the same number of rows.
Returns:
Boolean mask indicating rows with at least one NaN.
Raises:
Nothing
"""
_verify_same_number_of_rows(*args)
n_rows = _get_shape(args[0])[0]
# Initialize the mask with False values
mask = np.zeros(n_rows, dtype=bool)
for data in args:
if isinstance(data, np.ndarray):
if data.ndim == 1:
data = data.reshape(-1,1)
mask |= np.isnan(data).any(axis=1)
elif isinstance(data, pd.DataFrame):
mask |= data.isna().to_numpy().any(axis=1)
elif isinstance(data, pd.Series):
mask |= data.isna().to_numpy()
else:
data = np.array(data)
if data.ndim == 1:
data = data.reshape(-1,1)
mask |= np.isnan(data).any(axis=1)
return mask
def _to_series(data: Union[pd.Series, pd.DataFrame, np.ndarray]):
"""
Converts a single-column object to a pandas Series.
Args:
data: A pandas Series, a single-column DataFrame, or a 1-D or single-column 2-D numpy array.
Returns:
A pandas Series.
Raises:
ValueError: If the input has more than one column.
"""
if isinstance(data, pd.Series):
return data
elif isinstance(data, pd.DataFrame):
if data.shape[1] != 1:
raise ValueError("DataFrame must have exactly one column to convert to Series")
return data.iloc[:, 0]
elif isinstance(data, np.ndarray):
if data.ndim == 1:
return pd.Series(data)
elif data.ndim == 2 and data.shape[1] == 1:
return pd.Series(data.ravel())
else:
raise ValueError("ndarray must be 1-dimensional or a 2-dimensional single column array")
def _is_convertible_to_numpy_array(obj: Any) -> bool:
"""
Verifies a given object is convertible to a numpy array without error
Args:
obj: object to check
Returns:
bool
Raises:
Nothing
"""
if isinstance(obj, (list, tuple, dict, set)):
return True
if np.isscalar(obj):
return True
if hasattr(obj, '__array__'):
return True
return False
def split_rows(data: Union[pd.DataFrame, np.ndarray], bool_array: np.ndarray, drop_index: bool = True) -> list[Union[pd.DataFrame, np.ndarray]]:
"""
Splits a pandas DataFrame or a numpy array based on a boolean array indicator.
Args:
data : The input data to split. Can be a pandas DataFrame or a numpy ndarray.
bool_array : A 1D boolean array indicating where splits should occur. Must be the same length as `data`.
drop_index : If True, discard the original index in the resulting DataFrame splits; if False, keep it as a column. Default is True.
Returns:
A list of the resulting pd.DataFrame or np.ndarray splits.
Raises:
ValueError: If the length of `bool_array` does not match the length of `data`.
"""
if len(bool_array) != len(data):
raise ValueError("The length of bool_array must match the length of data.")
indices = np.where(bool_array)[0]
indices = np.concatenate(([0], indices, [len(data)]))
if isinstance(data, pd.DataFrame):
return [data.iloc[start:end].reset_index(drop=drop_index)
for start, end in zip(indices[:-1], indices[1:])
if start != end]
elif isinstance(data, np.ndarray):
return [data[start:end]
for start, end in zip(indices[:-1], indices[1:])
if start != end]
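A short sketch of how the row helpers compose (the toy data is illustrative):

import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1.0, np.nan, 3.0, 4.0, 5.0], "b": [10.0, 20.0, 30.0, 40.0, 50.0]})
y = np.array([0.0, 1.0, np.nan, 0.0, 1.0])

nan_rows_mask(df, y)  # array([False,  True,  True, False, False])
x_s, y_s = sample_rows(df, y, sample=2, shuffle=False)  # draws 2 of the 3 NaN-free rows
parts = split_rows(df, np.array([False, False, True, False, False]))  # cut before row 2 -> [rows 0-1, rows 2-4]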

File diff suppressed because one or more lines are too long


@@ -0,0 +1,725 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import string"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>A</th>\n",
" <th>B</th>\n",
" <th>C</th>\n",
" <th>D</th>\n",
" <th>0</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>b</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>c</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>d</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>d</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>c</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>d</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>b</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>c</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>c</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>b</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" A B C D 0\n",
"0 0 2 2 2 b\n",
"1 1 2 1 1 c\n",
"2 1 0 1 1 d\n",
"3 0 0 1 0 d\n",
"4 2 1 2 2 c\n",
"5 0 0 0 0 d\n",
"6 0 2 2 2 a\n",
"7 0 2 0 0 a\n",
"8 0 1 0 0 a\n",
"9 0 2 2 1 b\n",
"10 2 2 0 1 c\n",
"11 2 1 1 1 a\n",
"12 0 1 0 2 a\n",
"13 2 1 0 1 c\n",
"14 1 0 0 1 b"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame(np.random.randint(0,3,size=(15, 4)), columns=list('ABCD'))\n",
"df = pd.concat([df, pd.DataFrame(np.random.choice(list(string.ascii_letters)[:4], size=15, replace=True))], axis=1)\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"c:\\Users\\Edouard\\Documents\\Git\\microwave\n"
]
}
],
"source": [
"%cd ..\n",
"import microwave.data_processing as dp"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>A</th>\n",
" <th>B</th>\n",
" <th>C</th>\n",
" <th>D</th>\n",
" <th>0_encoded_a</th>\n",
" <th>0_encoded_b</th>\n",
" <th>0_encoded_c</th>\n",
" <th>0_encoded_d</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>0.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>0.0</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>0.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" A B C D 0_encoded_a 0_encoded_b 0_encoded_c 0_encoded_d\n",
"0 0.0 2.0 2.0 2.0 0.0 1.0 0.0 0.0\n",
"1 1.0 2.0 1.0 1.0 0.0 0.0 1.0 0.0\n",
"2 1.0 0.0 1.0 1.0 0.0 0.0 0.0 1.0\n",
"3 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0\n",
"4 2.0 1.0 2.0 2.0 0.0 0.0 1.0 0.0\n",
"5 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0\n",
"6 0.0 2.0 2.0 2.0 1.0 0.0 0.0 0.0\n",
"7 0.0 2.0 0.0 0.0 1.0 0.0 0.0 0.0\n",
"8 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0\n",
"9 0.0 2.0 2.0 1.0 0.0 1.0 0.0 0.0\n",
"10 2.0 2.0 0.0 1.0 0.0 0.0 1.0 0.0\n",
"11 2.0 1.0 1.0 1.0 1.0 0.0 0.0 0.0\n",
"12 0.0 1.0 0.0 2.0 1.0 0.0 0.0 0.0\n",
"13 2.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0\n",
"14 1.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"new_df, trans = dp.df_to_numeric(df)\n",
"new_df"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[(0,\n",
" OneHotEncoder(sparse_output=False),\n",
" ['0_encoded_a', '0_encoded_b', '0_encoded_c', '0_encoded_d']),\n",
" ('A',\n",
" FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),\n",
" ['column']),\n",
" ('B',\n",
" FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),\n",
" ['column']),\n",
" ('C',\n",
" FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),\n",
" ['column']),\n",
" ('D',\n",
" FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),\n",
" ['column']),\n",
" ('0_encoded_a',\n",
" FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),\n",
" ['column']),\n",
" ('0_encoded_b',\n",
" FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),\n",
" ['column']),\n",
" ('0_encoded_c',\n",
" FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),\n",
" ['column']),\n",
" ('0_encoded_d',\n",
" FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),\n",
" ['column'])]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"trans.transforms"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\Edouard\\Documents\\Git\\microwave\\.venv_microwave\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:110: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n",
" y = column_or_1d(y, warn=True)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>A</th>\n",
" <th>B</th>\n",
" <th>C</th>\n",
" <th>D</th>\n",
" <th>0_encoded</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>0.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>0.0</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>0.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" A B C D 0_encoded\n",
"0 0.0 2.0 2.0 2.0 1.0\n",
"1 1.0 2.0 1.0 1.0 2.0\n",
"2 1.0 0.0 1.0 1.0 3.0\n",
"3 0.0 0.0 1.0 0.0 3.0\n",
"4 2.0 1.0 2.0 2.0 2.0\n",
"5 0.0 0.0 0.0 0.0 3.0\n",
"6 0.0 2.0 2.0 2.0 0.0\n",
"7 0.0 2.0 0.0 0.0 0.0\n",
"8 0.0 1.0 0.0 0.0 0.0\n",
"9 0.0 2.0 2.0 1.0 1.0\n",
"10 2.0 2.0 0.0 1.0 2.0\n",
"11 2.0 1.0 1.0 1.0 0.0\n",
"12 0.0 1.0 0.0 2.0 0.0\n",
"13 2.0 1.0 0.0 1.0 2.0\n",
"14 1.0 0.0 0.0 1.0 1.0"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"new_df, trans = dp.df_to_numeric(df, encoding=\"label\")\n",
"new_df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv_microwave (3.13.2)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

File diff suppressed because it is too large

BIN
requirements.txt Normal file

Binary file not shown.