diff --git a/microwave/__init__.py b/microwave/__init__.py new file mode 100644 index 0000000..a19e12c --- /dev/null +++ b/microwave/__init__.py @@ -0,0 +1,3 @@ +from . import data_analysis +from . import utils +from . import math \ No newline at end of file diff --git a/microwave/data_analysis/__init__.py b/microwave/data_analysis/__init__.py new file mode 100644 index 0000000..4a8885c --- /dev/null +++ b/microwave/data_analysis/__init__.py @@ -0,0 +1,2 @@ +from . import ppscore +from . import univariate \ No newline at end of file diff --git a/microwave/data_analysis/ppscore/__init__.py b/microwave/data_analysis/ppscore/__init__.py new file mode 100644 index 0000000..14b9e1f --- /dev/null +++ b/microwave/data_analysis/ppscore/__init__.py @@ -0,0 +1 @@ +from .ppscore import * \ No newline at end of file diff --git a/microwave/data_analysis/ppscore/ppscore.py b/microwave/data_analysis/ppscore/ppscore.py new file mode 100644 index 0000000..1f9ffd3 --- /dev/null +++ b/microwave/data_analysis/ppscore/ppscore.py @@ -0,0 +1,276 @@ +from typing import Union, Callable, Optional, Any +from sklearn.metrics import make_scorer +from sklearn.tree import DecisionTreeRegressor +from sklearn.base import is_classifier, is_regressor +from sklearn.model_selection import cross_val_score +from joblib import Parallel, delayed +from ...utils import _verify_tabular_data_shape, sample_rows, nan_rows_mask, _sample, _to_series +import pandas as pd +import numpy as np + + +def _identify_case(model) -> str: + """ + Identifies if the given model is a classifier or regressor. + Args: + model: Must be sklearn-compatible and either a regressor of classifier. 
def _identify_case(model) -> str:
    """
    Identifies if the given model is a classifier or a regressor.
    Args:
        model: Estimator to inspect; it is handed to sklearn's is_classifier / is_regressor.
    Returns:
        "classification" or "regression"
    Raises:
        ValueError: If the model cannot be determined to be either a classifier or a regressor
    """
    if is_classifier(model):
        return "classification"
    if is_regressor(model):
        return "regression"
    raise ValueError("The model cannot be determined to be either a classifier or a regressor")


def _get_baseline_score(y: Union[np.ndarray, pd.DataFrame], case: str, metric: Callable) -> float:
    """
    Calculates the metric obtained by a naive model against y.
    Args:
        y: shape[n,1] or [n,]; True values. Rows containing NaN are dropped first.
        case: "classification" or "regression"
        metric: Metric to use to score the prediction. Must take in y_true, y_pred.
    Returns:
        The metric of a constant prediction: the median of y for a regression,
        the most frequent value of y for a classification.
    Raises:
        ValueError: If case is neither "classification" nor "regression".
    """
    y = _sample(y, ~nan_rows_mask(y))
    if case == "regression":
        # Build the constant prediction as floats: np.full_like(y, median) would
        # silently truncate e.g. a median of 1.5 to 1 when y has an integer dtype.
        base = np.full(np.shape(y), np.median(y), dtype=float)
    elif case == "classification":
        values, counts = np.unique(y, return_counts=True)
        # np.argmax returns the first maximum, i.e. the smallest of the tied classes.
        base = np.full_like(y, values[np.argmax(counts)])
    else:
        raise ValueError(f"Unknown case {case!r}: expected 'classification' or 'regression'")
    return metric(y, base)
def _prepare_df(x: Any, y: Any, metric: Callable, model) -> pd.DataFrame:
    """
    Builds the result table holding everything that does not depend on the fitted model.
    Args:
        x: shape[n, m] or [n,]; Predictors (features). Only its shape is used here.
        y: shape[n, 1]; True values (targets). Rows containing NaN are dropped.
        metric: Metric to use to score the prediction. Must take in y_true, y_pred.
        model: Model to use. Must be sklearn-compatible and either a regressor or classifier.
    Returns:
        A DataFrame with one row per column of x and the columns:
        - ppscore: Placeholder for the predictive power score, initialized to NaN.
        - case: "classification" or "regression", as returned by _identify_case.
        - metric: The name of the metric used.
        - perfect_score: metric(y, y), the score of a perfect prediction.
        - naive_score: The score of the naive baseline (see _get_baseline_score).
        - model_score: Placeholder for the model score, initialized to NaN.
        - model: The class name of the model.
    Raises:
        ValueError: Propagated from _identify_case when the model type is unknown.
    """
    y = _sample(y, ~nan_rows_mask(y))
    case = _identify_case(model)
    n_features = x.shape[1] if len(x.shape) > 1 else 1

    # functools.partial objects and callable instances have no __name__;
    # fall back to the wrapped function, then to the class name.
    metric_name = getattr(metric, "__name__", None)
    if metric_name is None:
        wrapped = getattr(metric, "func", metric)
        metric_name = getattr(wrapped, "__name__", type(metric).__name__)

    return pd.DataFrame({
        "ppscore": [np.nan] * n_features,
        "case": case,
        "metric": metric_name,
        "perfect_score": metric(y, y),
        "naive_score": _get_baseline_score(y, case, metric),
        "model_score": np.nan,
        "model": type(model).__name__,
    })
def _score(x: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.DataFrame], metric: Callable, model, metric_params: Optional[dict] = None, crossvals: int = 5):
    """
    Returns the mean cross-validated score of a model according to the given metric.
    Args:
        x: shape[n, m] or [n,]; Predictors (features).
        y: shape[n, 1] or [n,]; True values (targets).
        metric: Metric to use to score the prediction. Must take in y_true, y_pred.
        model: Model to use. Must be sklearn-compatible. None means DecisionTreeRegressor.
        metric_params (optional): Additional keyword parameters forwarded to make_scorer.
        crossvals (optional) [default=5]: Number of cross-validation folds.
    Returns:
        The mean of the metric over the cross-validation folds.
    Raises:
        Nothing
    """
    if model is None:
        model = DecisionTreeRegressor()
    keep = ~nan_rows_mask(x, y)
    features = np.asarray(_sample(x, keep))
    # One row per sample whatever the number of feature columns
    # (reshape(-1, 1) would interleave the columns of a multi-column x).
    features = features.reshape(len(features), -1)
    # sklearn expects a 1-D target; a column vector triggers a conversion warning.
    target = np.asarray(_sample(y, keep)).ravel()
    scorer = make_scorer(metric, **(metric_params or {}))
    fold_scores = cross_val_score(model, features, target, cv=crossvals, scoring=scorer)
    return fold_scores.mean()


def _calc_ppscore(score: Union[int, float, np.ndarray, pd.Series],
                  naive_score: Union[int, float, np.ndarray, pd.Series],
                  perfect_score: Union[int, float, np.ndarray, pd.Series]) -> Union[float, np.ndarray, pd.Series]:
    """
    Calculates the predictive power score (pps) for given scores, naive scores, and perfect scores.
    Args:
        score: The actual score(s).
        naive_score: The naive score(s).
        perfect_score: The perfect score(s).
    Returns:
        (score - naive) / (perfect - naive), with negative values clipped to 0.
        Entries whose naive score already equals the perfect score are 0.
        A Series (with the index of score) when score is a Series, an ndarray otherwise.
    """
    # Remember the index before the conversion below discards it.
    index = score.index if isinstance(score, pd.Series) else None
    gain = np.asarray(score, dtype=float) - np.asarray(naive_score, dtype=float)
    span = np.asarray(perfect_score, dtype=float) - np.asarray(naive_score, dtype=float)
    with np.errstate(divide="ignore", invalid="ignore"):
        pps = gain / span
    # A zero span means nothing is left to predict; report 0 rather than NaN/inf.
    pps = np.where((span == 0) | (pps <= 0), 0.0, pps)
    if index is not None:
        return pd.Series(pps, index=index)
    return pps
def score(x: Any, y: Any, metric: Callable, model: Optional[object]=None, sample: Optional[int]=None, shuffle: bool=True, crossvals: int=5) -> pd.DataFrame:
    """
    Calculates the predictive power score (pps) of a single predictor x against y.
    Args:
        x: shape[n,1] or [n,]; Predictor (feature).
        y: shape[n,1] or [n,]; True values (target).
        metric: Metric to use to score the prediction. Must take in y_true, y_pred, both array-likes.
        model (optional): Model to use. Must be sklearn-compatible and either a regressor or classifier.
            None means a DecisionTreeRegressor.
        sample (optional) [default=None]: Number of rows to sample from x and y. None means no sampling.
        shuffle (optional) [default=True]: Whether to shuffle the rows of x and y.
        crossvals (optional) [default=5]: Number of cross-validation folds used to score the model.
    Returns:
        A one-row DataFrame (see _prepare_df) whose ppscore is
        (score - baseline)/(perfect_score - baseline), the baseline being a naive model predicting
        the median for regression or the most common class for classification.
    Raises:
        Whatever the shape-checking and conversion helpers raise on malformed input.
    """
    _verify_tabular_data_shape(x, y, is_column=True)
    predictor, target = sample_rows(_to_series(x), _to_series(y), sample=sample, shuffle=shuffle)
    estimator = DecisionTreeRegressor() if model is None else model

    report = _prepare_df(predictor, target, metric, estimator)
    report["model_score"] = _score(predictor, target, metric, estimator, crossvals=crossvals)
    report["ppscore"] = _calc_ppscore(
        report["model_score"],
        report["naive_score"],
        report["perfect_score"],
    )
    return report
def predictors(df: pd.DataFrame, y: Any, metric: Callable, model=None, crossvals: int=5, njobs=1, sample: Optional[int]=5000, sort=True, shuffle=True):
    """
    Calculates the predictive power score (pps) of every column in df against y using a given model.
    Args:
        df: shape[n, m]; Predictors (features).
        y: shape[n, 1]; True values (targets).
        metric: Metric to use to score the prediction. Must take in y_true, y_pred.
        model (optional): Model to use. Must be sklearn-compatible and either a regressor or classifier.
            None means a DecisionTreeRegressor.
        crossvals (optional) [default=5]: Number of cross-validation folds used for every column.
        njobs (optional) [default=1]: Number of jobs to run in parallel.
        sample (optional) [default=5000]: Number of rows to sample from df and y. None means no sampling.
        sort (optional) [default=True]: Whether to sort the results by ppscore in descending order.
        shuffle (optional) [default=True]: Whether to shuffle the rows of df and y before processing.
    Returns:
        A DataFrame with one row per column of df (column "x" holds its name). The pps is
        (score - baseline)/(perfect_score - baseline), where the baseline is a naive model predicting
        the median for regression or the most common class for classification.
    Raises:
        Whatever the shape-checking and conversion helpers raise on malformed input.
    """
    _verify_tabular_data_shape(df, is_column=False)
    _verify_tabular_data_shape(y, is_column=True)
    y = _to_series(y)
    df, y = sample_rows(df, y, sample=sample, shuffle=shuffle)
    if model is None:
        model = DecisionTreeRegressor()
    res_df = _prepare_df(df, y, metric, model)
    # Columns are addressed by position so duplicated column names stay separate.
    # crossvals must be forwarded, otherwise _score always falls back to its own default.
    res_df["model_score"] = Parallel(n_jobs=njobs)(
        delayed(_score)(df.iloc[:, [i]], y, metric, model, crossvals=crossvals)
        for i in range(df.shape[1])
    )
    res_df["ppscore"] = _calc_ppscore(res_df["model_score"], res_df["naive_score"], res_df["perfect_score"])
    res_df.insert(0, "x", df.columns)
    if sort:
        res_df = res_df.sort_values("ppscore", ascending=False)
    return res_df.reset_index(drop=True)
def _predictors_of_col(df: pd.DataFrame, col: str, metric: Callable, model, **kwargs) -> pd.DataFrame:
    """
    Scores every other column of df as a predictor of the column col.
    Args:
        df: Input DataFrame containing the predictors and the target column.
        col: Name of the target column.
        metric: Metric to use to score the prediction. Must take in y_true, y_pred.
        model: Model to use. Must be sklearn-compatible and either a regressor or classifier.
        **kwargs: Forwarded unchanged to predictors (crossvals, njobs, sample, sort, shuffle).
    Returns:
        The DataFrame returned by predictors, with an extra column "y" (second position)
        holding the name of the target column.
    Raises:
        Nothing
    """
    is_feature = df.columns != col
    target = df[[col]]
    features = df.loc[:, is_feature]
    report = predictors(features, target, metric, model=model, **kwargs)
    report.insert(1, "y", col)
    return report
def matrix(df: pd.DataFrame, metric: Callable, model=None, crossvals: int=5, njobs: int=1, sample: int=5000, sort: bool=True, shuffle: bool=True) -> pd.DataFrame:
    """
    Calculates the predictive power score (pps) of every column in df against every other column.
    Args:
        df: shape[n, m]
        metric: Metric to use to score the prediction. Must take in y_true, y_pred.
        model (optional): Model to use. Must be sklearn-compatible and either a regressor or classifier.
            None means a DecisionTreeRegressor.
        crossvals (optional) [default=5]: Number of cross-validation folds.
        njobs (optional) [default=1]: Number of target columns processed in parallel.
        sample (optional) [default=5000]: Number of rows to sample from df. None means no sampling.
        sort (optional) [default=True]: Whether to sort the results by ppscore in descending order.
        shuffle (optional) [default=True]: Whether to shuffle the rows of df before processing.
    Returns:
        A DataFrame with one row per (predictor "x", target "y") pair. The pps is
        (score - baseline) / (perfect_score - baseline), where the baseline is a naive model predicting
        the median for regression or the most common class for classification.
    Raises:
        Whatever the shape-checking helpers raise on malformed input.
    """
    _verify_tabular_data_shape(df, is_column=False)
    # Rows are sampled/shuffled once here, so the per-target calls below must not do it again.
    (df,) = sample_rows(df, sample=sample, shuffle=shuffle)
    estimator = DecisionTreeRegressor() if model is None else model

    per_target_options = dict(crossvals=crossvals, sample=None, sort=False, shuffle=False)
    jobs = (
        delayed(_predictors_of_col)(df, target, metric, estimator, **per_target_options)
        for target in df.columns
    )
    combined = pd.concat(Parallel(n_jobs=njobs)(jobs), axis=0)
    if sort:
        combined = combined.sort_values("ppscore", ascending=False)
    return combined.reset_index(drop=True)
def mutual_predictors(matrix: pd.DataFrame, threshold: float=0.9):
    """
    Returns the list of features that are the most predicted by others, above a given threshold.
    Is intended for dimensionality reduction.
    At every round, the pps above the threshold are summed per predicted feature ("y"); the feature
    with the highest sum is selected and every pair involving it is ignored in the following rounds.
    Continues until no remaining pps is over the threshold.
    Args:
        matrix: A DataFrame with the columns "x" (predictor), "y" (predicted) and "ppscore",
            such as the one returned by the pps matrix function.
        threshold (optional) [default=0.9]: Only pps strictly above this value are considered.
    Returns:
        A list of features that are the most predicted by others, in selection order.
    Raises:
        Nothing
    """
    # Candidates keep their order of first appearance so ties resolve to the earliest one.
    candidates = list(matrix["x"].unique())
    strong_pairs = matrix[matrix["ppscore"] > threshold]
    selected = []
    while candidates:
        # Totals are recomputed from the remaining pairs at every round: scores coming from
        # an already selected feature must no longer count.
        totals = strong_pairs.groupby("y")["ppscore"].sum().reindex(candidates, fill_value=0)
        if totals.sum() == 0:
            break
        most_predicted = totals.idxmax()
        selected.append(most_predicted)
        candidates.remove(most_predicted)
        strong_pairs = strong_pairs[
            (strong_pairs["x"] != most_predicted) & (strong_pairs["y"] != most_predicted)
        ]
    return selected
def _drop_nan(x):
    """Returns x without its missing entries (works for ndarray and Series)."""
    return x[~pd.isna(x)]


def inter_percentile_mean(x: np.ndarray, N1: Optional[float] = 0.1, N2: Optional[float] = 0.9) -> float:
    """
    Calculates the mean of the values lying within a certain percentile range
    Args:
        x: The considered ndarray.
        N1: Lower percentile (between 0 and 1)
        N2: Upper percentile (between 0 and 1)
    Returns:
        The IP-mean, NaN when x holds no valid value
    Raises:
        Nothing
    """
    values = np.asarray(x, dtype=float)
    values = values[~np.isnan(values)]
    if values.size == 0:
        return np.nan
    # N1/N2 are fractions, so np.quantile is the right call (np.percentile expects 0-100).
    low, high = np.quantile(values, [N1, N2])
    return float(np.mean(values[(values >= low) & (values <= high)]))


def median_absolute_deviation(x: np.ndarray) -> float:
    """
    Calculates the median of the absolute deviations from the median, ignoring NaN
    Args:
        x: The considered ndarray.
    Returns:
        The MAD
    Raises:
        Nothing
    """
    center = np.nanmedian(x)
    return np.nanmedian(np.abs(x - center))


def inter_percentile_range(x: np.ndarray, N1: Optional[float] = 0.1, N2: Optional[float] = 0.9) -> float:
    """
    Calculates the distance between two percentiles
    Args:
        x: The considered ndarray.
        N1: Lower percentile (between 0 and 1)
        N2: Upper percentile (between 0 and 1)
    Returns:
        The IP-range, NaN when x holds no valid value
    Raises:
        Nothing
    """
    values = np.asarray(x, dtype=float)
    values = values[~np.isnan(values)]
    if values.size == 0:
        return np.nan
    low, high = np.quantile(values, [N1, N2])
    return float(high - low)


def mode(x: np.ndarray) -> Any:
    """
    Calculates the mode of numeric and categorical variables, ignoring missing values
    Args:
        x: The considered ndarray.
    Returns:
        The mode (the first one returned by pandas when several values are tied),
        NaN when x holds no valid value
    Raises:
        Nothing
    """
    modes = pd.Series(x).mode(dropna=True)
    return modes.iat[0] if len(modes) else np.nan


def geothmetic_meandian(x: np.ndarray, iter: Optional[int] = 100) -> float:
    """
    https://xkcd.com/2435/
    Repeatedly replaces the data by (mean, geometric mean, median) of itself.
    Args:
        x: The considered ndarray.
        iter: Maximum number of reductions. With 0, the first element of x is returned untouched.
    Returns:
        The geothmetic meandian
    Raises:
        Nothing
    """
    if iter == 0:
        return x[0]
    current = x
    for _ in range(iter):
        current = np.array([
            np.nanmean(current),
            stats.gmean(_drop_nan(current)),
            np.nanmedian(current),
        ])
        # The three estimates have met: further reductions would not move them.
        if current.max() == current.min():
            break
    return current[0]


def get_n_outliers(x: np.ndarray, n_sig: Union[float, int] = 3) -> int:
    """
    Calculates the amount of outliers with the zscore method.
    Args:
        x: The considered ndarray.
        n_sig: Number of standard deviations before being considered an outlier
    Returns:
        The number of outliers
    Raises:
        Nothing
    """
    z_scores = np.abs(stats.zscore(x, nan_policy='omit'))
    return int(np.sum(z_scores > n_sig))


# Registry of the aggregations addressable by name. Every entry takes the data as its
# first argument; a few take extra keyword arguments ("PN", "autocorrelation", the tests).
AGGFUNCCODES = {
    # Counts
    "size": len,
    "non-null": lambda x: len(x) - pd.isna(x).sum(),
    "nunique": lambda x: pd.Series(x).nunique(dropna=True),

    # Basic
    "sum": lambda x: np.nansum(x) if np.issubdtype((x.to_numpy() if isinstance(x, pd.Series) else x).dtype, np.number) else np.nan,
    "min": np.nanmin,
    "max": np.nanmax,
    # np.asarray makes [0] / [-1] positional even when x is a Series
    "first": lambda x: np.asarray(_drop_nan(x))[0],
    "last": lambda x: np.asarray(_drop_nan(x))[-1],

    # Centricity
    "mean": np.nanmean,
    "median": np.nanmedian,
    "mode": mode,
    "gmean": lambda x: stats.gmean(_drop_nan(x)),
    "hmean": lambda x: stats.hmean(_drop_nan(x)),
    "Pmean": inter_percentile_mean,
    "geothmetic meandian": geothmetic_meandian,

    # Dispersion
    "variance": np.nanvar,
    "std": np.nanstd,
    "mad": median_absolute_deviation,
    "skewness": lambda x: stats.skew(x, nan_policy='omit'),
    # Excess kurtosis is Fisher's definition (0 for a normal distribution)
    "excesskurtosis": lambda x: stats.kurtosis(x, fisher=True, nan_policy='omit'),
    "range": lambda x: np.nanmax(x) - np.nanmin(x),
    "Prange": inter_percentile_range,
    "n_outliers": get_n_outliers,

    # Percentiles
    "P75": lambda x: np.percentile(_drop_nan(x), 75),
    "P25": lambda x: np.percentile(_drop_nan(x), 25),
    "P10": lambda x: np.percentile(_drop_nan(x), 10),
    "P90": lambda x: np.percentile(_drop_nan(x), 90),
    "PN": lambda x, N: np.percentile(_drop_nan(x), N),

    # Distribution
    "skewtest": lambda x, **kwargs: stats.skewtest(x, nan_policy='omit', **kwargs),
    "kurtosistest": lambda x, **kwargs: stats.kurtosistest(x, nan_policy='omit', **kwargs),
    "normaltest": lambda x, **kwargs: stats.normaltest(x, nan_policy='omit', **kwargs),
    # NaN are dropped by hand for the tests below: nan_policy is not accepted by
    # every SciPy release for them (and never by anderson).
    "jarque_bera": lambda x, **kwargs: stats.jarque_bera(_drop_nan(x), **kwargs),
    "shapiro": lambda x, **kwargs: stats.shapiro(_drop_nan(x), **kwargs),
    "anderson": lambda x, **kwargs: stats.anderson(_drop_nan(x), **kwargs),

    # Other
    "energy": lambda x: np.nansum(x**2),
    "rms": lambda x: np.sqrt(np.nanmean(x**2)),
    "entropy": lambda x: stats.entropy(pd.Series(x).value_counts(normalize=True), base=2),
    "autocorrelation": lambda x, lag=1: pd.Series(x).autocorr(lag=lag)
}


def execute_agg_func(x: Any, agg: Union[Callable, str, dict]) -> Any:
    """
    Executes a given aggregation function on a given data.
    Accepts a callable, a str naming an entry of AGGFUNCCODES, or a dict with either of those
    at key 'func' and, optionally, 'kwargs' (extra keyword arguments) and 'ret_names'.
    Args:
        x: The data to execute the function on.
        agg: The function to execute.
    Returns:
        The raw return value, or, when the function returned a tuple, a dict naming each value.
        Names come from 'ret_names' when provided and default to a, b, c,...
    Raises:
        KeyError: If a name is not a key of AGGFUNCCODES, or a dict has no 'func' key.
        TypeError: If no callable can be derived from agg.
    """
    func, kwargs, ret_names = agg, {}, None
    if isinstance(agg, dict):
        func = agg['func']
        kwargs = agg.get('kwargs') or {}
        ret_names = agg.get('ret_names')
    if isinstance(func, str):
        func = AGGFUNCCODES[func]
    if not callable(func):
        raise TypeError(f"Cannot derive an aggregation function from {agg!r}")

    ret = func(x, **kwargs)
    if isinstance(ret, tuple):
        if ret_names is None:
            ret_names = [chr(ord('a') + k) for k in range(len(ret))]
        return dict(zip(ret_names, ret))
    return ret


def calculate_aggregates(x: Any, aggs: list[Union[Callable, str, dict[str, Union[str, Callable]]]]) -> dict[str, Any]:
    """
    Executes a given list of aggregation functions on a given data.
    Args:
        x: The data to execute the functions on.
        aggs: The functions to execute (see execute_agg_func for the accepted forms).
    Returns:
        A dict containing named values. A result is named after 'name' (dict form), else after the
        AGGFUNCCODES key, else "func_<position>". A repeated implicit name gets "_<position>" appended.
        Multi-valued results are flattened as "<name>_<value name>". A failing aggregation yields NaN.
    Raises:
        Nothing
    """
    import warnings

    results = {}
    used_names = set()
    for i, func in enumerate(aggs):
        try:
            ret = execute_agg_func(x, func)
        except Exception as e:
            # Best effort: one failing statistic must not discard the others.
            warnings.warn(f"agg_{i} {func!r} failed: {e}", RuntimeWarning)
            ret = np.nan

        explicit = isinstance(func, dict) and 'name' in func
        if explicit:
            funcname = func['name']
        elif isinstance(func, dict) and isinstance(func.get('func'), str):
            funcname = func['func']
        elif isinstance(func, str):
            funcname = func
        else:
            funcname = f"func_{i}"
        if not explicit and funcname in used_names:
            funcname = f"{funcname}_{i}"
        used_names.add(funcname)

        if isinstance(ret, dict):
            results.update({f"{funcname}_{key}": value for key, value in ret.items()})
        else:
            results[funcname] = ret
    return results
def build_univariate_statistics(df: pd.DataFrame, agg: Optional[Union[str, list[Union[str, dict[str, dict]]]]] = "all", n_jobs: int = 1) -> pd.DataFrame:
    """
    Calculates specified univariate statistics for each column in the DataFrame.
    Args:
        df: The input DataFrame.
        agg: Aggregations to apply. "all" (or None) means every entry of AGGFUNCCODES; a single
            name is accepted as well. Each list element can be a name (str), a callable, or a dict
            as understood by execute_agg_func.
        n_jobs: number of parallel processes to open. -1 means as many as possible.
    Returns:
        pd.DataFrame: One row per column of df, one column per computed value.
    Raises:
        Nothing
    """
    if agg is None or (isinstance(agg, str) and agg == "all"):
        agg = list(AGGFUNCCODES)
    elif isinstance(agg, str):
        # A lone name must be wrapped, otherwise it would be iterated character by character.
        agg = [agg]
    results = Parallel(n_jobs=n_jobs)(
        delayed(calculate_aggregates)(df[col].values, agg) for col in df.columns
    )
    return pd.DataFrame(results, index=df.columns)
class dfTransformer(BaseEstimator, TransformerMixin):
    """
    Applies an ordered list of column-wise transformers to a DataFrame.
    Each registered transform replaces its source column by the columns it produces.
    A nested dfTransformer is applied to the whole DataFrame instead of a single column.
    """

    def __init__(self):
        # List of (column_name, transformer, result_columns) tuples, applied in order.
        self.transforms = []

    def add_transform(self, column_name: str, transformer: TransformerMixin, result_columns: Optional[list[str]] = None):
        """
        Adds a transform specific to a column with optional result column names.

        Args:
            column_name (str): The name of the column to transform. For dfTransformer, use None.
            transformer (TransformerMixin): The transformation object to apply to the column. Must have a .transform method.
            result_columns (list of str, optional): List of names for the resulting columns. Default is None.

        Raises:
            ValueError: If the transformer has no 'transform' method.
        """
        if not hasattr(transformer, 'transform'):
            raise ValueError("The transformer must have a 'transform' method.")
        self.transforms.append((column_name, transformer, result_columns))

    def fit(self, X: pd.DataFrame, y=None):
        """
        Fit method to conform with TransformerMixin. Fits transformers one by one on specified columns.
        Transforms whose column is absent from X are skipped.

        Args:
            X (pd.DataFrame): The DataFrame to fit.
            y: Passed through to every transformer's fit.

        Returns:
            self: Fitted transformer.
        """
        for column_name, transformer, _ in self.transforms:
            if isinstance(transformer, dfTransformer):
                transformer.fit(X, y)
            elif column_name in X:
                transformer.fit(X[[column_name]], y)  # Fit the transformer on the specific column
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Applies all stored transformations to a copy of the DataFrame, drops the original columns,
        and returns the transformed DataFrame. Produced columns are appended at the end.

        Args:
            X (pd.DataFrame): The DataFrame to transform.

        Returns:
            pd.DataFrame: The transformed DataFrame.
        """
        X_transformed = X.copy()

        for column_name, transformer, result_columns in self.transforms:
            if isinstance(transformer, dfTransformer):
                # Nested transformers work on the whole frame; as in fit, the column name is ignored.
                X_transformed = transformer.transform(X_transformed)
                continue
            if column_name not in X_transformed:
                continue

            transformed_data = transformer.transform(X_transformed[[column_name]])

            # DataFrame outputs are kept as they are; anything else is wrapped and named.
            if isinstance(transformed_data, pd.DataFrame):
                transformed_cols = transformed_data
            else:
                if hasattr(transformed_data, "toarray"):
                    # Sparse matrices (e.g. default one-hot output) must be densified first.
                    transformed_data = transformed_data.toarray()
                transformed_cols = pd.DataFrame(transformed_data, index=X_transformed.index)
                if result_columns:
                    transformed_cols.columns = result_columns
                else:
                    transformed_cols.columns = [
                        f"{column_name}_transformed_{i}" for i in range(transformed_cols.shape[1])
                    ]

            X_transformed = pd.concat(
                [X_transformed.drop(columns=[column_name]), transformed_cols], axis=1
            )

        return X_transformed
def _get_encoder(encoding):
    """
    Builds a fresh sklearn encoder from its short name.
    Args:
        encoding: 'onehot', 'label' or 'ordinal'.
    Returns:
        An unfitted OneHotEncoder (dense output), LabelEncoder or OrdinalEncoder.
    Raises:
        ValueError: If the name is not one of the supported encodings.
    """
    if encoding == 'onehot':
        return OneHotEncoder(sparse_output=False)
    if encoding == 'label':
        return LabelEncoder()
    if encoding == 'ordinal':
        return OrdinalEncoder()
    raise ValueError("Unsupported encoding type.")


def _get_encoded_col_names(encoder, prefix="", suffix="", ret_shape: Optional[list] = None):
    """
    Builds the names of the columns produced by an encoder for one source column.
    Args:
        encoder: The fitted encoder. Its categories_ attribute, when present, provides the labels.
        prefix: First part of every name (usually the source column name). Skipped when empty.
        suffix: Second part of every name. Skipped when empty.
        ret_shape: Shape of the encoder output. None means unknown. It is never modified.
    Returns:
        One name per produced column: "<prefix>_<suffix>_<category>" when the encoder produced
        one column per category, "<prefix>_<suffix>" for a single column and
        "<prefix>_<suffix>_<i>" otherwise.
    Raises:
        Nothing
    """
    def build(*parts):
        return "_".join(str(part) for part in parts if len(str(part)) > 0)

    categories = list(encoder.categories_[0]) if hasattr(encoder, "categories_") else None

    if ret_shape is None:
        # Unknown output width: trust the categories when there are some.
        width = len(categories) if categories is not None else 1
    else:
        width = ret_shape[1] if len(ret_shape) > 1 else 1

    # Category labels only make sense when there is one output column per category
    # (an ordinal encoder exposes categories_ but returns a single column).
    if categories is not None and len(categories) == width:
        return [build(prefix, suffix, cat) for cat in categories]
    if width == 1:
        return [build(prefix, suffix)]
    return [build(prefix, suffix, i) for i in range(width)]
def df_to_numeric(df, encoding='onehot'):
    """
    Processes a DataFrame by applying a categorical encoding to non-numeric columns and converting
    every resulting column to float.

    Args:
        df (pd.DataFrame): The input DataFrame to process. It is not modified.
        encoding (str or transformer): The encoding to apply to non-numeric columns. Either one of the
            names understood by _get_encoder, or an object exposing fit_transform / transform.
            It is used as a template: every column gets its own copy.

    Returns:
        tuple: A tuple containing the transformed DataFrame and the fitted dfTransformer object
        able to replay the same steps.

    Raises:
        ValueError: If encoding is neither a string nor a usable transformer.
    """
    import copy

    transformer = dfTransformer()
    X_transformed = df.copy()

    if isinstance(encoding, str):
        template = _get_encoder(encoding)
    elif hasattr(encoding, "fit_transform") or callable(encoding):
        # Transformer instances are not callable, so they are recognised by their interface.
        template = encoding
    else:
        raise ValueError("Encoding must be either a string or a callable transformer.")

    for column in X_transformed.columns:
        if pd.api.types.is_numeric_dtype(df[column]):
            continue
        # One encoder per column: refitting a shared instance would leave every registered
        # transform pointing at an encoder fitted on the last column only.
        encoder = copy.deepcopy(template)
        transformed_data = encoder.fit_transform(X_transformed[[column]])
        result_columns = _get_encoded_col_names(encoder, prefix=column, suffix="encoded", ret_shape=list(transformed_data.shape))
        if isinstance(transformed_data, pd.DataFrame):
            transformed_cols = transformed_data
        else:
            transformed_cols = pd.DataFrame(transformed_data, index=X_transformed.index)
            transformed_cols.columns = result_columns

        transformer.add_transform(column, encoder, result_columns=result_columns)
        X_transformed = pd.concat([X_transformed.drop(columns=[column]), transformed_cols], axis=1)

    # Stateless cast, so a single instance can be registered for every column.
    to_float = FunctionTransformer(lambda x: x.astype(float), validate=False)
    for column in X_transformed.columns:
        transformed_data = to_float.fit_transform(X_transformed[[column]])
        transformer.add_transform(column, to_float, result_columns=[column])
        X_transformed = pd.concat([X_transformed.drop(columns=[column]), transformed_data], axis=1)

    return X_transformed, transformer
def gaussian(x:Union[np.ndarray, float], mu:float, sig:float) -> Union[np.ndarray, float]:
    """
    Calculates the height of a specified gaussian at point x
    Args:
        x: point(s) at which to calculate the height
        mu: The gaussian's mean
        sig: The gaussian's standard deviation (must not be 0)
    Returns:
        The height(s), as unique number or ndarray if x is ndarray
    Raises:
        Nothing
    """
    exponent = -(x - mu) ** 2 / (2 * sig ** 2)
    return np.exp(exponent) / (sig * np.sqrt(2 * np.pi))


def gauss_integral(mu: float, sig: float, a: Optional[float]=-np.inf, b: Optional[float]=np.inf) -> float:
    """
    Calculates the definite gaussian integral between a and b.
    Args:
        mu: The gaussian's mean
        sig: The gaussian's standard deviation. 0 means all the mass sits at mu.
        a: lower bound, -inf if None
        b: upper bound, inf if None
    Returns:
        The definite integral
    Raises:
        Nothing
    """
    lower = -np.inf if a is None else a
    upper = np.inf if b is None else b
    if sig == 0:
        # Degenerate distribution: the whole mass is inside or outside [a, b).
        return 1.0 if lower <= mu < upper else 0.0
    scale = sig * math.sqrt(2)
    return (math.erf((upper - mu) / scale) - math.erf((lower - mu) / scale)) / 2


def _get_shape(x: Any) -> Optional[tuple]:
    """
    Returns the shape of a given object
    Args:
        x: Any object. Its own shape attribute is preferred; otherwise the shape of
            its numpy conversion is used.
    Returns:
        shape of x, or None when x has no shape and cannot be converted to a numpy array
    Raises:
        Nothing
    """
    if hasattr(x, "shape"):
        return x.shape
    if _is_convertible_to_numpy_array(x):
        return np.array(x).shape
    return None
def _verify_tabular_data_shape(*args: Any, is_column: bool = False):
    """
    Verifies that the shape of the given objects are coherent for tabular data.
    Args:
        *args: shape[n,m] or [n,]; Tabular data.
        is_column (optional) [default=False]: Set to True to raise an error if an object contains multiple columns.
    Returns:
        Nothing
    Raises:
        ValueError: If one or more of the given objects is not coherent with tabular data.
        ValueError: If is_column is true and one or more of the given objects have multiple columns.
    """
    for arg in args:
        shape = _get_shape(arg)
        if shape is None:
            raise ValueError(f"Input data has no shape: {arg}.")
        if len(shape) < 1 or len(shape) > 2:
            raise ValueError(f"Input data must be a tabular object. Has shape {shape}.")
        # A column is [n,] or [n,1]; a single row of several columns ([1,m]) is not one.
        if is_column and len(shape) == 2 and shape[1] != 1:
            raise ValueError(f"Input data must be a single column. Has shape {shape}.")


def _verify_same_number_of_rows(*args):
    """
    Verifies that the given objects have the same number of rows.
    Args:
        *args: shape[n,m] or [n,]
    Returns:
        Nothing
    Raises:
        ValueError: If one or more of the given objects has no rows (no shape, or a 0-dimensional one).
        ValueError: If two objects have different amounts of rows.
    """
    n_rows = None
    for arg in args:
        shape = _get_shape(arg)
        # The shape has to be checked before it is indexed: None and () have no row count.
        if not shape:
            raise ValueError(f"Input data has no shape: {arg}.")
        if n_rows is None:
            n_rows = shape[0]
        elif shape[0] != n_rows:
            raise ValueError(f"Input objects must have the same number of rows {n_rows}, {shape[0]}.")
+ Raises: + TypeError: If the input is neither a numpy array nor a pandas dataframe + """ + if isinstance(x, np.ndarray): + return x[ind_list] + elif isinstance(x, pd.DataFrame): + return x.iloc[ind_list] + elif isinstance(x, pd.Series): + return x.iloc[ind_list] + else: + raise TypeError("Input must be a numpy array, pandas DataFrame, or pandas Series") + + +def sample_rows(*args: Any, sample: Optional[int] = None, shuffle: bool = True) -> tuple[Any]: + """ + Samples rows of the provided objects in the same way and optionally shuffles them. + Tries to minimize the amount of rows containing nan. + Args: + *args: Input tabular data objects. + sample: Number of samples to draw from each object. If None, no sampling is done. + shuffle: If True and sample is None, shuffles the objects. + + Returns: + Tuple of shuffled and/or sampled objects. + + Raises: + ValueError: If input objects don't have the same number of rows. + """ + _verify_same_number_of_rows(*args) + n_rows = _get_shape(args[0])[0] + nan_mask = nan_rows_mask(*args) + full_indices = np.where(~nan_mask)[0] + n_full_rows = len(full_indices) + if sample is not None and sample < n_full_rows: + indices = np.random.choice(full_indices, size=sample, replace=False) + if not shuffle: + indices.sort() + elif sample is not None and sample < n_rows: + indices = np.concatenate((np.random.choice(full_indices, size=n_full_rows, replace=False), np.random.choice(np.where(nan_mask)[0], size=sample - n_full_rows, replace=False))) + if not shuffle: + indices.sort() + else: + indices = np.arange(n_rows) + if shuffle: + indices = np.random.choice(indices, size=n_rows, replace=False) + + results = tuple(_sample(arg, indices) for arg in args) + return results + + +def nan_rows_mask(*args: Any) -> np.ndarray: + """ + Given a list of 2D numpy arrays or DataFrames with the same number of rows, return a boolean mask that is True for every row where at least one of the objects has a NaN value. 
+ Args: + *data_list: List of 2D numpy arrays or DataFrames of same amount of rows. + Returns: + Boolean mask indicating rows with at least one NaN. + Raises: + Nothing + """ + # Initialize the mask with False values + _verify_same_number_of_rows(*args) + n_rows = _get_shape(args[0])[0] + mask = np.zeros(n_rows, dtype=bool) + for data in args: + if isinstance(data, np.ndarray): + if data.ndim == 1: + data = data.reshape(-1,1) + mask |= np.isnan(data).any(axis=1) + elif isinstance(data, pd.DataFrame): + mask |= data.isna().to_numpy().any(axis=1) + elif isinstance(data, pd.Series): + mask |= data.isna().to_numpy() + else: + data = np.array(data) + if data.ndim == 1: + data = data.reshape(-1,1) + mask |= np.isnan(data).any(axis=1) + return mask + + +def _to_series(data: Union[pd.Series, pd.DataFrame, np.ndarray]): + """ + Convert + Args: + *data_list: List of 2D numpy arrays or DataFrames of same amount of rows. + Returns: + Boolean mask indicating rows with at least one NaN. + Raises: + Nothing + """ + if isinstance(data, pd.Series): + return data + elif isinstance(data, pd.DataFrame): + if data.shape[1] != 1: + raise ValueError("DataFrame must have exactly one column to convert to Series") + return data.iloc[:, 0] + elif isinstance(data, np.ndarray): + if data.ndim == 1: + return pd.Series(data) + elif data.ndim == 2 and data.shape[1] == 1: + return pd.Series(data.ravel()) + else: + raise ValueError("ndarray must be 1-dimensional or a 2-dimensional single column array") + + +def _is_convertible_to_numpy_array(obj: Any) -> bool: + """ + Verifies a given object is convertible to a numpy array without error + Args: + obj: object to check + Returns: + bool + Raises: + Nothing + """ + if isinstance(obj, (list, tuple, dict, set)): + return True + if np.isscalar(obj): + return True + if hasattr(obj, '__array__'): + return True + return False + + +def split_rows(data: Union[pd.DataFrame, np.ndarray], bool_array: np.ndarray, drop_index: bool = True) -> 
list[Union[pd.DataFrame, np.ndarray]]: + """ + Splits a pandas DataFrame or a numpy array based on a boolean array indicator. + Args: + data : The input data to split. Can be a pandas DataFrame or a numpy ndarray. + bool_array : A 1D boolean array indicating where splits should occur. Must be the same length as `data`. + drop_index : Whether to reset the index in the resulting DataFrame splits. Default is True. + Returns: + A list of the resulting split pd.DataFrames np.ndarrays. + Raises + ValueError: If the length of `bool_array` does not match the length of `data`. + """ + if len(bool_array) != len(data): + raise ValueError("The length of bool_array must match the length of data.") + + indices = np.where(bool_array)[0] + indices = np.concatenate(([0], indices, [len(data)])) + + if isinstance(data, pd.DataFrame): + return [data.iloc[start:end].reset_index(drop=drop_index) + for start, end in zip(indices[:-1], indices[1:]) + if start != end] + elif isinstance(data, np.ndarray): + return [data[start:end] + for start, end in zip(indices[:-1], indices[1:]) + if start != end] diff --git a/notebooks/demo_ppscore.ipynb b/notebooks/demo_ppscore.ipynb new file mode 100644 index 0000000..cc7ab04 --- /dev/null +++ b/notebooks/demo_ppscore.ipynb @@ -0,0 +1,818 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "c:\\Users\\Edouard\\Documents\\Git\\microwave\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\Edouard\\Documents\\Git\\microwave\\.venv_microwave\\Lib\\site-packages\\IPython\\core\\magics\\osm.py:417: UserWarning: This is now an optional IPython functionality, setting dhist requires you to install the `pickleshare` library.\n", + " self.shell.db['dhist'] = compress_dhist(dhist)[-100:]\n" + ] + } + ], + "source": [ + "%cd ..\n", + "import microwave.data_analysis.ppscore as pps\n", + "from sklearn.metrics import 
mean_squared_error\n", + "from sklearn.tree import DecisionTreeRegressor\n", + "import seaborn as sns\n", + "import pandas as pd\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABCDE
002102
110100
220100
301001
410110
500200
600210
701111
820110
922202
1020120
1110120
1221021
1312122
1421001
\n", + "
" + ], + "text/plain": [ + " A B C D E\n", + "0 0 2 1 0 2\n", + "1 1 0 1 0 0\n", + "2 2 0 1 0 0\n", + "3 0 1 0 0 1\n", + "4 1 0 1 1 0\n", + "5 0 0 2 0 0\n", + "6 0 0 2 1 0\n", + "7 0 1 1 1 1\n", + "8 2 0 1 1 0\n", + "9 2 2 2 0 2\n", + "10 2 0 1 2 0\n", + "11 1 0 1 2 0\n", + "12 2 1 0 2 1\n", + "13 1 2 1 2 2\n", + "14 2 1 0 0 1" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.DataFrame(np.random.randint(0,3,size=(15, 4)), columns=list('ABCD'))\n", + "df['E'] = df['B']\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ppscorecasemetricperfect_scorenaive_scoremodel_scoremodel
00.128681regressionmean_squared_error0.01.0666670.929407DecisionTreeRegressor
\n", + "
" + ], + "text/plain": [ + " ppscore case metric perfect_score naive_score \\\n", + "0 0.128681 regression mean_squared_error 0.0 1.066667 \n", + "\n", + " model_score model \n", + "0 0.929407 DecisionTreeRegressor " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pps.score(df['A'], df['B'], metric=mean_squared_error, model=DecisionTreeRegressor())" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
xppscorecasemetricperfect_scorenaive_scoremodel_scoremodel
0A1.0regressionmean_squared_error0.00.7333330.000000DecisionTreeRegressor
1B0.0regressionmean_squared_error0.00.7333331.211852DecisionTreeRegressor
2C0.0regressionmean_squared_error0.00.7333331.109609DecisionTreeRegressor
3D0.0regressionmean_squared_error0.00.7333330.866667DecisionTreeRegressor
4E0.0regressionmean_squared_error0.00.7333331.211852DecisionTreeRegressor
\n", + "
" + ], + "text/plain": [ + " x ppscore case metric perfect_score naive_score \\\n", + "0 A 1.0 regression mean_squared_error 0.0 0.733333 \n", + "1 B 0.0 regression mean_squared_error 0.0 0.733333 \n", + "2 C 0.0 regression mean_squared_error 0.0 0.733333 \n", + "3 D 0.0 regression mean_squared_error 0.0 0.733333 \n", + "4 E 0.0 regression mean_squared_error 0.0 0.733333 \n", + "\n", + " model_score model \n", + "0 0.000000 DecisionTreeRegressor \n", + "1 1.211852 DecisionTreeRegressor \n", + "2 1.109609 DecisionTreeRegressor \n", + "3 0.866667 DecisionTreeRegressor \n", + "4 1.211852 DecisionTreeRegressor " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pps.predictors(df, df['A'], metric=mean_squared_error, model=DecisionTreeRegressor())" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
xyppscorecasemetricperfect_scorenaive_scoremodel_scoremodel
0BE1.000000regressionmean_squared_error0.01.0666670.000000DecisionTreeRegressor
1EB1.000000regressionmean_squared_error0.01.0666670.000000DecisionTreeRegressor
2AB0.197917regressionmean_squared_error0.01.0666670.855556DecisionTreeRegressor
3AE0.197917regressionmean_squared_error0.01.0666670.855556DecisionTreeRegressor
4EC0.197279regressionmean_squared_error0.00.4000000.321088DecisionTreeRegressor
5BC0.197279regressionmean_squared_error0.00.4000000.321088DecisionTreeRegressor
6DB0.154167regressionmean_squared_error0.01.0666670.902222DecisionTreeRegressor
7DE0.154167regressionmean_squared_error0.01.0666670.902222DecisionTreeRegressor
8CB0.134706regressionmean_squared_error0.01.0666670.922980DecisionTreeRegressor
9CE0.134706regressionmean_squared_error0.01.0666670.922980DecisionTreeRegressor
10CA0.000000regressionmean_squared_error0.00.7333331.219813DecisionTreeRegressor
11BA0.000000regressionmean_squared_error0.00.7333331.210204DecisionTreeRegressor
12EA0.000000regressionmean_squared_error0.00.7333331.210204DecisionTreeRegressor
13DA0.000000regressionmean_squared_error0.00.7333331.200963DecisionTreeRegressor
14DC0.000000regressionmean_squared_error0.00.4000000.549704DecisionTreeRegressor
15AC0.000000regressionmean_squared_error0.00.4000000.550000DecisionTreeRegressor
16ED0.000000regressionmean_squared_error0.00.7333331.107861DecisionTreeRegressor
17CD0.000000regressionmean_squared_error0.00.7333331.070089DecisionTreeRegressor
18BD0.000000regressionmean_squared_error0.00.7333331.107861DecisionTreeRegressor
19AD0.000000regressionmean_squared_error0.00.7333331.096296DecisionTreeRegressor
\n", + "
" + ], + "text/plain": [ + " x y ppscore case metric perfect_score \\\n", + "0 B E 1.000000 regression mean_squared_error 0.0 \n", + "1 E B 1.000000 regression mean_squared_error 0.0 \n", + "2 A B 0.197917 regression mean_squared_error 0.0 \n", + "3 A E 0.197917 regression mean_squared_error 0.0 \n", + "4 E C 0.197279 regression mean_squared_error 0.0 \n", + "5 B C 0.197279 regression mean_squared_error 0.0 \n", + "6 D B 0.154167 regression mean_squared_error 0.0 \n", + "7 D E 0.154167 regression mean_squared_error 0.0 \n", + "8 C B 0.134706 regression mean_squared_error 0.0 \n", + "9 C E 0.134706 regression mean_squared_error 0.0 \n", + "10 C A 0.000000 regression mean_squared_error 0.0 \n", + "11 B A 0.000000 regression mean_squared_error 0.0 \n", + "12 E A 0.000000 regression mean_squared_error 0.0 \n", + "13 D A 0.000000 regression mean_squared_error 0.0 \n", + "14 D C 0.000000 regression mean_squared_error 0.0 \n", + "15 A C 0.000000 regression mean_squared_error 0.0 \n", + "16 E D 0.000000 regression mean_squared_error 0.0 \n", + "17 C D 0.000000 regression mean_squared_error 0.0 \n", + "18 B D 0.000000 regression mean_squared_error 0.0 \n", + "19 A D 0.000000 regression mean_squared_error 0.0 \n", + "\n", + " naive_score model_score model \n", + "0 1.066667 0.000000 DecisionTreeRegressor \n", + "1 1.066667 0.000000 DecisionTreeRegressor \n", + "2 1.066667 0.855556 DecisionTreeRegressor \n", + "3 1.066667 0.855556 DecisionTreeRegressor \n", + "4 0.400000 0.321088 DecisionTreeRegressor \n", + "5 0.400000 0.321088 DecisionTreeRegressor \n", + "6 1.066667 0.902222 DecisionTreeRegressor \n", + "7 1.066667 0.902222 DecisionTreeRegressor \n", + "8 1.066667 0.922980 DecisionTreeRegressor \n", + "9 1.066667 0.922980 DecisionTreeRegressor \n", + "10 0.733333 1.219813 DecisionTreeRegressor \n", + "11 0.733333 1.210204 DecisionTreeRegressor \n", + "12 0.733333 1.210204 DecisionTreeRegressor \n", + "13 0.733333 1.200963 DecisionTreeRegressor \n", + "14 0.400000 0.549704 
DecisionTreeRegressor \n", + "15 0.400000 0.550000 DecisionTreeRegressor \n", + "16 0.733333 1.107861 DecisionTreeRegressor \n", + "17 0.733333 1.070089 DecisionTreeRegressor \n", + "18 0.733333 1.107861 DecisionTreeRegressor \n", + "19 0.733333 1.096296 DecisionTreeRegressor " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pps_mat = pps.matrix(df, metric=mean_squared_error, model=DecisionTreeRegressor())\n", + "pps_mat" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAhcAAAG2CAYAAADMcaSeAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjAsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvlHJYcgAAAAlwSFlzAAAPYQAAD2EBqD+naQAAQcVJREFUeJzt3Qd4FNX6x/FfaKFJCQkJcFFQFP5K0yBFrIgCUr2giAIRhIsgqERUQkdFFKQoBlCkWOgoHUHKVVB66EKAUKQmIQQIUgJk9//McFlYCJiE2Wx29/u5zzxmzp6ZnZy7G959z3tm/ex2u10AAAAWyWbViQAAAAwEFwAAwFIEFwAAwFIEFwAAwFIEFwAAwFIEFwAAwFIEFwAAwFIEFwAAwFIEFwAAwFIEFwAAwFIEFwAAeKnly5erYcOGKl68uPz8/DRr1qx/PObXX3/VQw89JH9/f5UpU0YTJkxI9/MSXAAA4KXOnDmjSpUqKTIyMk399+3bp/r16+upp57Spk2b9Pbbb6tdu3ZatGhRup7Xjy8uAwDA+/n5+WnmzJlq0qTJTfu8//77mj9/vrZt2+Zoe+mll3Ty5EktXLgwzc9F5gIAAA+RnJyspKQkp81os8qqVatUu3Ztp7Y6deqY7emRw7IrgtfIkauEuy8BADLNpQuHXf4cFxP2WnKegV9+p/79+zu19e3bV/369bPk/LGxsQoODnZqM/aNIObcuXPKkydPms5DcAEAgIeIiIhQeHi4U5tReJnVEFwAAOBqthRLTmMEEq4MJkJCQhQXF+fUZuwXKFAgzVkLA8EFAACuZrfJE9SoUUMLFixwalu8eLHZnh4UdAIA4Go2mzVbOv3999/mklJju7LU1Pj5wIEDjmmW1q1bO/q//vrr2rt3r9577z1FR0dr5MiRmjZtmrp27Zqu5yW4AADAS61fv14PPviguRmMeg3j5z59+pj7R48edQQahtKlS5tLUY1shXF/jCFDhuibb74xV4ykB/e5wA1YLQLAl2TGapELR/605Dy5ij8gT0DNBQAArmbzjJoLqzAtAgAALEXmAgAAV7P7VuaC4AIAAA+5z4WnYFoEAABYiswFAACuZmdaBAAAWMnmW8EF0yIAAMBSZC4AAHAxO9MiAADAUjaCCwAAYCW7bwUX1FwAAABLkbkAAMDVbL51Ey2CCwAAXM3OtAgAAECGkbkAAMDVbL6VuSC4AADA1ey+FVwwLQIAACxF5gIAAFez+VbmguACAAAXs9t9aykq0yIAAMBSZC4AAHA1O9MiAADASjaCCwAAYCW7bwUX1FzArTq+HqaYXav1d9Ierfx9rh6uUvmW/Zs2baBtW38z+2/csET16ta6oU+/vt108K8NOn0qRot+nqIyZUrLGz
F2t4fxyzjGDv+E4AJu88ILjfTZ4L768KOherhaXW3esl0L5k9UUFCRVPvXqF5FE7+P1Pjxk1Wlah3NmbNIP84YqwceKOvo8263Tur8Rlt16txdjzzaUGfOntWCeRPl7+8vb8LY3R7GL+MYu9v44jKbBZuH8LPb7XZlUdu2bVP58uXdfRk+J0euEpnyPMYnnnXrN+utt3uZ+35+ftq/d50iR47XoMGRN/SfNHGU8uXNq8bPhzna/lgxV5s2/6k3Onc3941PPsOGf6Whw74y9wsUuENHDm1S23ZdNW3aHHkLxu72MH4Z541jd+nCYZc/x/m10y05T+6qL8gTZLnMxenTp/X111+ratWqqlSpkrsvBy6SM2dOPfRQRS1dtsLRZsS5S5f9rurVQ1M9pnq1UKf+hl8W/+roX7r0nSpWLNg8xxVJSae1du1G81hvwdjdHsYv4xg7eFxwsXz5coWFhalYsWL67LPPVKtWLa1evdrdlwUXCQwMUI4cORQfl+DUHh9/TCHBQakeExISpLj4Y05tcXEJjv4hwUX/13Zdn/gEhYRcfswbMHa3h/HLOMbuNleL2CzYPIRbV4vExsZqwoQJGjt2rJKSkvTiiy8qOTlZs2bN0v3335+mcxj9je1axjydV83VAQA8m91zAgOPzlw0bNhQZcuW1ZYtWzR8+HAdOXJEI0aMSPd5Bg4cqIIFCzptRhuytoSERF26dElFgwOd2osWDVLsdZ9groiNPabgos6fjoKDAx39Y+Pi/9d2XZ+igYqNvfyYN2Dsbg/jl3GMHbJ8cPHzzz/rtddeU//+/VW/fn1lz549Q+eJiIjQqVOnnDajDVnbxYsXtWHDFtV66lFHm1EYZuyvXh2V6jGr10SpVq2r/Q21n37c0X/fvgM6ejTO6Zx33JFfVas+aB7rLRi728P4ZRxjdxtsTItkit9//92cDgkNDdX//d//qVWrVnrppZfSfR6mQDzXsM/HaPzYYYrasEXr1m3Um13aK1++PJrw7VTz8fHjPteRI0fVs9cn5v6IEWO1bOkMdX27gxb8vETNX2ys0NCKer3Te45zfjHiG/WIeFO7Y/Zq//6D6t/vXR05EqfZsxfJmzB2t4fxyzjGLoNsnhMYeHRwUb16dXMzpkSmTp2qcePGKTw8XDabTYsXL1bJkiV1xx13uOvykAmmT5+joMAA9evTzSz62rz5T9Vv0FLx8ZeLxe4sWdx8PVyxavV6tWzdWR/0f08fffi+dsfsU9Nmr+nPP3c6+gz+bKTy5cur0SMHqVChAvrjj3Wq37DlDXU5no6xuz2MX8YxdvC4+1zs3LnTzGZ8//33OnnypJ555hnNmeM968M9RWbd5wIAsoLMuM/FueUTLDlPnsdflSfIMktRDUaB56BBg3To0CFNnjzZ3ZcDAIA1bL5Vc5GlMhfIGshcAPAlmZK5+O83lpwnz1Pt5AmyVOYCAAB4Pr5yHQAAV7N5zpSGFQguAABwNbtvBRdMiwAAAEuRuQAAwNVsvpW5ILgAAMDV7L4VXDAtAgAALEXmAgAAV7P5VuaC4AIAAFez+VZwwbQIAACwFJkLAABcze5bmQuCCwAAXM1GcAEAAKxk963ggpoLAABgKTIXAAC4ms23MhcEFwAAuJrdt4ILpkUAAIClyFwAAOBqNt/KXBBcAADgajbfCi6YFgEAAJYicwEAgKvZ7fIlBBcAALiajWkRAACADCNzAQCAq9l8K3NBcAEAgKvZCS4AAICVbL4VXFBzAQCAF4uMjFSpUqWUO3duVatWTWvXrr1l/+HDh6ts2bLKkyePSpYsqa5du+r8+fPpek6CCwAAMmMpqt2CLZ2mTp2q8PBw9e3bVxs2bFClSpVUp04dxcfHp9p/0qRJ6t69u9l/x44dGjt2rHmOHj16pOt5CS4AAMiMaRGbBVs6DR06VO3bt1ebNm10//33a/To0cqbN6/GjRuXav+VK1eqZs2aevnll81sx7PPPqsWLVr8Y7bjegQXAA
B4iOTkZCUlJTltRltqLly4oKioKNWuXdvRli1bNnN/1apVqR7zyCOPmMdcCSb27t2rBQsW6LnnnkvXdXplQWeDO+u7+xI82qULh919CR6rSrHH3H0JHss/W053X4JH+3XzN+6+BGRCQefAgQPVv39/pzZjCqNfv3439E1ISFBKSoqCg4Od2o396OjoVM9vZCyM4x599FHZ7XZdunRJr7/+OtMiAABkyaWo9tvfIiIidOrUKafNaLPKr7/+qo8//lgjR440azR++uknzZ8/Xx9++GG6zuOVmQsAALyRv7+/uaVFYGCgsmfPrri4OKd2Yz8kJCTVY3r37q1WrVqpXbt25n6FChV05swZ/ec//1HPnj3NaZW0IHMBAICL2W12S7b0yJUrl0JDQ7V06VJHm81mM/dr1KiR6jFnz569IYAwAhTzd0jHahUyFwAAeOlNtMLDwxUWFqYqVaqoatWq5j0sjEyEsXrE0Lp1a5UoUcKs5TA0bNjQXGHy4IMPmvfEiImJMbMZRvuVICMtCC4AAPBSzZs317Fjx9SnTx/FxsaqcuXKWrhwoaPI88CBA06Zil69esnPz8/87+HDhxUUFGQGFgMGDEjX8/rZ05Pn8BCsFrk98w7Md/cleCxWi2Qcq0VuD6tFMi5n4N0uf46zo7pYcp68HUfIE5C5AADA1Wxe9zn+lgguAABwNRtfXAYAAJBhZC4AAHA1m29lLgguAABwNbtv1VwwLQIAACxF5gIAAFezMS0CAACsZGNaBAAAIMPIXAAA4Gp2pkUAAICVbEyLAAAAZBiZCwAAXMzOahEAAGApm29NixBcAADganbfylxQcwEAACxF5gIAAFezMS0CAACsZGNaBAAAIMPIXAAA4Go2pkUAAICV7EyLAAAAZBiZCwAAXM3GtAgAALCQ3cdWixBcWKB+6/r6d4emKhxUWPt27NNXfUZr1+Zdqfat06KOajWtpbvKljL3Y7bG6LtPv72h/yvhLVXn5TrKVyCfdqzfoZE9InVk/5FM+X3gGV549Xm16tRCRYICtHv7Hg3uOVx/btqRat+77yul1997TeUqllXxksU0pM8XmjxmulOfpq2bqFlYExUrGWLu7925T98Mm6CVy9bIG/07rLFe7thcAUEBitm+R8N6j9COTdGp9i19Xym16/aqyla8zxyfz/tGato3Pzr1aRseptfeCXNq+yvmgF5+4lV5k/Wbtmr8pBnaHh2jY8cT9fnA3nr68UdueczaDVs0eMTXitn3l0KKBqlDWAs1qf+MU5/JP841z5uQeEJly9ytHl07qsL9ZV3828BVqLm4TY81fEzterfX5OGT9Fb9N83g4oMfPlTBIgVT7V+hegX9Nnu5IppHqFuTd3TsyDGzf5HgIo4+TTs2U8M2DRUZEal3GoXr/NnzZp+c/jkz8TdDVvZMo1rq2q+zxgyZoJZ12mnX9hiNmDxEhYsUSrV/7jy5deivo/pywFdKiDueap/4o/H6csBotarTTq3rttf6PzZoyPiBZmDibZ5u9KS69O2ocUO/U9u6HczgYujET1XoJuPnn8dfRw4c1aiPx9x0/Ax7o/epYeWmjq1jkzflbc6dO2/+49/znU5p6n/oSKzeeLePqj5USTMmRKrVi03U99Ph+mNNlKPPz0t+06ARX6tj21c0fdwIlS1TWh3Ce+n4iZPyqmkRmwWbhyC4uE1N2j2vRZMXasn0JTq4+6AiI75U8rnzeqb5s6n2/+ytz7Tg+/nat32vDu05pBHvfaFs2bKp0qOVHH0av9ZYU0dM1ZrFq7U/er+Gdh2igKIBqvFsjUz8zZCVvdKhuWZNnKu5Uxdo3679GvjeZzp/7rwataifav/tm6P1xYcj9cvspbpw4UKqfVYsXqk/lq3WwX2HdGDvQY38ZIzOnjmnCqEPyNs0b/+C5k5aoAXTFmr/7r80uPswJZ9LVoOX6qXaP3rzTkV+9JWWzvmvLl64eNPzpqSkKPHYCcd26kSSvM1jNR7Wm/
8JU+0naqap/7RZ81WiWIje7dJe95S6Uy83a6RnnnxU302d6ehj/NysYT09X/9Z3VP6LvV5t4ty+/tr5rxf5DVsBBdIoxw5c6hMhTLa9PsmR5vdbjf3yz1ULk3nMD4RZc+ZXadPnjb3g+8MMQOJa8959vRZ7dy0U+VC03ZOeP/rrlzF+7RmRZTT627tivWqaFEgYAS8zzZ+Wnny5taWqD/lbeNnTG+su2781v8epfKh99/Wuf9VuoRmR03TtJU/qO+IHgouXlS+bvO2aFWvUtmprWa1UG3ednkK7+LFi9q+c7eqP1zZ6fVnHHOlj9csRbVbsHmILFFzcfz4cRUpcnla4ODBgxozZozOnTunRo0a6bHHHlNWVSCggLLnyK6TCc6pO2P/X/eUTNM5Xo1oo8S4REcwYdRtXD7HiRvOWeh/j8G3FQooqBw5cijxWKJTu/FJuVSZu27r3PeUu1vj541SLv9cOnfmnN5t29PMjHjf+GVX4nXvMWP87rznzgyfd/vGHRrQdZAO7DmoIkUDzBqMkTM/V6tabc0MkK8yaiiKBDj/7SpSuJD+PnNW55OTlZT0t1JSbDf2CSisfQcOZfLVwiuCi61bt6phw4ZmQHHvvfdqypQpqlu3rs6cOWNGrsOGDdOMGTPUpEmTm54jOTnZ3K6VYk9Rdr/syuqadXpBjzd6XBEvdtfF5JunWoHM8teeA3q5dlvlL5BPTzd4Sv2+6Kn//LuL1wUYrrD6v2sdP+/ZsdcMNn5cM1m1Gj6peVN+duu1IQuwec6UhsdPi7z33nuqUKGCli9frieffFINGjRQ/fr1derUKZ04cUIdOnTQJ598cstzDBw4UAULFnTa9iTtyZTrT0pMUsqlFBUKdC4CM/ZPHHP+VHS95//zbzXr2Ey9W/Yy6yquuHJcocDCN5zz5D+cE77hZOIpXbp0yVzlcK2AoMI6Hn/zYsO0uHTxkg7tP6zoLbsU+fFX2vVnjFq0aybvG78UBVz3HjPG7/ps0O34O+mMDu49pH+VKiFfFhhQWMcTnf92GYWa+fPlNesqChcqoOzZs93YJ/GEeay3sNvslmyewq3Bxbp16zRgwADVrFlTn332mY4cOaJOnTqZWQtj69Kli6KjU18adkVERIQZjFy73VPgnky5fuMPsbGUtFLNq3OFfn5+5n70hptfd9PXm+qlN19S39Z9FLMlxumxuAOxSoxPVOWaVws88+TPo7KVyyo66tZjAd9gvO6Mf/yrPhrq9Lp7+NFQy+sjsmXzU85cueRt47dzyy5VefQhp/ELffQhbYvabtnzGPUqJe4qroTbDPg8XaXy5bQmarNT26p1G1Wp/P+ZP+fMmVP3l71Xa9ZfrTOz2WxaE7XJ0Qeex63TIomJiQoJubymPn/+/MqXL58KF74aqRo/nz59udDxZvz9/c3tWpk5JTLrm5nqOiRcu7fu1q5Nu8yVHrnz5taSaYvNx8OHhet47HF9++m3jmWmLcNbavCbgxR3KN5RR3H+zDlzyalh9tjZav7mSzq8/4gZbLTs1soMOFb9sirTfi9kbRO/mqp+n/cwV4EY97Z4uf0LypM3j+ZOWWA+3v+LnoqPTTCzD1eKGK8sKTX+mAeFBOm+B8qYtQBGpsLwRo8OWrlstWIPxSlv/ryq++9nFPrIg+rS4h15m6ljpqvnsO6K3rJT2zdG68X2Tc3luvOnLjQf7/V5dyUcTdDoT75xjF/p+y7Xs+TMmUNBIYG694F7zPEz3qeGN3q/rj8WrzTHLzAkUO3eCVOKzaYls5bJm5w9e04HDl29587hI3GK3rVHBQvcoWIhRTVs1HjFJxzXwN7dzMdfbFLfvIfFkMixer7Bs1obtVmLli3XyMEfOM7Ruvnz6jlgiB4od6/K319WP0ybpXPnk2+4F4ZHs3lO1sErCjqNTwy32s/qVsxdoYIBBc2AwSjG3Lt9r/q06uMo8gwqHiTbNS+q51o+Z96vosdXPZ3OM2nYRE0aNsn8+cdRM8
w/dF0GdjFvorV9/Xb1adWbugw4LJ6zzLynhXFjLOMmWsb0RZeXuzmKFENKBDu97oKCAzVpyXjHfutOLcwtauVGdWh6+V4MAUUKmUFJYNEi+vv0GfPGXEZgsWb5enmbpXN+VaGAQmrXrY05HbL7zz16p+X7OvG/8TNWeVx7R8XA4CKa8MsYx75x8y1j27Byk7q8EG62FS0WqP6RvVSgcAFz6mXL2q3q0LCz+bM32Ra9W227vO/YN+5PYWhcr7YG9HpHCccTdTQu3vH4v4qHKHLwBxr0xVf6YfosBQcFqv/7b5srRq6oV/sJnTh5Sl9+84MSEhNV7t57NHrIh141LSIfu0Onn91Yg+UmxtRHvXr1HJmHuXPnqlatWmYGw2AUai5cuNBcO54eDe5Mfa0/0mbegfnuvgSPVaVY1l3dlNX5Z+Mmcbfj182XsyxIv5yBd7v8OU53fs6S89zx5eXsZFbn1sxFWJjzrXJbtmx5Q5/WrVtn4hUBAOACNqZFMs348VfTtAAAeC2bbwUX3KETAAB4V0EnAADezu6+8ka3ILgAAMDVbAQXAADASjbfCi6ouQAAAJYicwEAgIvZfSxzQXABAICr2XwruGBaBAAAWIrMBQAArmaTTyG4AADAxexMiwAAAGQcmQsAAFzN5luZC4ILAABczSafwrQIAACwFJkLAABczM60CAAAsJRNPoXgAgAAF7P7WOaCmgsAAGApMhcAALiaTT6F4AIAABez+1hwwbQIAACwFJkLAABczSafQnABAICL2X0suGBaBAAAWIrMBQAArmaTTyG4AADAxew+FlwwLQIAQCYEF3YLtoyIjIxUqVKllDt3blWrVk1r1669Zf+TJ0/qjTfeULFixeTv76/77rtPCxYsSNdzkrkAAMBLTZ06VeHh4Ro9erQZWAwfPlx16tTRzp07VbRo0Rv6X7hwQc8884z52IwZM1SiRAn99ddfKlSoULqel+ACAAAvnRYZOnSo2rdvrzZt2pj7RpAxf/58jRs3Tt27d7+hv9GemJiolStXKmfOnGabkfVIL6ZFAABwNbufJVtycrKSkpKcNqMtNUYWIioqSrVr13a0ZcuWzdxftWpVqsfMmTNHNWrUMKdFgoODVb58eX388cdKSUlJ16/rlZmLhbGb3H0JHq3BnfXdfQkea/3RFe6+BI+VI1cJd1+CR3v+oS7uvgSPNe/AfHmKgQMHqn///k5tffv2Vb9+/W7om5CQYAYFRpBwLWM/Ojo61fPv3btXy5Yt0yuvvGLWWcTExKhTp066ePGi+Tw+HVwAAOCN0yIRERFmDcW1jKJLq9hsNrPe4uuvv1b27NkVGhqqw4cPa/DgwQQXAABkJXabnyXnMQKJtAYTgYGBZoAQFxfn1G7sh4SEpHqMsULEqLUwjrvi//7v/xQbG2tOs+TKlStNz03NBQAAXihXrlxm5mHp0qVOmQlj36irSE3NmjXNqRCj3xW7du0yg460BhYGggsAALz0Phfh4eEaM2aMvv32W+3YsUMdO3bUmTNnHKtHWrdubU61XGE8bqwWeeutt8ygwlhZYhR0GgWe6cG0CAAALmY3Vnu4QfPmzXXs2DH16dPHnNqoXLmyFi5c6CjyPHDggLmC5IqSJUtq0aJF6tq1qypWrGje58IINN5///10Pa+f3W63y8tQdX576oZUdvcleCxPqjrPanjf3h7et1n7fXu4Ri1LzlNi1TJ5AjIXAAC4mN3HvluE4AIAAA9ZLeIpCC4AAHAxu9cVINwaq0UAAIClyFwAAOBidqZFAACAlew+FlwwLQIAACxF5gIAABez+1hBJ8EFAAAuZmdaBAAAIOPIXAAA4KXfLeIuBBcAALiY3cdu/820CAAAsBSZCwAAXMzGtAgAALCSneACAABYyc5SVAAAgIwjcwEAgIvZuUMnAACwkp1pEQAAgIwjcwEAgIvZWC0CAACsZPex4IJpEQAAYCkyFwAAuJjdx1aLkLmwQMfXwxSza7X+Ttqjlb
/P1cNVKt+yf9OmDbRt629m/40blqhe3Vo39OnXt5sO/rVBp0/FaNHPU1SmTGl5o/qt62vsH+P0066ZGjJ7qO6rdN9N+9ZpUUefzvhUU7ZONbePJg1Itf8r4S313frv9eOun8w+xUsVd/FvAU/E+zbjeN9mrObCZsHmKQgubtMLLzTSZ4P76sOPhurhanW1ect2LZg/UUFBRVLtX6N6FU38PlLjx09Wlap1NGfOIv04Y6weeKCso8+73Tqp8xtt1alzdz3yaEOdOXtWC+ZNlL+/v7zJYw0fU7ve7TV5+CS9Vf9N7duxTx/88KEKFimYav8K1Svot9nLFdE8Qt2avKNjR46Z/YsEXx3rph2bqWGbhoqMiNQ7jcJ1/ux5s09O/5yZ+Jshq+N9m3G8b5EWfna79yVrcuQqkWnPZXziWbd+s956u5e57+fnp/171yly5HgNGhx5Q/9JE0cpX968avx8mKPtjxVztWnzn3qjc3dz3/jkM2z4Vxo67Ctzv0CBO3Tk0Ca1bddV06bNcfnvVDfk1p/grGJ84tm9eZdG9xntGLsJayZo7oR5mjFy+j8eny1bNvOT0Og+o7Tsx2Vmm/HJZ+bXMzXz65/M/bx35NUPURM1/J1hWj53uYt/I2negfkufw5vxfv29vC+zdrv2413NrbkPA8emC2vzFyEhYVp+fLb/z972bJluv/++5WUlHTDY6dOndIDDzygFStWKCvLmTOnHnqoopYuu3qdRqy2dNnvql49NNVjqlcLdepv+GXxr47+pUvfqWLFgs1zXJGUdFpr1240j/UWOXLmUJkKZbTp901OY2fsl3uoXJrO4Z/HX9lzZtfpk6fN/eA7QxRQNMDpnGdPn9XOTTtVLjRt54T3432bcbxvM85ut2bzFOkOLox/+GvXrq17771XH3/8sQ4fPpyhJx4+fLjat2+vAgUK3PBYwYIF1aFDBw0dOlRZWWBggHLkyKH4uASn9vj4YwoJDkr1mJCQIMXFH3Nqi4tLcPQPCS76v7br+sQnKCTk8mPeoEBAAWXPkV0nE046tRv7hYMKp+kcr0a0UWJcouOP0pXjTiacuOGchdJ4Tng/3rcZx/s242zUXNzarFmzzICiY8eOmjp1qkqVKqV69eppxowZunjxYprPs3nzZtWtW/emjz/77LOKior6x/MkJyeb2Y9rNy+c6cF1mnV6QY83elwD2n+ki8lpf90BcB/et74jQwWdQUFBCg8PNwOENWvWqEyZMmrVqpWKFy+url27avfu3f94jri4ODM9eTPGJ4tjx5w/BaRm4MCBZqbj2s1uu5xuc7WEhERdunRJRYMDndqLFg1S7HWfYK6IjT2m4KLOn46CgwMd/WPj4v/Xdl2fooGKjb38mDdISkxSyqUUFQos5NRu7J845vwJ5nrP/+ffataxmXq37KX90fsd7VeOKxRY+IZznvyHc8J38L7NON63t3cTLbsFm0+sFjl69KgWL15sbtmzZ9dzzz2nrVu3mrUUw4YNu+WxJUqU0LZt2276+JYtW1SsWLF/vIaIiAhzqubazS/bHcoMRqZmw4YtqvXUo442o7jJ2F+9OvWsy+o1UapV62p/Q+2nH3f037fvgI4ejXM65x135FfVqg+ax3qLSxcvKWZrjCrVrOw0dsZ+9Ibomx7X9PWmeunNl9S3dR/FbIlxeizuQKwS4xNVuWYlR1ue/HlUtnJZRUfd/JzwLbxvM473bcbZfGxaJEdG3phz5szR+PHj9csvv6hixYp6++239fLLLzvqJ2bOnKm2bduaWYybMQKR3r17m1MjuXPndnrs3Llz6tu3rxo0aPCP12Ms87p+qZfxYs8swz4fo/FjhylqwxatW7dRb3Zpr3z58mjCt1PNx8eP+1xHjhxVz16fmPsjRozVsqUz1PXtDlrw8xI1f7GxQkMr6vVO7znO+cWIb9Qj4k3tjtmr/fsPqn+/d3XkSJxmz14kbzLrm5nqOiRcu7fu1q5Nu9T4tcbKnTe3lk
xbbD4ePixcx2OP69tPv3UsV2sZ3lKD3xykuEPxjvnY82fOmUvXDLPHzlbzN1/S4f1HzD9aLbu1Mv9wrfpllRt/U2Q1vG8zjvctXBJcGNkEm82mFi1aaO3atapc+cblT0899ZQKFXJOm12vV69e+umnn3Tfffepc+fOKlv28nrx6OhoRUZGKiUlRT179lRWN336HAUFBqhfn25m0dfmzX+qfoOWio+/XCx2Z8ni5nhdsWr1erVs3Vkf9H9PH334vnbH7FPTZq/pzz93OvoM/myk8uXLq9EjB6lQoQL64491qt+wpVlf4k1WzF2hggEFzT88RlHX3u171adVH0exWFDxINlsV+tnnmv5nLnuvcdXzq+LScMmatKwSebPP46aodx5cqvLwC7KVyCftq/frj6tejO/Cye8bzOO923G2OVb0n2fi++//14vvPDCDdmGjPjrr7/MwtBFixY5ijCNrEOdOnXMAKN06dJZfr28N8qs9fLeiPtcZBzv29vD+zZrv29XFmtqyXkeOfqjvDJzYRRuWuWuu+7SggULdOLECcXExJgBhrHEtXBh71l+BACAr8kSX1xmBBMPP/ywuy8DAACXsHtQMabXBBcAAHgzm3wLX1wGAAAsReYCAAAXs4tpEQAAYCGbj61FJbgAAMDFbD6WuaDmAgAAWIrMBQAALmb3scwFwQUAAC5mk29hWgQAAFiKzAUAAC5mZ1oEAABYySbfwrQIAACwFJkLAABczCbfQnABAICL2X2s5oJpEQAAYCkyFwAAuJjNtxIXBBcAALiazcemRQguAABwMbt8CzUXAADAUmQuAABwMZt8C8EFAAAuZvPzrZoLpkUAAIClyFwAAOBidvkWggsAAFzMJt/CtAgAALAUmQsAAFzM5lv1nGQuAADIjDt02izYMiIyMlKlSpVS7ty5Va1aNa1duzZNx02ZMkV+fn5q0qRJup+T4AIAAC81depUhYeHq2/fvtqwYYMqVaqkOnXqKD4+/pbH7d+/X926ddNjjz2WoecluAAAIBNWi9gt2NJr6NChat++vdq0aaP7779fo0ePVt68eTVu3LibHpOSkqJXXnlF/fv31913352h35fgAgCATKi5sFmwJScnKykpyWkz2lJz4cIFRUVFqXbt2o62bNmymfurVq266bV+8MEHKlq0qF577bUM/74UdOIGC2M3ufsSPFaOXCXcfQke69KFw+6+BI/Ga883lqIOHDjQzChcy5jy6Nev3w19ExISzCxEcHCwU7uxHx0dner5f//9d40dO1abNt3evwMEFwAAeIiIiAizhuJa/v7+lpz79OnTatWqlcaMGaPAwMDbOhfBBQAAHnKHTn9//zQHE0aAkD17dsXFxTm1G/shISE39N+zZ49ZyNmwYUNHm812OeeSI0cO7dy5U/fcc0+anpuaCwAAPKTmIj1y5cql0NBQLV261ClYMPZr1KhxQ/9y5cpp69at5pTIla1Ro0Z66qmnzJ9LliyZ5ucmcwEAgJcKDw9XWFiYqlSpoqpVq2r48OE6c+aMuXrE0Lp1a5UoUcKs5TDug1G+fHmn4wsVKmT+9/r2f0JwAQCAl363SPPmzXXs2DH16dNHsbGxqly5shYuXOgo8jxw4IC5gsRqfna73eu+rI2qacDzsFrk9vB3L2u/9r76V0tLztPh0A/yBNRcAAAASzEtAgCAi9l97IvLCC4AAPDSmgt3YVoEAABYiswFAAAuZpNvIbgAAMDF7PItBBcAALiYzccKOqm5AAAAliJzAQCAi9nkWwguAABwMZt8C9MiAADAUmQuAABwMbt8C8EFAAAuZmO1CAAAQMaRuQAAwMVs8i0EFwAAuJhdvoVpEQAAYCkyFwAAuJjNx3IXBBcAALiYTb6F4AIAABezy7dQcwEAACxF5gIAABezybcQXAAA4GI27tAJAACQcQQXFuj4ephidq3W30l7tPL3uXq4SuVb9m/atIG2bf3N7L9xwxLVq1vrhj79+nbTwb826P
SpGC36eYrKlCktb8TYZRxjB3fhtZexpag2CzZPQXBxm154oZE+G9xXH340VA9Xq6vNW7ZrwfyJCgoqkmr/GtWraOL3kRo/frKqVK2jOXMW6ccZY/XAA2Udfd7t1kmd32irTp2765FHG+rM2bNaMG+i/P395U0Yu4xj7OAuvPYyxm7R5in87Ha7J11vmuTIVSLTnsuI2tet36y33u5l7vv5+Wn/3nWKHDlegwZH3tB/0sRRypc3rxo/H+Zo+2PFXG3a/Kfe6Nzd3Dei92HDv9LQYV+Z+wUK3KEjhzapbbuumjZtjrwFY5dx3jh2ly4cdvlzeLPM+rvHay9jepZ62ZLzDNg/SZ6AzMVtyJkzpx56qKKWLlvhaDNitaXLflf16qGpHlO9WqhTf8Mvi3919C9d+k4VKxZsnuOKpKTTWrt2o3mst2DsMo6xg7vw2ru91SI2CzZP4fbgwmazady4cWrQoIHKly+vChUqqFGjRvruu+/MF21WFhgYoBw5cig+LsGpPT7+mEKCg1I9JiQkSHHxx5za4uISHP1Dgov+r+26PvEJCgm5/Jg3YOwyjrGDu/Dayzibj9VcuHUpqhE8GIHEggULVKlSJTOwMNp27NihV199VT/99JNmzZp1y3MkJyeb2/XnNVJ1AADAxzIXEyZM0PLly7V06VJt3LhRkydP1pQpU7R582YtWbJEy5YtMzMYtzJw4EAVLFjQabPbTmfK9SckJOrSpUsqGhzo1F60aJBir4vCr4iNPabgos4RfnBwoKN/bFz8/9qu61M0ULGxlx/zBoxdxjF2cBdeexln97GCTrcGF0Yw0aNHDz311FM3PFarVi11795dEydOvOU5IiIidOrUKafNL9sdygwXL17Uhg1bVOupRx1tRsbE2F+9OirVY1aviVKtWlf7G2o//bij/759B3T0aJzTOe+4I7+qVn3QPNZbMHYZx9jBXXjtZZzNx2ou3DotsmXLFg0aNOimj9erV09ffPHFLc9hLFW6frlSZk6JDPt8jMaPHaaoDVu0bt1GvdmlvfLly6MJ3041Hx8/7nMdOXJUPXt9Yu6PGDFWy5bOUNe3O2jBz0vU/MXGCg2tqNc7vec45xcjvlGPiDe1O2av9u8/qP793tWRI3GaPXuRvAljl3GMHdyF117G2Dwq7+DhwUViYqKCg4Nv+rjx2IkTJ5SVTZ8+R0GBAerXp5tZuLR585+q36Cl4uMvFzzdWbK4WbR6xarV69WydWd90P89ffTh+9ods09Nm72mP//c6egz+LORypcvr0aPHKRChQrojz/WqX7DljfUlng6xi7jGDu4C689ZPn7XGTPnl2xsbEKCkq9yjguLk7FixdXSkpKlr3PBQBrcJ+L28Pfvaz92uta6iVLzjNs/xR5ArevFjFWhdzsLmxErQAAb2CTb3FrcBEWdvWObTfTunXrTLkWAADgBcHF+PHj3fn0AABkCjsFnQAAwEo2+Ra33/4bAAB4FzIXAAC4mI1pEQAAYCW7fAvTIgAAwFJkLgAAcDGbj+UuCC4AAHAxm3wLwQUAAC5m97HMBTUXAADAUmQuAABwMZt8C8EFAAAuZmdaBAAAIOPIXAAA4GI2+RaCCwAAXMxmZ1oEAAAgw8hcAADgYnb5FoILAABczOZj4QXTIgAAwFJkLgAAcDG7j2UuCC4AAHAxm3wLwQUAAC5m87HMBTUXAADAUmQuAABwMbuPZS4ILgAAcDGbfAvTIgAAwFIEFwAAuJjdbrdky4jIyEiVKlVKuXPnVrVq1bR27dqb9h0zZowee+wxFS5c2Nxq1659y/43Q3ABAEAmrBaxWbCl19SpUxUeHq6+fftqw4YNqlSpkurUqaP4+PhU+//6669q0aKF/vvf/2rVqlUqWbKknn32WR0+fDhdz+tnz2golIXlyFXC3ZcAIJ0uXUjfHy844+9e1n7tNb6zgSXnmX1gXrr6G5mKhx9+WF9++aW5b7PZzIChS5cu6t69+z8en5KSYmYwjO
Nbt26d5ueloBMAAA8p6ExOTja3a/n7+5vb9S5cuKCoqChFREQ42rJly2ZOdRhZibQ4e/asLl68qICAgHRdp1cGF3VDKrv7EjzazA0j3H0JHqtGhTB3X4LHqlmilrsvwaOR+fGNpagDBw5U//79ndqMKY9+/frd0DchIcHMPAQHBzu1G/vR0dFper73339fxYsXNwMS+XpwAQCAN4qIiDBrKK6VWtbCCp988ommTJli1mEYxaDpQXABAICH3P7b/yZTIKkJDAxU9uzZFRcX59Ru7IeEhNzy2M8++8wMLpYsWaKKFSum+zpZLQIAgBcuRc2VK5dCQ0O1dOlSR5tR0Gns16hR46bHDRo0SB9++KEWLlyoKlWqZOj3JXMBAICX3qEzPDxcYWFhZpBQtWpVDR8+XGfOnFGbNm3Mx40VICVKlDBrOQyffvqp+vTpo0mTJpn3xoiNjTXb8+fPb25pRXABAICXat68uY4dO2YGDEagULlyZTMjcaXI88CBA+YKkitGjRplrjJp1qxZmopGb4bgAgAAL/7iss6dO5tbaoxizWvt37/fkuckuAAAwEMKOj0FBZ0AAMBSZC4AAHAxu/d908YtEVwAAOBiNqZFAAAAMo7MBQAAXrxaxB0ILgAAcDGbj9VcMC0CAAAsReYCAAAXs8u3EFwAAOBiNh8LLwguAABwMZuPBRfUXAAAAEuRuQAAwMXsPrZahOACAAAXszEtAgAAkHFkLgAAcDG7j2UuCC4AAHAxu4/VXDAtAgAALEXmAgAAF7MxLQIAAKxkZ1oEAAAg48hcAADgYjamRQAAgJXsBBcAAMBKNh+ruSC4sED91vX17w5NVTiosPbt2Kev+ozWrs27Uu1bp0Ud1WpaS3eVLWXux2yN0XeffntD/1fCW6rOy3WUr0A+7Vi/QyN7ROrI/iPyJus3bdX4STO0PTpGx44n6vOBvfX044/c8pi1G7Zo8IivFbPvL4UUDVKHsBZqUv8Zpz6Tf5xrnjch8YTKlrlbPbp2VIX7y8rbvPDq82rVqYWKBAVo9/Y9GtxzuP7ctCPVvnffV0qvv/eaylUsq+Ili2lIny80ecx0pz5NWzdRs7AmKlYyxNzfu3Ofvhk2QSuXrZE3+ndYY73csbkCggIUs32PhvUeoR2bolPtW/q+UmrX7VWVrXifOT6f943UtG9+dOrTNjxMr70T5tT2V8wBvfzEqy79PYCsiILO2/RYw8fUrnd7TR4+SW/Vf9MMLj744UMVLFIw1f4VqlfQb7OXK6J5hLo1eUfHjhwz+xcJLuLo07RjMzVs01CREZF6p1G4zp89b/bJ6Z9T3uTcufPmP/493+mUpv6HjsTqjXf7qOpDlTRjQqRavdhEfT8drj/WRDn6/LzkNw0a8bU6tn1F08eNUNkypdUhvJeOnzgpb/JMo1rq2q+zxgyZoJZ12mnX9hiNmDxEhYsUSrV/7jy5deivo/pywFdKiDueap/4o/H6csBotarTTq3rttf6PzZoyPiBZmDibZ5u9KS69O2ocUO/U9u6HczgYujET1XoJuPnn8dfRw4c1aiPx9x0/Ax7o/epYeWmjq1jkzdd+FvA06ZF7Bb8z1MQXNymJu2e16LJC7Vk+hId3H1QkRFfKvnceT3T/NlU+3/21mda8P187du+V4f2HNKI975QtmzZVOnRSo4+jV9rrKkjpmrN4tXaH71fQ7sOUUDRANV4toa8yWM1Htab/wlT7Sdqpqn/tFnzVaJYiN7t0l73lLpTLzdrpGeefFTfTZ3p6GP83KxhPT1f/1ndU/ou9Xm3i3L7+2vmvF/kTV7p0FyzJs7V3KkLtG/Xfg187zOdP3dejVrUT7X/9s3R+uLDkfpl9lJduHAh1T4rFq/UH8tW6+C+Qzqw96BGfjJGZ8+cU4XQB+Rtmrd/QXMnLdCCaQu1f/dfGtx9mJLPJavBS/VS7R+9eaciP/pKS+f8VxcvXLzpeVNSUpR47IRjO3UiyYW/BTxtWsRmweYpCC
5uQ46cOVSmQhlt+n2T01pmY7/cQ+XSdA7jE1H2nNl1+uRpcz/4zhAzkLj2nGdPn9XOTTtVLjRt5/RWm7dFq3qVyk5tNauFavO2y1MBFy9e1Padu1X94at9jMDNOOZKH2953ZWreJ/WrIhyet2tXbFeFS0KBIxxe7bx08qTN7e2RP0pb2KMnzG9se668Vv/e5TKh95/W+f+V+kSmh01TdNW/qC+I3oouHhRC64Y8DzUXNyGAgEFlD1Hdp1McE65G/v/uqdkms7xakQbJcYlOoIJo27j8jlO3HDOQv97zFcZNRRFApzHoEjhQvr7zFmdT05WUtLfSkmx3dgnoLD2HTgkb1EooKBy5MihxGOJTu3GJ+VSZe66rXPfU+5ujZ83Srn8c+ncmXN6t21PMzPiTS6PX3YlXvceM8bvznvuzPB5t2/coQFdB+nAnoMqUjTArMEYOfNztarV1swAwbfZPWhKw+MzF88995xOnTrl2P/kk0908uTVf6iPHz+u+++/9SeJZPMflSSnLcWeIk/QrNMLerzR4xrQ/iNdTL55qhXILH/tOaCXa7fVq/U7aMZ3s9Xvi55mMSP+2er/rtV/5/2mPTv2au1v69WtVXflL5BPtRo+6e5LQxZgY1ok8yxatMgMDq74+OOPlZh49dPYpUuXtHPnzlueY+DAgSpYsKDTtidpjzJDUmKSUi6lqFCgcxGYsX/imPOnous9/59/q1nHZurdspdZV3HFleMKBRa+4Zwn/+Gc3i4woLCOJzqPgVGomT9fXrOuonChAsqePduNfRJPmMd6i5OJp8z3hrHK4VoBQYV1PP7mxYZpceniJR3af1jRW3Yp8uOvtOvPGLVo10ze5PL4pSjguveYMX7XZ4Nux99JZ3Rw7yH9q1QJy84JeIpsWele6xm593pERISZ/bh2u6fAPcoMxh9iYylppZpX5/j9/PzM/egNqS9pMzR9valeevMl9W3dRzFbYpweizsQq8T4RFWuebXAM0/+PCpbuayio25+Tl9QqXw5rYna7NS2at1GVSr/f+bPOXPm1P1l79Wa9VfrVWw2m9ZEbXL08QbG6874x7/qo6FOr7uHHw21vD4iWzY/5cyVS97EGL+dW3apyqMPOY1f6KMPaVvUdsuex6hXKXFXcSXcZsAH72D3sdUiHl9z4e/vb27Xyu6XPdOef9Y3M9V1SLh2b92tXZt2mSs9cufNrSXTFpuPhw8L1/HY4/r2028dy0xbhrfU4DcHKe5QvKOO4vyZc+aSU8PssbPV/M2XdHj/ETPYaNmtlRlwrPpllbzJ2bPndODQ1Xt3HD4Sp+hde1SwwB0qFlJUw0aNV3zCcQ3s3c18/MUm9c17WAyJHKvnGzyrtVGbtWjZco0c/IHjHK2bP6+eA4bogXL3qvz9ZfXDtFk6dz75hntheLqJX01Vv897mKtAjHtbvNz+BeXJm0dzpywwH+//RU/FxyaY2YcrRYxXlpQaQVhQSJDue6CMWQtgZCoMb/TooJXLViv2UJzy5s+ruv9+RqGPPKguLd6Rt5k6Zrp6Duuu6C07tX1jtF5s39Rcrjt/6kLz8V6fd1fC0QSN/uQbx/iVvu9yPUvOnDkUFBKoex+4xxw/431qeKP36/pj8Upz/AJDAtXunTCl2GxaMmuZG39TZBU2D5rS8Pjgwvi0YGzXt3mSFXNXqGBAQTNgMIox927fqz6t+jiKPIOKB8lmu/qieq7lc+b9Knp81dPpPJOGTdSkYZPMn38cNcP8Q9dlYBfzJlrb129Xn1a9va4uY1v0brXt8r5j37g/haFxvdoa0OsdJRxP1NG4eMfj/yoeosjBH2jQF1/ph+mzFBwUqP7vv22uGLmiXu0ndOLkKX35zQ9KSExUuXvv0eghH3rVtIhh8Zxl5j0tjBtjGTfRMqYvurzczVGkGFIi2Ol1FxQcqElLxjv2W3dqYW5RKzeqQ9PL92IIKFLIDEoCixbR36fPmDfmMgKLNcvXy9ssnfOrCgUUUr
tubczpkN1/7tE7Ld/Xif+Nn7HKw26zOfoHBhfRhF/GOPaNm28Z24aVm9TlhXCzrWixQPWP7KUChQuYUy9b1m5Vh4adzZ8BX+Nnd+P3wBrL3erVq+fIPMydO1e1atVSvnz5zH2jHmPhwoXm2vH0aHBn6mv9kTYzN4xw9yV4rBoVnO/QiLTzz+ZdN4nLbH8cJkOSld0d+KAl59mbsFGewK2Zi7Aw5z/ELVu2vKFP69atM/GKAACwnt1+NRPmC9waXIwffzVNCwCAt7J5UDGmFbhDJwAAsJTHrxYBACCrs7NaBAAAWMnGtAgAAEDGkbkAAMDF7EyLAAAAK9l8LLhgWgQAAFiKzAUAAC5m97GCToILAABczM60CAAAQMaRuQAAwMVsTIsAAAAr2X1sWoTgAgAAF7P5WHBBzQUAALAUmQsAAFzM7mOZC4ILAABczOZjBZ1MiwAAAEuRuQAAwMXsTIsAAAAr2XwsuGBaBAAAWIrMBQAALmb3sYJOggsAAFzMxrQIAABAxpG5AADAxew+lrkguAAAwMXs1FwAAAAr2X0sc0HNBQAAXiwyMlKlSpVS7ty5Va1aNa1du/aW/adPn65y5cqZ/StUqKAFCxak+zkJLgAAyITMhd2CLb2mTp2q8PBw9e3bVxs2bFClSpVUp04dxcfHp9p/5cqVatGihV577TVt3LhRTZo0Mbdt27al63n97F6Yq2lwZ313X4JHm7lhhLsvwWPVqBDm7kvwWP7Zcrr7EjzaH4eXufsScAs5cpWw5DyXLhxOV38jU/Hwww/ryy+/NPdtNptKliypLl26qHv37jf0b968uc6cOaN58+Y52qpXr67KlStr9OjRaX5eMhcAAHiI5ORkJSUlOW1GW2ouXLigqKgo1a5d29GWLVs2c3/VqlWpHmO0X9vfYGQ6btbfpwo65x2Yr6zKeBEMHDhQERER8vf3d/fleBRPGLv1R1coq/KE8cuqGLuMY+wylnG4mX79+ql///5ObcaUh9F+vYSEBKWkpCg4ONip3diPjo5O9fyxsbGp9jfa04PMhRveaMYL42aRJm6Osbs9jF/GMXYZx9hZywjSTp065bQZbVmNV2YuAADwRv7+/mnOAAUGBip79uyKi4tzajf2Q0JCUj3GaE9P/5shcwEAgBfKlSuXQkNDtXTpUkebUdBp7NeoUSPVY4z2a/sbFi9efNP+N0PmAgAALxUeHq6wsDBVqVJFVatW1fDhw83VIG3atDEfb926tUqUKGHWxRjeeustPfHEExoyZIjq16+vKVOmaP369fr666/T9bwEF5nMSGcZxTe+XNiUUYzd7WH8Mo6xyzjGzr2MpaXHjh1Tnz59zKJMY0npwoULHUWbBw4cMFeQXPHII49o0qRJ6tWrl3r06KF7771Xs2bNUvny5dP1vF55nwsAAOA+1FwAAABLEVwAAABLEVwAAABLEVwAAABLEVxkIuPe7MYNTYzlPUi7V199VX5+fo6tSJEiqlu3rrZs2eLuS/MYRpW48UVFd999t1m1b3xxUcOGDW9Yz47UX3c5c+Y0q+ufeeYZjRs3zrxXANL3vr2yGe9deD+Ci0w0duxY8w/88uXLdeTIEXdfjkcx/iAdPXrU3Ix/EHPkyKEGDRq4+7I8wv79+80b6SxbtkyDBw/W1q1bzaVoTz31lN544w13X55HvO6MMfz555/NMTPuA2C89i5duuTuy/Oo9+2VbfLkye6+LGQC7nORSf7++29NnTrVvBmJ8SlywoQJ5hpipI3xafvK7WeN/xpfFfzYY4+Z67eDgoLcfXlZWqdOncxPjGvXrlW+fPkc7Q888IDatm3r1mvzpNedcaOhhx56yPz66aefftp8D7dr187dl+gx4wffQuYik0ybNk3lypVT2bJl1bJlSzO1yi1GMh6o/fDDDypTpow5RYKbS0xMNLMURobi2sDiikKFCrnlujxZrVq1VKlSJf3000/uvhQgyyK4yMQpESOouJIqNL7J7rfffnP3ZX
mMefPmKX/+/OZ2xx13aM6cOWYm6No7y+FGMTExZhBrBLawjjGexlQJ0v6+vbJ9/PHH7r4sZAKmRTLBzp07zZT0zJkzzX2jXsC4JasRcDz55JPuvjyPYMx1jxo1yvz5xIkTGjlypOrVq2eO61133eXuy8uyyI65blyNqSak/X17RUBAgNuuB5mH4CITGEGEUfxVvHhxpz9Oxnzkl19+qYIFC7r1+jyBkdI3pkGu+Oabb8xxGzNmjD766CO3XltWZnwvgPGPYHR0tLsvxavs2LFDpUuXdvdleNz7Fr6DnLKLGUHFd999Z37D3KZNmxzb5s2bzWCDyumMMf7BNKZEzp075+5LydKMT4l16tRRZGSk+U2I1zt58qRbrsuTGatujBU3TZs2dfelAFkWmYtMmHM00vivvfbaDRkK44+TkdV4/fXX3XZ9niI5OdlcZWMwxtPI+BiFnca9GnBrRmBRs2ZN8+uWP/jgA1WsWNEMehcvXmymrI1P4bj16y4lJUVxcXFmcazx1dTGUlTjq6qR9vftFca0cGBgoNuuCZmD4MLFjOChdu3aqU59GMHFoEGDzJtBGX/wcXPGH/VixYqZPxsFnUZB3fTp06lZSQPjxlkbNmzQgAED9M4775j3GjCW7xr3vrh+Phypv+6MfxALFy5srhL54osvFBYWRjFxOt+3Vxgr5pim83585ToAALAUoTcAALAUwQUAALAUwQUAALAUwQUAALAUwQUAALAUwQUAALAUwQUAALAUwQUAALAUwQUAALAUwQUAALAUwQXgxY4dO6aQkBB9/PHHjraVK1cqV65cWrp0qVuvDYD34rtFAC+3YMECNWnSxAwqjC+Nqly5sho3bqyhQ4e6+9IAeCmCC8AHvPHGG1qyZImqVKmirVu3at26dfL393f3ZQHwUgQXgA84d+6cypcvr4MHDyoqKkoVKlRw9yUB8GLUXAA+YM+ePTpy5IhsNpv279/v7ssB4OXIXABe7sKFC6patapZa2HUXAwfPtycGilatKi7Lw2AlyK4ALzcu+++qxkzZmjz5s3Knz+/nnjiCRUsWFDz5s1z96UB8FJMiwBe7NdffzUzFd9//70KFCigbNmymT+vWLFCo0aNcvflAfBSZC4AAIClyFwAAABLEVwAAABLEVwAAABLEVwAAABLEVwAAABLEVwAAABLEVwAAABLEVwAAABLEVwAAABLEVwAAABLEVwAAABLEVwAAABZ6f8BfR6ppB32V30AAAAASUVORK5CYII=", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "mat = pps_mat[[\"x\", \"y\", \"ppscore\"]].pivot(columns='x', index='y', values='ppscore')\n", + "_ = sns.heatmap(mat,annot=True,fmt=\".2f\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['B', 'E']" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pps.mutual_predictors(pps_mat, threshold=0.9)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv_microwave (3.13.2)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/demo_processing.ipynb b/notebooks/demo_processing.ipynb new file mode 100644 index 0000000..61083b5 --- /dev/null +++ b/notebooks/demo_processing.ipynb @@ -0,0 +1,725 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import string" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABCD0
00222b
11211c
21011d
30010d
42122c
50000d
60222a
70200a
80100a
90221b
102201c
112111a
120102a
132101c
141001b
\n", + "
" + ], + "text/plain": [ + " A B C D 0\n", + "0 0 2 2 2 b\n", + "1 1 2 1 1 c\n", + "2 1 0 1 1 d\n", + "3 0 0 1 0 d\n", + "4 2 1 2 2 c\n", + "5 0 0 0 0 d\n", + "6 0 2 2 2 a\n", + "7 0 2 0 0 a\n", + "8 0 1 0 0 a\n", + "9 0 2 2 1 b\n", + "10 2 2 0 1 c\n", + "11 2 1 1 1 a\n", + "12 0 1 0 2 a\n", + "13 2 1 0 1 c\n", + "14 1 0 0 1 b" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.DataFrame(np.random.randint(0,3,size=(15, 4)), columns=list('ABCD'))\n", + "df = pd.concat([df, pd.DataFrame(np.random.choice(list(string.ascii_letters)[:4], size=15, replace=True))], axis=1)\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "c:\\Users\\Edouard\\Documents\\Git\\microwave\n" + ] + } + ], + "source": [ + "%cd ..\n", + "import microwave.data_processing as dp" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABCD0_encoded_a0_encoded_b0_encoded_c0_encoded_d
00.02.02.02.00.01.00.00.0
11.02.01.01.00.00.01.00.0
21.00.01.01.00.00.00.01.0
30.00.01.00.00.00.00.01.0
42.01.02.02.00.00.01.00.0
50.00.00.00.00.00.00.01.0
60.02.02.02.01.00.00.00.0
70.02.00.00.01.00.00.00.0
80.01.00.00.01.00.00.00.0
90.02.02.01.00.01.00.00.0
102.02.00.01.00.00.01.00.0
112.01.01.01.01.00.00.00.0
120.01.00.02.01.00.00.00.0
132.01.00.01.00.00.01.00.0
141.00.00.01.00.01.00.00.0
\n", + "
" + ], + "text/plain": [ + " A B C D 0_encoded_a 0_encoded_b 0_encoded_c 0_encoded_d\n", + "0 0.0 2.0 2.0 2.0 0.0 1.0 0.0 0.0\n", + "1 1.0 2.0 1.0 1.0 0.0 0.0 1.0 0.0\n", + "2 1.0 0.0 1.0 1.0 0.0 0.0 0.0 1.0\n", + "3 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0\n", + "4 2.0 1.0 2.0 2.0 0.0 0.0 1.0 0.0\n", + "5 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0\n", + "6 0.0 2.0 2.0 2.0 1.0 0.0 0.0 0.0\n", + "7 0.0 2.0 0.0 0.0 1.0 0.0 0.0 0.0\n", + "8 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0\n", + "9 0.0 2.0 2.0 1.0 0.0 1.0 0.0 0.0\n", + "10 2.0 2.0 0.0 1.0 0.0 0.0 1.0 0.0\n", + "11 2.0 1.0 1.0 1.0 1.0 0.0 0.0 0.0\n", + "12 0.0 1.0 0.0 2.0 1.0 0.0 0.0 0.0\n", + "13 2.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0\n", + "14 1.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "new_df, trans = dp.df_to_numeric(df)\n", + "new_df" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(0,\n", + " OneHotEncoder(sparse_output=False),\n", + " ['0_encoded_a', '0_encoded_b', '0_encoded_c', '0_encoded_d']),\n", + " ('A',\n", + " FunctionTransformer(func=. at 0x000001B4F3F920C0>),\n", + " ['column']),\n", + " ('B',\n", + " FunctionTransformer(func=. at 0x000001B4F3F920C0>),\n", + " ['column']),\n", + " ('C',\n", + " FunctionTransformer(func=. at 0x000001B4F3F920C0>),\n", + " ['column']),\n", + " ('D',\n", + " FunctionTransformer(func=. at 0x000001B4F3F920C0>),\n", + " ['column']),\n", + " ('0_encoded_a',\n", + " FunctionTransformer(func=. at 0x000001B4F3F920C0>),\n", + " ['column']),\n", + " ('0_encoded_b',\n", + " FunctionTransformer(func=. at 0x000001B4F3F920C0>),\n", + " ['column']),\n", + " ('0_encoded_c',\n", + " FunctionTransformer(func=. at 0x000001B4F3F920C0>),\n", + " ['column']),\n", + " ('0_encoded_d',\n", + " FunctionTransformer(func=. 
at 0x000001B4F3F920C0>),\n", + " ['column'])]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "trans.transforms" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\Edouard\\Documents\\Git\\microwave\\.venv_microwave\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:110: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", + " y = column_or_1d(y, warn=True)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABCD0_encoded
00.02.02.02.01.0
11.02.01.01.02.0
21.00.01.01.03.0
30.00.01.00.03.0
42.01.02.02.02.0
50.00.00.00.03.0
60.02.02.02.00.0
70.02.00.00.00.0
80.01.00.00.00.0
90.02.02.01.01.0
102.02.00.01.02.0
112.01.01.01.00.0
120.01.00.02.00.0
132.01.00.01.02.0
141.00.00.01.01.0
\n", + "
" + ], + "text/plain": [ + " A B C D 0_encoded\n", + "0 0.0 2.0 2.0 2.0 1.0\n", + "1 1.0 2.0 1.0 1.0 2.0\n", + "2 1.0 0.0 1.0 1.0 3.0\n", + "3 0.0 0.0 1.0 0.0 3.0\n", + "4 2.0 1.0 2.0 2.0 2.0\n", + "5 0.0 0.0 0.0 0.0 3.0\n", + "6 0.0 2.0 2.0 2.0 0.0\n", + "7 0.0 2.0 0.0 0.0 0.0\n", + "8 0.0 1.0 0.0 0.0 0.0\n", + "9 0.0 2.0 2.0 1.0 1.0\n", + "10 2.0 2.0 0.0 1.0 2.0\n", + "11 2.0 1.0 1.0 1.0 0.0\n", + "12 0.0 1.0 0.0 2.0 0.0\n", + "13 2.0 1.0 0.0 1.0 2.0\n", + "14 1.0 0.0 0.0 1.0 1.0" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "new_df, trans = dp.df_to_numeric(df, encoding=\"label\")\n", + "new_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv_microwave (3.13.2)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/demo_univariate_aggregates.ipynb b/notebooks/demo_univariate_aggregates.ipynb new file mode 100644 index 0000000..8943c57 --- /dev/null +++ b/notebooks/demo_univariate_aggregates.ipynb @@ -0,0 +1,1234 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "c:\\Users\\Edouard\\Documents\\Git\\microwave\n" + ] + } + ], + "source": [ + "%cd ..\n", + "import microwave.data_analysis.univariate as univariate\n", + "import pandas as pd\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['size', 'non-null', 'nunique', 'sum', 
'min', 'max', 'first', 'last', 'mean', 'median', 'mode', 'gmean', 'hmean', 'Pmean', 'geothmetic meandian', 'variance', 'std', 'mad', 'skewness', 'excesskurtosis', 'range', 'Prange', 'n_outliers', 'P75', 'P25', 'P10', 'P90', 'PN', 'skewtest', 'kurtosistest', 'normaltest', 'jarque_bera', 'shapiro', 'anderson', 'energy', 'rms', 'entropy', 'autocorrelation'])" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "univariate.AGGFUNCCODES.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABCDE
001211
111121
202022
320210
422122
..................
99501221
99612112
99710010
99820110
99922012
\n", + "

1000 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " A B C D E\n", + "0 0 1 2 1 1\n", + "1 1 1 1 2 1\n", + "2 0 2 0 2 2\n", + "3 2 0 2 1 0\n", + "4 2 2 1 2 2\n", + ".. .. .. .. .. ..\n", + "995 0 1 2 2 1\n", + "996 1 2 1 1 2\n", + "997 1 0 0 1 0\n", + "998 2 0 1 1 0\n", + "999 2 2 0 1 2\n", + "\n", + "[1000 rows x 5 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.DataFrame(np.random.randint(0,3,size=(1000, 4)), columns=list('ABCD'))\n", + "df['E'] = df['B']\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABCDE
size10001000100010001000
non-null10001000100010001000
nunique33333
sum1040102610029891026
min00000
max22222
first01211
last22012
mean1.041.0261.0020.9891.026
median1.01.01.01.01.0
mode21111
gmean0.00.00.00.00.0
hmean0.00.00.00.00.0
Pmean0.00.00.00.00.0
geothmetic meandian0.00.00.00.00.0
variance0.66040.6493240.6619960.6348790.649324
std0.812650.8058060.8136310.7967930.805806
mad1.01.01.01.01.0
skewness-0.073251-0.04714-0.0036610.019674-0.04714
excesskurtosis1.5177821.5415031.5105921.5753461.541503
range22222
Prange0.00.00.00.00.0
n_outliers00000
P752.02.02.02.02.0
P250.00.00.00.00.0
P100.00.00.00.00.0
P902.02.02.02.02.0
PNNaNNaNNaNNaNNaN
skewtest_a-0.951391-0.612722-0.0476140.255835-0.612722
skewtest_b0.3414060.540060.9620240.7980780.54006
kurtosistest_a87.59211992.39696586.38827103.15075692.396965
kurtosistest_b0.00.00.00.00.0
normaltest_a7673.2844258537.5745797462.93545210640.1438298537.574579
normaltest_b0.00.00.00.00.0
jarque_bera_a92.434789.00425992.43290684.63276289.004259
jarque_bera_b0.00.00.00.00.0
shapiro_a0.7938140.796880.794310.800170.79688
shapiro_b0.00.00.00.00.0
anderson_a81.55296180.26539481.25707879.02604880.265394
anderson_b[0.574, 0.653, 0.784, 0.914, 1.088][0.574, 0.653, 0.784, 0.914, 1.088][0.574, 0.653, 0.784, 0.914, 1.088][0.574, 0.653, 0.784, 0.914, 1.088][0.574, 0.653, 0.784, 0.914, 1.088]
anderson_c[15.0, 10.0, 5.0, 2.5, 1.0][15.0, 10.0, 5.0, 2.5, 1.0][15.0, 10.0, 5.0, 2.5, 1.0][15.0, 10.0, 5.0, 2.5, 1.0][15.0, 10.0, 5.0, 2.5, 1.0]
energy17421702166616131702
rms1.3198481.3046071.2907361.2700391.304607
entropy1.5831471.5833181.5848881.5816181.583318
autocorrelation-0.008494-0.0010030.001508-0.015942-0.001003
\n", + "
" + ], + "text/plain": [ + " A \\\n", + "size 1000 \n", + "non-null 1000 \n", + "nunique 3 \n", + "sum 1040 \n", + "min 0 \n", + "max 2 \n", + "first 0 \n", + "last 2 \n", + "mean 1.04 \n", + "median 1.0 \n", + "mode 2 \n", + "gmean 0.0 \n", + "hmean 0.0 \n", + "Pmean 0.0 \n", + "geothmetic meandian 0.0 \n", + "variance 0.6604 \n", + "std 0.81265 \n", + "mad 1.0 \n", + "skewness -0.073251 \n", + "excesskurtosis 1.517782 \n", + "range 2 \n", + "Prange 0.0 \n", + "n_outliers 0 \n", + "P75 2.0 \n", + "P25 0.0 \n", + "P10 0.0 \n", + "P90 2.0 \n", + "PN NaN \n", + "skewtest_a -0.951391 \n", + "skewtest_b 0.341406 \n", + "kurtosistest_a 87.592119 \n", + "kurtosistest_b 0.0 \n", + "normaltest_a 7673.284425 \n", + "normaltest_b 0.0 \n", + "jarque_bera_a 92.4347 \n", + "jarque_bera_b 0.0 \n", + "shapiro_a 0.793814 \n", + "shapiro_b 0.0 \n", + "anderson_a 81.552961 \n", + "anderson_b [0.574, 0.653, 0.784, 0.914, 1.088] \n", + "anderson_c [15.0, 10.0, 5.0, 2.5, 1.0] \n", + "energy 1742 \n", + "rms 1.319848 \n", + "entropy 1.583147 \n", + "autocorrelation -0.008494 \n", + "\n", + " B \\\n", + "size 1000 \n", + "non-null 1000 \n", + "nunique 3 \n", + "sum 1026 \n", + "min 0 \n", + "max 2 \n", + "first 1 \n", + "last 2 \n", + "mean 1.026 \n", + "median 1.0 \n", + "mode 1 \n", + "gmean 0.0 \n", + "hmean 0.0 \n", + "Pmean 0.0 \n", + "geothmetic meandian 0.0 \n", + "variance 0.649324 \n", + "std 0.805806 \n", + "mad 1.0 \n", + "skewness -0.04714 \n", + "excesskurtosis 1.541503 \n", + "range 2 \n", + "Prange 0.0 \n", + "n_outliers 0 \n", + "P75 2.0 \n", + "P25 0.0 \n", + "P10 0.0 \n", + "P90 2.0 \n", + "PN NaN \n", + "skewtest_a -0.612722 \n", + "skewtest_b 0.54006 \n", + "kurtosistest_a 92.396965 \n", + "kurtosistest_b 0.0 \n", + "normaltest_a 8537.574579 \n", + "normaltest_b 0.0 \n", + "jarque_bera_a 89.004259 \n", + "jarque_bera_b 0.0 \n", + "shapiro_a 0.79688 \n", + "shapiro_b 0.0 \n", + "anderson_a 80.265394 \n", + "anderson_b [0.574, 0.653, 0.784, 0.914, 1.088] \n", + 
"anderson_c [15.0, 10.0, 5.0, 2.5, 1.0] \n", + "energy 1702 \n", + "rms 1.304607 \n", + "entropy 1.583318 \n", + "autocorrelation -0.001003 \n", + "\n", + " C \\\n", + "size 1000 \n", + "non-null 1000 \n", + "nunique 3 \n", + "sum 1002 \n", + "min 0 \n", + "max 2 \n", + "first 2 \n", + "last 0 \n", + "mean 1.002 \n", + "median 1.0 \n", + "mode 1 \n", + "gmean 0.0 \n", + "hmean 0.0 \n", + "Pmean 0.0 \n", + "geothmetic meandian 0.0 \n", + "variance 0.661996 \n", + "std 0.813631 \n", + "mad 1.0 \n", + "skewness -0.003661 \n", + "excesskurtosis 1.510592 \n", + "range 2 \n", + "Prange 0.0 \n", + "n_outliers 0 \n", + "P75 2.0 \n", + "P25 0.0 \n", + "P10 0.0 \n", + "P90 2.0 \n", + "PN NaN \n", + "skewtest_a -0.047614 \n", + "skewtest_b 0.962024 \n", + "kurtosistest_a 86.38827 \n", + "kurtosistest_b 0.0 \n", + "normaltest_a 7462.935452 \n", + "normaltest_b 0.0 \n", + "jarque_bera_a 92.432906 \n", + "jarque_bera_b 0.0 \n", + "shapiro_a 0.79431 \n", + "shapiro_b 0.0 \n", + "anderson_a 81.257078 \n", + "anderson_b [0.574, 0.653, 0.784, 0.914, 1.088] \n", + "anderson_c [15.0, 10.0, 5.0, 2.5, 1.0] \n", + "energy 1666 \n", + "rms 1.290736 \n", + "entropy 1.584888 \n", + "autocorrelation 0.001508 \n", + "\n", + " D \\\n", + "size 1000 \n", + "non-null 1000 \n", + "nunique 3 \n", + "sum 989 \n", + "min 0 \n", + "max 2 \n", + "first 1 \n", + "last 1 \n", + "mean 0.989 \n", + "median 1.0 \n", + "mode 1 \n", + "gmean 0.0 \n", + "hmean 0.0 \n", + "Pmean 0.0 \n", + "geothmetic meandian 0.0 \n", + "variance 0.634879 \n", + "std 0.796793 \n", + "mad 1.0 \n", + "skewness 0.019674 \n", + "excesskurtosis 1.575346 \n", + "range 2 \n", + "Prange 0.0 \n", + "n_outliers 0 \n", + "P75 2.0 \n", + "P25 0.0 \n", + "P10 0.0 \n", + "P90 2.0 \n", + "PN NaN \n", + "skewtest_a 0.255835 \n", + "skewtest_b 0.798078 \n", + "kurtosistest_a 103.150756 \n", + "kurtosistest_b 0.0 \n", + "normaltest_a 10640.143829 \n", + "normaltest_b 0.0 \n", + "jarque_bera_a 84.632762 \n", + "jarque_bera_b 0.0 \n", + 
"shapiro_a 0.80017 \n", + "shapiro_b 0.0 \n", + "anderson_a 79.026048 \n", + "anderson_b [0.574, 0.653, 0.784, 0.914, 1.088] \n", + "anderson_c [15.0, 10.0, 5.0, 2.5, 1.0] \n", + "energy 1613 \n", + "rms 1.270039 \n", + "entropy 1.581618 \n", + "autocorrelation -0.015942 \n", + "\n", + " E \n", + "size 1000 \n", + "non-null 1000 \n", + "nunique 3 \n", + "sum 1026 \n", + "min 0 \n", + "max 2 \n", + "first 1 \n", + "last 2 \n", + "mean 1.026 \n", + "median 1.0 \n", + "mode 1 \n", + "gmean 0.0 \n", + "hmean 0.0 \n", + "Pmean 0.0 \n", + "geothmetic meandian 0.0 \n", + "variance 0.649324 \n", + "std 0.805806 \n", + "mad 1.0 \n", + "skewness -0.04714 \n", + "excesskurtosis 1.541503 \n", + "range 2 \n", + "Prange 0.0 \n", + "n_outliers 0 \n", + "P75 2.0 \n", + "P25 0.0 \n", + "P10 0.0 \n", + "P90 2.0 \n", + "PN NaN \n", + "skewtest_a -0.612722 \n", + "skewtest_b 0.54006 \n", + "kurtosistest_a 92.396965 \n", + "kurtosistest_b 0.0 \n", + "normaltest_a 8537.574579 \n", + "normaltest_b 0.0 \n", + "jarque_bera_a 89.004259 \n", + "jarque_bera_b 0.0 \n", + "shapiro_a 0.79688 \n", + "shapiro_b 0.0 \n", + "anderson_a 80.265394 \n", + "anderson_b [0.574, 0.653, 0.784, 0.914, 1.088] \n", + "anderson_c [15.0, 10.0, 5.0, 2.5, 1.0] \n", + "energy 1702 \n", + "rms 1.304607 \n", + "entropy 1.583318 \n", + "autocorrelation -0.001003 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "univariate.build_univariate_statistics(df, agg=\"all\", n_jobs=-1).T" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABCDE
mean1.041.0261.0020.9891.026
median1.001.0001.0001.0001.000
\n", + "
" + ], + "text/plain": [ + " A B C D E\n", + "mean 1.04 1.026 1.002 0.989 1.026\n", + "median 1.00 1.000 1.000 1.000 1.000" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "univariate.build_univariate_statistics(df, agg=[\"mean\", \"median\"], n_jobs=1).T" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABCDE
somename1.041.0261.0020.9891.026
median1.001.0001.0001.0001.000
\n", + "
" + ], + "text/plain": [ + " A B C D E\n", + "somename 1.04 1.026 1.002 0.989 1.026\n", + "median 1.00 1.000 1.000 1.000 1.000" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "univariate.build_univariate_statistics(df, agg=[{'func':\"mean\", 'name':\"somename\"}, \"median\"], n_jobs=1).T" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABCDE
func_01.041.0261.0020.9891.026
mean1.041.0261.0020.9891.026
\n", + "
" + ], + "text/plain": [ + " A B C D E\n", + "func_0 1.04 1.026 1.002 0.989 1.026\n", + "mean 1.04 1.026 1.002 0.989 1.026" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "univariate.build_univariate_statistics(df, agg=[np.mean, \"mean\"], n_jobs=1).T" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABCDE
skewtest_a-0.951391-0.612722-0.0476140.255835-0.612722
skewtest_b0.3414060.5400600.9620240.7980780.540060
mean1.0400001.0260001.0020000.9890001.026000
\n", + "
" + ], + "text/plain": [ + " A B C D E\n", + "skewtest_a -0.951391 -0.612722 -0.047614 0.255835 -0.612722\n", + "skewtest_b 0.341406 0.540060 0.962024 0.798078 0.540060\n", + "mean 1.040000 1.026000 1.002000 0.989000 1.026000" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "univariate.build_univariate_statistics(df, agg=[\"skewtest\", \"mean\"], n_jobs=1).T" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABCDE
skewtest_stat-0.951391-0.612722-0.0476140.255835-0.612722
skewtest_p0.3414060.5400600.9620240.7980780.540060
mean1.0400001.0260001.0020000.9890001.026000
\n", + "
" + ], + "text/plain": [ + " A B C D E\n", + "skewtest_stat -0.951391 -0.612722 -0.047614 0.255835 -0.612722\n", + "skewtest_p 0.341406 0.540060 0.962024 0.798078 0.540060\n", + "mean 1.040000 1.026000 1.002000 0.989000 1.026000" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "univariate.build_univariate_statistics(df, agg=[{'func':\"skewtest\", \"ret_names\":[\"stat\", \"p\"]}, \"mean\"], n_jobs=1).T" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv_microwave (3.13.2)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..2418f21 Binary files /dev/null and b/requirements.txt differ