{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [
 "import os\n",
 "from pathlib import Path\n",
 "\n",
 "import numpy as np\n",
 "import pandas as pd\n",
 "\n",
 "# Move to the parent directory so the local `microwave` package can be imported.\n",
 "# The guard keeps this cell idempotent: a bare `%cd ..` climbs one more level on\n",
 "# every re-run and echoes the absolute local path into the saved output.\n",
 "# NOTE(review): assumes the `microwave` package directory sits in that parent\n",
 "# directory -- confirm against the repository layout.\n",
 "if not (Path.cwd() / \"microwave\").is_dir():\n",
 "    os.chdir(\"..\")\n",
 "\n",
 "import microwave.data_analysis.univariate as univariate"
 ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "dict_keys(['size', 'non-null', 'nunique', 'sum', 'min', 'max', 'first', 'last', 'mean', 'median', 'mode', 'gmean', 'hmean', 'Pmean', 'geothmetic meandian', 'variance', 'std', 'mad', 'skewness', 'excesskurtosis', 'range', 'Prange', 'n_outliers', 'P75', 'P25', 'P10', 'P90', 'PN', 'skewtest', 'kurtosistest', 'normaltest', 'jarque_bera', 'shapiro', 'anderson', 'energy', 'rms', 'entropy', 'autocorrelation'])" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Show the keys of AGGFUNCCODES; the cells below pass some of these names via `agg`.\n",
 "univariate.AGGFUNCCODES.keys()"
 ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCDE
001211
111121
202022
320210
422122
..................
99501221
99612112
99710010
99820110
99922012
\n", "

1000 rows × 5 columns

\n", "
" ], "text/plain": [ " A B C D E\n", "0 0 1 2 1 1\n", "1 1 1 1 2 1\n", "2 0 2 0 2 2\n", "3 2 0 2 1 0\n", "4 2 2 1 2 2\n", ".. .. .. .. .. ..\n", "995 0 1 2 2 1\n", "996 1 2 1 1 2\n", "997 1 0 0 1 0\n", "998 2 0 1 1 0\n", "999 2 2 0 1 2\n", "\n", "[1000 rows x 5 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Fixed seed so that Restart & Run All rebuilds the same demo frame every time.\n",
 "SEED = 0\n",
 "rng = np.random.default_rng(SEED)\n",
 "\n",
 "# 1000 rows of integers drawn from {0, 1, 2} (upper bound 3 is exclusive), columns A-D.\n",
 "df = pd.DataFrame(rng.integers(0, 3, size=(1000, 4)), columns=list('ABCD'))\n",
 "# E is a copy of B, so both columns must yield identical statistics below.\n",
 "df['E'] = df['B']\n",
 "df"
 ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCDE
size10001000100010001000
non-null10001000100010001000
nunique33333
sum1040102610029891026
min00000
max22222
first01211
last22012
mean1.041.0261.0020.9891.026
median1.01.01.01.01.0
mode21111
gmean0.00.00.00.00.0
hmean0.00.00.00.00.0
Pmean0.00.00.00.00.0
geothmetic meandian0.00.00.00.00.0
variance0.66040.6493240.6619960.6348790.649324
std0.812650.8058060.8136310.7967930.805806
mad1.01.01.01.01.0
skewness-0.073251-0.04714-0.0036610.019674-0.04714
excesskurtosis1.5177821.5415031.5105921.5753461.541503
range22222
Prange0.00.00.00.00.0
n_outliers00000
P752.02.02.02.02.0
P250.00.00.00.00.0
P100.00.00.00.00.0
P902.02.02.02.02.0
PNNaNNaNNaNNaNNaN
skewtest_a-0.951391-0.612722-0.0476140.255835-0.612722
skewtest_b0.3414060.540060.9620240.7980780.54006
kurtosistest_a87.59211992.39696586.38827103.15075692.396965
kurtosistest_b0.00.00.00.00.0
normaltest_a7673.2844258537.5745797462.93545210640.1438298537.574579
normaltest_b0.00.00.00.00.0
jarque_bera_a92.434789.00425992.43290684.63276289.004259
jarque_bera_b0.00.00.00.00.0
shapiro_a0.7938140.796880.794310.800170.79688
shapiro_b0.00.00.00.00.0
anderson_a81.55296180.26539481.25707879.02604880.265394
anderson_b[0.574, 0.653, 0.784, 0.914, 1.088][0.574, 0.653, 0.784, 0.914, 1.088][0.574, 0.653, 0.784, 0.914, 1.088][0.574, 0.653, 0.784, 0.914, 1.088][0.574, 0.653, 0.784, 0.914, 1.088]
anderson_c[15.0, 10.0, 5.0, 2.5, 1.0][15.0, 10.0, 5.0, 2.5, 1.0][15.0, 10.0, 5.0, 2.5, 1.0][15.0, 10.0, 5.0, 2.5, 1.0][15.0, 10.0, 5.0, 2.5, 1.0]
energy17421702166616131702
rms1.3198481.3046071.2907361.2700391.304607
entropy1.5831471.5833181.5848881.5816181.583318
autocorrelation-0.008494-0.0010030.001508-0.015942-0.001003
\n", "
" ], "text/plain": [ " A \\\n", "size 1000 \n", "non-null 1000 \n", "nunique 3 \n", "sum 1040 \n", "min 0 \n", "max 2 \n", "first 0 \n", "last 2 \n", "mean 1.04 \n", "median 1.0 \n", "mode 2 \n", "gmean 0.0 \n", "hmean 0.0 \n", "Pmean 0.0 \n", "geothmetic meandian 0.0 \n", "variance 0.6604 \n", "std 0.81265 \n", "mad 1.0 \n", "skewness -0.073251 \n", "excesskurtosis 1.517782 \n", "range 2 \n", "Prange 0.0 \n", "n_outliers 0 \n", "P75 2.0 \n", "P25 0.0 \n", "P10 0.0 \n", "P90 2.0 \n", "PN NaN \n", "skewtest_a -0.951391 \n", "skewtest_b 0.341406 \n", "kurtosistest_a 87.592119 \n", "kurtosistest_b 0.0 \n", "normaltest_a 7673.284425 \n", "normaltest_b 0.0 \n", "jarque_bera_a 92.4347 \n", "jarque_bera_b 0.0 \n", "shapiro_a 0.793814 \n", "shapiro_b 0.0 \n", "anderson_a 81.552961 \n", "anderson_b [0.574, 0.653, 0.784, 0.914, 1.088] \n", "anderson_c [15.0, 10.0, 5.0, 2.5, 1.0] \n", "energy 1742 \n", "rms 1.319848 \n", "entropy 1.583147 \n", "autocorrelation -0.008494 \n", "\n", " B \\\n", "size 1000 \n", "non-null 1000 \n", "nunique 3 \n", "sum 1026 \n", "min 0 \n", "max 2 \n", "first 1 \n", "last 2 \n", "mean 1.026 \n", "median 1.0 \n", "mode 1 \n", "gmean 0.0 \n", "hmean 0.0 \n", "Pmean 0.0 \n", "geothmetic meandian 0.0 \n", "variance 0.649324 \n", "std 0.805806 \n", "mad 1.0 \n", "skewness -0.04714 \n", "excesskurtosis 1.541503 \n", "range 2 \n", "Prange 0.0 \n", "n_outliers 0 \n", "P75 2.0 \n", "P25 0.0 \n", "P10 0.0 \n", "P90 2.0 \n", "PN NaN \n", "skewtest_a -0.612722 \n", "skewtest_b 0.54006 \n", "kurtosistest_a 92.396965 \n", "kurtosistest_b 0.0 \n", "normaltest_a 8537.574579 \n", "normaltest_b 0.0 \n", "jarque_bera_a 89.004259 \n", "jarque_bera_b 0.0 \n", "shapiro_a 0.79688 \n", "shapiro_b 0.0 \n", "anderson_a 80.265394 \n", "anderson_b [0.574, 0.653, 0.784, 0.914, 1.088] \n", "anderson_c [15.0, 10.0, 5.0, 2.5, 1.0] \n", "energy 1702 \n", "rms 1.304607 \n", "entropy 1.583318 \n", "autocorrelation -0.001003 \n", "\n", " C \\\n", "size 1000 \n", "non-null 1000 \n", 
"nunique 3 \n", "sum 1002 \n", "min 0 \n", "max 2 \n", "first 2 \n", "last 0 \n", "mean 1.002 \n", "median 1.0 \n", "mode 1 \n", "gmean 0.0 \n", "hmean 0.0 \n", "Pmean 0.0 \n", "geothmetic meandian 0.0 \n", "variance 0.661996 \n", "std 0.813631 \n", "mad 1.0 \n", "skewness -0.003661 \n", "excesskurtosis 1.510592 \n", "range 2 \n", "Prange 0.0 \n", "n_outliers 0 \n", "P75 2.0 \n", "P25 0.0 \n", "P10 0.0 \n", "P90 2.0 \n", "PN NaN \n", "skewtest_a -0.047614 \n", "skewtest_b 0.962024 \n", "kurtosistest_a 86.38827 \n", "kurtosistest_b 0.0 \n", "normaltest_a 7462.935452 \n", "normaltest_b 0.0 \n", "jarque_bera_a 92.432906 \n", "jarque_bera_b 0.0 \n", "shapiro_a 0.79431 \n", "shapiro_b 0.0 \n", "anderson_a 81.257078 \n", "anderson_b [0.574, 0.653, 0.784, 0.914, 1.088] \n", "anderson_c [15.0, 10.0, 5.0, 2.5, 1.0] \n", "energy 1666 \n", "rms 1.290736 \n", "entropy 1.584888 \n", "autocorrelation 0.001508 \n", "\n", " D \\\n", "size 1000 \n", "non-null 1000 \n", "nunique 3 \n", "sum 989 \n", "min 0 \n", "max 2 \n", "first 1 \n", "last 1 \n", "mean 0.989 \n", "median 1.0 \n", "mode 1 \n", "gmean 0.0 \n", "hmean 0.0 \n", "Pmean 0.0 \n", "geothmetic meandian 0.0 \n", "variance 0.634879 \n", "std 0.796793 \n", "mad 1.0 \n", "skewness 0.019674 \n", "excesskurtosis 1.575346 \n", "range 2 \n", "Prange 0.0 \n", "n_outliers 0 \n", "P75 2.0 \n", "P25 0.0 \n", "P10 0.0 \n", "P90 2.0 \n", "PN NaN \n", "skewtest_a 0.255835 \n", "skewtest_b 0.798078 \n", "kurtosistest_a 103.150756 \n", "kurtosistest_b 0.0 \n", "normaltest_a 10640.143829 \n", "normaltest_b 0.0 \n", "jarque_bera_a 84.632762 \n", "jarque_bera_b 0.0 \n", "shapiro_a 0.80017 \n", "shapiro_b 0.0 \n", "anderson_a 79.026048 \n", "anderson_b [0.574, 0.653, 0.784, 0.914, 1.088] \n", "anderson_c [15.0, 10.0, 5.0, 2.5, 1.0] \n", "energy 1613 \n", "rms 1.270039 \n", "entropy 1.581618 \n", "autocorrelation -0.015942 \n", "\n", " E \n", "size 1000 \n", "non-null 1000 \n", "nunique 3 \n", "sum 1026 \n", "min 0 \n", "max 2 \n", "first 1 
\n", "last 2 \n", "mean 1.026 \n", "median 1.0 \n", "mode 1 \n", "gmean 0.0 \n", "hmean 0.0 \n", "Pmean 0.0 \n", "geothmetic meandian 0.0 \n", "variance 0.649324 \n", "std 0.805806 \n", "mad 1.0 \n", "skewness -0.04714 \n", "excesskurtosis 1.541503 \n", "range 2 \n", "Prange 0.0 \n", "n_outliers 0 \n", "P75 2.0 \n", "P25 0.0 \n", "P10 0.0 \n", "P90 2.0 \n", "PN NaN \n", "skewtest_a -0.612722 \n", "skewtest_b 0.54006 \n", "kurtosistest_a 92.396965 \n", "kurtosistest_b 0.0 \n", "normaltest_a 8537.574579 \n", "normaltest_b 0.0 \n", "jarque_bera_a 89.004259 \n", "jarque_bera_b 0.0 \n", "shapiro_a 0.79688 \n", "shapiro_b 0.0 \n", "anderson_a 80.265394 \n", "anderson_b [0.574, 0.653, 0.784, 0.914, 1.088] \n", "anderson_c [15.0, 10.0, 5.0, 2.5, 1.0] \n", "energy 1702 \n", "rms 1.304607 \n", "entropy 1.583318 \n", "autocorrelation -0.001003 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# agg=\"all\" requests every registered statistic; the result is transposed so that\n",
 "# statistics are rows and the frame's columns stay columns.\n",
 "# NOTE(review): n_jobs=-1 presumably means \"use all available workers\" -- confirm.\n",
 "univariate.build_univariate_statistics(\n",
 "    df,\n",
 "    agg=\"all\",\n",
 "    n_jobs=-1,\n",
 ").T"
 ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCDE
mean1.041.0261.0020.9891.026
median1.001.0001.0001.0001.000
\n", "
" ], "text/plain": [ " A B C D E\n", "mean 1.04 1.026 1.002 0.989 1.026\n", "median 1.00 1.000 1.000 1.000 1.000" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Select statistics by their string codes: one output row per requested code.\n",
 "univariate.build_univariate_statistics(\n",
 "    df,\n",
 "    agg=[\"mean\", \"median\"],\n",
 "    n_jobs=1,\n",
 ").T"
 ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCDE
somename1.041.0261.0020.9891.026
median1.001.0001.0001.0001.000
\n", "
" ], "text/plain": [ " A B C D E\n", "somename 1.04 1.026 1.002 0.989 1.026\n", "median 1.00 1.000 1.000 1.000 1.000" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# A dict entry customises one statistic: 'func' picks the aggregation and 'name'\n",
 "# replaces its row label (the mean is reported as `somename`).\n",
 "univariate.build_univariate_statistics(\n",
 "    df,\n",
 "    agg=[{'func': \"mean\", 'name': \"somename\"}, \"median\"],\n",
 "    n_jobs=1,\n",
 ").T"
 ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCDE
func_01.041.0261.0020.9891.026
mean1.041.0261.0020.9891.026
\n", "
" ], "text/plain": [ " A B C D E\n", "func_0 1.04 1.026 1.002 0.989 1.026\n", "mean 1.04 1.026 1.002 0.989 1.026" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# A bare callable can be mixed with string codes; it receives a generated row\n",
 "# label (`func_0` here) and should agree with the built-in \"mean\" row.\n",
 "univariate.build_univariate_statistics(\n",
 "    df,\n",
 "    agg=[np.mean, \"mean\"],\n",
 "    n_jobs=1,\n",
 ").T"
 ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCDE
skewtest_a-0.951391-0.612722-0.0476140.255835-0.612722
skewtest_b0.3414060.5400600.9620240.7980780.540060
mean1.0400001.0260001.0020000.9890001.026000
\n", "
" ], "text/plain": [ " A B C D E\n", "skewtest_a -0.951391 -0.612722 -0.047614 0.255835 -0.612722\n", "skewtest_b 0.341406 0.540060 0.962024 0.798078 0.540060\n", "mean 1.040000 1.026000 1.002000 0.989000 1.026000" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# A statistic that returns several values is expanded into one row per value,\n",
 "# labelled with `_a`, `_b`, ... suffixes (skewtest -> skewtest_a, skewtest_b).\n",
 "univariate.build_univariate_statistics(\n",
 "    df,\n",
 "    agg=[\"skewtest\", \"mean\"],\n",
 "    n_jobs=1,\n",
 ").T"
 ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCDE
skewtest_stat-0.951391-0.612722-0.0476140.255835-0.612722
skewtest_p0.3414060.5400600.9620240.7980780.540060
mean1.0400001.0260001.0020000.9890001.026000
\n", "
" ], "text/plain": [ " A B C D E\n", "skewtest_stat -0.951391 -0.612722 -0.047614 0.255835 -0.612722\n", "skewtest_p 0.341406 0.540060 0.962024 0.798078 0.540060\n", "mean 1.040000 1.026000 1.002000 0.989000 1.026000" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# \"ret_names\" supplies the suffixes for a multi-value statistic, replacing the\n",
 "# default `_a` / `_b` labels (rows become skewtest_stat and skewtest_p).\n",
 "univariate.build_univariate_statistics(\n",
 "    df,\n",
 "    agg=[{'func': \"skewtest\", \"ret_names\": [\"stat\", \"p\"]}, \"mean\"],\n",
 "    n_jobs=1,\n",
 ").T"
 ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": ".venv_microwave (3.13.2)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.13.2" } }, "nbformat": 4, "nbformat_minor": 2 }