Files
microwave/notebooks/demo_processing.ipynb
2025-02-27 13:46:56 +01:00

726 lines
21 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import string"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>A</th>\n",
" <th>B</th>\n",
" <th>C</th>\n",
" <th>D</th>\n",
" <th>0</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>b</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>c</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>d</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>d</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>c</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>d</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>b</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>c</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>c</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>b</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" A B C D 0\n",
"0 0 2 2 2 b\n",
"1 1 2 1 1 c\n",
"2 1 0 1 1 d\n",
"3 0 0 1 0 d\n",
"4 2 1 2 2 c\n",
"5 0 0 0 0 d\n",
"6 0 2 2 2 a\n",
"7 0 2 0 0 a\n",
"8 0 1 0 0 a\n",
"9 0 2 2 1 b\n",
"10 2 2 0 1 c\n",
"11 2 1 1 1 a\n",
"12 0 1 0 2 a\n",
"13 2 1 0 1 c\n",
"14 1 0 0 1 b"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame(np.random.randint(0,3,size=(15, 4)), columns=list('ABCD'))\n",
"df = pd.concat([df, pd.DataFrame(np.random.choice(list(string.ascii_letters)[:4], size=15, replace=True))], axis=1)\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"c:\\Users\\Edouard\\Documents\\Git\\microwave\n"
]
}
],
"source": [
"%cd ..\n",
"import microwave.data_processing as dp"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>A</th>\n",
" <th>B</th>\n",
" <th>C</th>\n",
" <th>D</th>\n",
" <th>0_encoded_a</th>\n",
" <th>0_encoded_b</th>\n",
" <th>0_encoded_c</th>\n",
" <th>0_encoded_d</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>0.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>0.0</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>0.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" A B C D 0_encoded_a 0_encoded_b 0_encoded_c 0_encoded_d\n",
"0 0.0 2.0 2.0 2.0 0.0 1.0 0.0 0.0\n",
"1 1.0 2.0 1.0 1.0 0.0 0.0 1.0 0.0\n",
"2 1.0 0.0 1.0 1.0 0.0 0.0 0.0 1.0\n",
"3 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0\n",
"4 2.0 1.0 2.0 2.0 0.0 0.0 1.0 0.0\n",
"5 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0\n",
"6 0.0 2.0 2.0 2.0 1.0 0.0 0.0 0.0\n",
"7 0.0 2.0 0.0 0.0 1.0 0.0 0.0 0.0\n",
"8 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0\n",
"9 0.0 2.0 2.0 1.0 0.0 1.0 0.0 0.0\n",
"10 2.0 2.0 0.0 1.0 0.0 0.0 1.0 0.0\n",
"11 2.0 1.0 1.0 1.0 1.0 0.0 0.0 0.0\n",
"12 0.0 1.0 0.0 2.0 1.0 0.0 0.0 0.0\n",
"13 2.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0\n",
"14 1.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"new_df, trans = dp.df_to_numeric(df)\n",
"new_df"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[(0,\n",
" OneHotEncoder(sparse_output=False),\n",
" ['0_encoded_a', '0_encoded_b', '0_encoded_c', '0_encoded_d']),\n",
" ('A',\n",
" FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),\n",
" ['column']),\n",
" ('B',\n",
" FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),\n",
" ['column']),\n",
" ('C',\n",
" FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),\n",
" ['column']),\n",
" ('D',\n",
" FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),\n",
" ['column']),\n",
" ('0_encoded_a',\n",
" FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),\n",
" ['column']),\n",
" ('0_encoded_b',\n",
" FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),\n",
" ['column']),\n",
" ('0_encoded_c',\n",
" FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),\n",
" ['column']),\n",
" ('0_encoded_d',\n",
" FunctionTransformer(func=<function df_to_numeric.<locals>.<lambda> at 0x000001B4F3F920C0>),\n",
" ['column'])]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"trans.transforms"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\Edouard\\Documents\\Git\\microwave\\.venv_microwave\\Lib\\site-packages\\sklearn\\preprocessing\\_label.py:110: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n",
" y = column_or_1d(y, warn=True)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>A</th>\n",
" <th>B</th>\n",
" <th>C</th>\n",
" <th>D</th>\n",
" <th>0_encoded</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>0.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>0.0</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>0.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" A B C D 0_encoded\n",
"0 0.0 2.0 2.0 2.0 1.0\n",
"1 1.0 2.0 1.0 1.0 2.0\n",
"2 1.0 0.0 1.0 1.0 3.0\n",
"3 0.0 0.0 1.0 0.0 3.0\n",
"4 2.0 1.0 2.0 2.0 2.0\n",
"5 0.0 0.0 0.0 0.0 3.0\n",
"6 0.0 2.0 2.0 2.0 0.0\n",
"7 0.0 2.0 0.0 0.0 0.0\n",
"8 0.0 1.0 0.0 0.0 0.0\n",
"9 0.0 2.0 2.0 1.0 1.0\n",
"10 2.0 2.0 0.0 1.0 2.0\n",
"11 2.0 1.0 1.0 1.0 0.0\n",
"12 0.0 1.0 0.0 2.0 0.0\n",
"13 2.0 1.0 0.0 1.0 2.0\n",
"14 1.0 0.0 0.0 1.0 1.0"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"new_df, trans = dp.df_to_numeric(df, encoding=\"label\")\n",
"new_df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv_microwave (3.13.2)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}