Files
RamanClassifier/data/data_processing.ipynb

143 lines
19 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "initial_id",
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2024-03-12T10:19:14.020553600Z",
"start_time": "2024-03-12T10:19:13.165301100Z"
}
},
"outputs": [],
"source": [
"from data import load_raw_data, data_to_single_df\n",
"from sklearn.model_selection import train_test_split\n",
"import os"
]
},
{
"cell_type": "code",
"outputs": [
{
"data": {
"text/plain": " strain replica phase objective substrate laser_power[%] grating \\\n0 A390 1.0 liquid 50 alu 100 1800 \n1 A390 1.0 liquid 50 alu 100 1800 \n2 A390 1.0 liquid 50 alu 100 1800 \n3 A390 1.0 liquid 50 alu 100 1800 \n4 A390 1.0 liquid 50 alu 100 1800 \n.. ... ... ... ... ... ... ... \n700 F113 2.0 solid 50 alu 100 1800 \n701 F113 2.0 solid 50 alu 100 1800 \n702 F113 2.0 solid 50 alu 100 1800 \n703 F113 2.0 solid 50 alu 100 1800 \n704 F113 2.0 solid 50 alu 100 1800 \n\n exposition[sec] confocalhigh accumulations \n0 20 True 2 \n1 20 True 2 \n2 20 True 2 \n3 20 True 2 \n4 20 True 2 \n.. ... ... ... \n700 20 True 2 \n701 20 True 2 \n702 20 True 2 \n703 20 True 2 \n704 20 True 2 \n\n[705 rows x 10 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>strain</th>\n <th>replica</th>\n <th>phase</th>\n <th>objective</th>\n <th>substrate</th>\n <th>laser_power[%]</th>\n <th>grating</th>\n <th>exposition[sec]</th>\n <th>confocalhigh</th>\n <th>accumulations</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>A390</td>\n <td>1.0</td>\n <td>liquid</td>\n <td>50</td>\n <td>alu</td>\n <td>100</td>\n <td>1800</td>\n <td>20</td>\n <td>True</td>\n <td>2</td>\n </tr>\n <tr>\n <th>1</th>\n <td>A390</td>\n <td>1.0</td>\n <td>liquid</td>\n <td>50</td>\n <td>alu</td>\n <td>100</td>\n <td>1800</td>\n <td>20</td>\n <td>True</td>\n <td>2</td>\n </tr>\n <tr>\n <th>2</th>\n <td>A390</td>\n <td>1.0</td>\n <td>liquid</td>\n <td>50</td>\n <td>alu</td>\n <td>100</td>\n <td>1800</td>\n <td>20</td>\n <td>True</td>\n <td>2</td>\n </tr>\n <tr>\n <th>3</th>\n <td>A390</td>\n <td>1.0</td>\n <td>liquid</td>\n <td>50</td>\n <td>alu</td>\n <td>100</td>\n <td>1800</td>\n <td>20</td>\n <td>True</td>\n <td>2</td>\n </tr>\n <tr>\n <th>4</th>\n <td>A390</td>\n <td>1.0</td>\n <td>liquid</td>\n <td>50</td>\n <td>alu</td>\n <td>100</td>\n <td>1800</td>\n <td>20</td>\n <td>True</td>\n <td>2</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>700</th>\n <td>F113</td>\n <td>2.0</td>\n <td>solid</td>\n <td>50</td>\n <td>alu</td>\n <td>100</td>\n <td>1800</td>\n <td>20</td>\n <td>True</td>\n <td>2</td>\n </tr>\n <tr>\n <th>701</th>\n <td>F113</td>\n <td>2.0</td>\n <td>solid</td>\n <td>50</td>\n <td>alu</td>\n <td>100</td>\n <td>1800</td>\n <td>20</td>\n <td>True</td>\n 
<td>2</td>\n </tr>\n <tr>\n <th>702</th>\n <td>F113</td>\n <td>2.0</td>\n <td>solid</td>\n <td>50</td>\n <td>alu</td>\n <td>100</td>\n <td>1800</td>\n <td>20</td>\n <td>True</td>\n <td>2</td>\n </tr>\n <tr>\n <th>703</th>\n <td>F113</td>\n <td>2.0</td>\n <td>solid</td>\n <td>50</td>\n <td>alu</td>\n <td>100</td>\n <td>1800</td>\n <td>20</td>\n <td>True</td>\n <td>2</td>\n </tr>\n <tr>\n <th>704</th>\n <td>F113</td>\n <td>2.0</td>\n <td>solid</td>\n <td>50</td>\n <td>alu</td>\n <td>100</td>\n <td>1800</td>\n <td>20</td>\n <td>True</td>\n <td>2</td>\n </tr>\n </tbody>\n</table>\n<p>705 rows × 10 columns</p>\n</div>"
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Keep the loader's raw output under its own name so that `sliced_experiments`\n",
"# only ever refers to the combined DataFrame returned by data_to_single_df.\n",
"df_metadata, experiments_raw = load_raw_data()\n",
"df_metadata = df_metadata.reset_index(drop=True)\n",
"sliced_experiments = data_to_single_df(experiments_raw)\n",
"\n",
"# train_test_split (used later in this notebook) pairs the two tables by row\n",
"# position, so they must contain the same number of rows.\n",
"assert len(sliced_experiments) == len(df_metadata), \"experiments/metadata row count mismatch\"\n",
"\n",
"df_metadata"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-12T10:19:15.770433700Z",
"start_time": "2024-03-12T10:19:14.494301500Z"
}
},
"id": "8ad20a80c1220f67",
"execution_count": 2
},
{
"cell_type": "code",
"outputs": [
{
"data": {
"text/plain": " 0 1 2 3 4 \\\n0 1927.490234 1925.943359 1924.396484 1922.849609 1921.300781 \n1 1927.490234 1925.943359 1924.396484 1922.849609 1921.300781 \n2 1927.490234 1925.943359 1924.396484 1922.849609 1921.300781 \n3 1927.490234 1925.943359 1924.396484 1922.849609 1921.300781 \n4 1927.490234 1925.943359 1924.396484 1922.849609 1921.300781 \n.. ... ... ... ... ... \n700 1927.423828 1925.876953 1924.330078 1922.783203 1921.234375 \n701 1927.423828 1925.876953 1924.330078 1922.783203 1921.234375 \n702 1927.423828 1925.876953 1924.330078 1922.783203 1921.234375 \n703 1927.423828 1925.876953 1924.330078 1922.783203 1921.234375 \n704 1927.423828 1925.876953 1924.330078 1922.783203 1921.234375 \n\n 5 6 7 8 9 ... \\\n0 1919.753906 1918.205078 1916.656250 1915.107422 1913.558594 ... \n1 1919.753906 1918.205078 1916.656250 1915.107422 1913.558594 ... \n2 1919.753906 1918.205078 1916.656250 1915.107422 1913.558594 ... \n3 1919.753906 1918.205078 1916.656250 1915.107422 1913.558594 ... \n4 1919.753906 1918.205078 1916.656250 1915.107422 1913.558594 ... \n.. ... ... ... ... ... ... \n700 1919.687500 1918.138672 1916.589844 1915.041016 1913.490234 ... \n701 1919.687500 1918.138672 1916.589844 1915.041016 1913.490234 ... \n702 1919.687500 1918.138672 1916.589844 1915.041016 1913.490234 ... \n703 1919.687500 1918.138672 1916.589844 1915.041016 1913.490234 ... \n704 1919.687500 1918.138672 1916.589844 1915.041016 1913.490234 ... \n\n 2012 2013 2014 2015 2016 \\\n0 14854.949219 14755.791992 15042.042969 14817.480469 15048.003906 \n1 11034.476563 11166.440430 11212.995117 11198.762695 11132.833984 \n2 14067.049805 13951.259766 14187.263672 14401.179688 14406.893555 \n3 12235.085938 12234.838867 12411.754883 12335.187500 12559.822266 \n4 10723.974609 10647.762695 10770.733398 10733.333008 10856.226563 \n.. ... ... ... ... ... 
\n700 4422.035645 4389.968262 4558.354492 4537.906250 4548.488770 \n701 4107.654297 4176.549316 4240.239258 4285.800293 4296.440918 \n702 3944.642090 3987.705566 3986.781982 3965.173340 4027.590332 \n703 4010.623291 3954.075928 4013.938232 4073.772705 4087.047607 \n704 4136.116699 4043.323975 4178.168457 4170.736328 4165.893555 \n\n 2017 2018 2019 2020 2021 \n0 14986.369141 14955.769531 15187.391602 15270.399414 15189.403320 \n1 11327.973633 11351.191406 11524.231445 11493.155273 11546.012695 \n2 14457.833984 14530.713867 14599.684570 14753.853516 14849.852539 \n3 12643.497070 12804.650391 12725.479492 12832.300781 12975.223633 \n4 11025.583984 11052.748047 11126.398438 11140.612305 11225.828125 \n.. ... ... ... ... ... \n700 4538.390137 4565.762695 4717.121094 4713.447266 4722.685547 \n701 4418.210449 4392.641113 4470.415527 4509.413086 4470.929688 \n702 4017.612305 4123.914551 4246.958984 4199.487793 4242.412598 \n703 4157.175293 4198.847656 4170.751465 4209.818848 4285.017090 \n704 4211.450195 4167.840820 4290.875488 4482.294922 4344.405762 \n\n[705 rows x 2022 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>0</th>\n <th>1</th>\n <th>2</th>\n <th>3</th>\n <th>4</th>\n <th>5</th>\n <th>6</th>\n <th>7</th>\n <th>8</th>\n <th>9</th>\n <th>...</th>\n <th>2012</th>\n <th>2013</th>\n <th>2014</th>\n <th>2015</th>\n <th>2016</th>\n <th>2017</th>\n <th>2018</th>\n <th>2019</th>\n <th>2020</th>\n <th>2021</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1927.490234</td>\n <td>1925.943359</td>\n <td>1924.396484</td>\n <td>1922.849609</td>\n <td>1921.300781</td>\n <td>1919.753906</td>\n <td>1918.205078</td>\n <td>1916.656250</td>\n <td>1915.107422</td>\n <td>1913.558594</td>\n <td>...</td>\n <td>14854.949219</td>\n <td>14755.791992</td>\n <td>15042.042969</td>\n <td>14817.480469</td>\n <td>15048.003906</td>\n <td>14986.369141</td>\n <td>14955.769531</td>\n <td>15187.391602</td>\n <td>15270.399414</td>\n <td>15189.403320</td>\n </tr>\n <tr>\n <th>1</th>\n <td>1927.490234</td>\n <td>1925.943359</td>\n <td>1924.396484</td>\n <td>1922.849609</td>\n <td>1921.300781</td>\n <td>1919.753906</td>\n <td>1918.205078</td>\n <td>1916.656250</td>\n <td>1915.107422</td>\n <td>1913.558594</td>\n <td>...</td>\n <td>11034.476563</td>\n <td>11166.440430</td>\n <td>11212.995117</td>\n <td>11198.762695</td>\n <td>11132.833984</td>\n <td>11327.973633</td>\n <td>11351.191406</td>\n <td>11524.231445</td>\n <td>11493.155273</td>\n <td>11546.012695</td>\n </tr>\n <tr>\n <th>2</th>\n <td>1927.490234</td>\n <td>1925.943359</td>\n <td>1924.396484</td>\n <td>1922.849609</td>\n <td>1921.300781</td>\n <td>1919.753906</td>\n <td>1918.205078</td>\n <td>1916.656250</td>\n <td>1915.107422</td>\n <td>1913.558594</td>\n <td>...</td>\n <td>14067.049805</td>\n <td>13951.259766</td>\n 
<td>14187.263672</td>\n <td>14401.179688</td>\n <td>14406.893555</td>\n <td>14457.833984</td>\n <td>14530.713867</td>\n <td>14599.684570</td>\n <td>14753.853516</td>\n <td>14849.852539</td>\n </tr>\n <tr>\n <th>3</th>\n <td>1927.490234</td>\n <td>1925.943359</td>\n <td>1924.396484</td>\n <td>1922.849609</td>\n <td>1921.300781</td>\n <td>1919.753906</td>\n <td>1918.205078</td>\n <td>1916.656250</td>\n <td>1915.107422</td>\n <td>1913.558594</td>\n <td>...</td>\n <td>12235.085938</td>\n <td>12234.838867</td>\n <td>12411.754883</td>\n <td>12335.187500</td>\n <td>12559.822266</td>\n <td>12643.497070</td>\n <td>12804.650391</td>\n <td>12725.479492</td>\n <td>12832.300781</td>\n <td>12975.223633</td>\n </tr>\n <tr>\n <th>4</th>\n <td>1927.490234</td>\n <td>1925.943359</td>\n <td>1924.396484</td>\n <td>1922.849609</td>\n <td>1921.300781</td>\n <td>1919.753906</td>\n <td>1918.205078</td>\n <td>1916.656250</td>\n <td>1915.107422</td>\n <td>1913.558594</td>\n <td>...</td>\n <td>10723.974609</td>\n <td>10647.762695</td>\n <td>10770.733398</td>\n <td>10733.333008</td>\n <td>10856.226563</td>\n <td>11025.583984</td>\n <td>11052.748047</td>\n <td>11126.398438</td>\n <td>11140.612305</td>\n <td>11225.828125</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>700</th>\n <td>1927.423828</td>\n <td>1925.876953</td>\n <td>1924.330078</td>\n <td>1922.783203</td>\n <td>1921.234375</td>\n <td>1919.687500</td>\n <td>1918.138672</td>\n <td>1916.589844</td>\n <td>1915.041016</td>\n <td>1913.490234</td>\n <td>...</td>\n <td>4422.035645</td>\n <td>4389.968262</td>\n <td>4558.354492</td>\n <td>4537.906250</td>\n <td>4548.488770</td>\n <td>4538.390137</td>\n <td>4565.762695</td>\n 
<td>4717.121094</td>\n <td>4713.447266</td>\n <td>4722.685547</td>\n </tr>\n <tr>\n <th>701</th>\n <td>1927.423828</td>\n <td>1925.876953</td>\n <td>1924.330078</td>\n <td>1922.783203</td>\n <td>1921.234375</td>\n <td>1919.687500</td>\n <td>1918.138672</td>\n <td>1916.589844</td>\n <td>1915.041016</td>\n <td>1913.490234</td>\n <td>...</td>\n <td>4107.654297</td>\n <td>4176.549316</td>\n <td>4240.239258</td>\n <td>4285.800293</td>\n <td>4296.440918</td>\n <td>4418.210449</td>\n <td>4392.641113</td>\n <td>4470.415527</td>\n <td>4509.413086</td>\n <td>4470.929688</td>\n </tr>\n <tr>\n <th>702</th>\n <td>1927.423828</td>\n <td>1925.876953</td>\n <td>1924.330078</td>\n <td>1922.783203</td>\n <td>1921.234375</td>\n <td>1919.687500</td>\n <td>1918.138672</td>\n <td>1916.589844</td>\n <td>1915.041016</td>\n <td>1913.490234</td>\n <td>...</td>\n <td>3944.642090</td>\n <td>3987.705566</td>\n <td>3986.781982</td>\n <td>3965.173340</td>\n <td>4027.590332</td>\n <td>4017.612305</td>\n <td>4123.914551</td>\n <td>4246.958984</td>\n <td>4199.487793</td>\n <td>4242.412598</td>\n </tr>\n <tr>\n <th>703</th>\n <td>1927.423828</td>\n <td>1925.876953</td>\n <td>1924.330078</td>\n <td>1922.783203</td>\n <td>1921.234375</td>\n <td>1919.687500</td>\n <td>1918.138672</td>\n <td>1916.589844</td>\n <td>1915.041016</td>\n <td>1913.490234</td>\n <td>...</td>\n <td>4010.623291</td>\n <td>3954.075928</td>\n <td>4013.938232</td>\n <td>4073.772705</td>\n <td>4087.047607</td>\n <td>4157.175293</td>\n <td>4198.847656</td>\n <td>4170.751465</td>\n <td>4209.818848</td>\n <td>4285.017090</td>\n </tr>\n <tr>\n <th>704</th>\n <td>1927.423828</td>\n <td>1925.876953</td>\n <td>1924.330078</td>\n <td>1922.783203</td>\n <td>1921.234375</td>\n <td>1919.687500</td>\n <td>1918.138672</td>\n <td>1916.589844</td>\n <td>1915.041016</td>\n <td>1913.490234</td>\n <td>...</td>\n <td>4136.116699</td>\n <td>4043.323975</td>\n <td>4178.168457</td>\n <td>4170.736328</td>\n <td>4165.893555</td>\n <td>4211.450195</td>\n 
<td>4167.840820</td>\n <td>4290.875488</td>\n <td>4482.294922</td>\n <td>4344.405762</td>\n </tr>\n </tbody>\n</table>\n<p>705 rows × 2022 columns</p>\n</div>"
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the combined experiments table built by data_to_single_df in the previous cell.\n",
"sliced_experiments"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-12T10:19:17.025682700Z",
"start_time": "2024-03-12T10:19:16.960790600Z"
}
},
"id": "7efb37d498e3048a",
"execution_count": 3
},
{
"cell_type": "code",
"outputs": [],
"source": [
"# Split experiments and metadata together so their rows stay paired.\n",
"# 25% of the rows go to the test set, the seed is fixed for a repeatable split,\n",
"# and the split is stratified on the \"strain\" metadata column.\n",
"(\n",
"    sliced_experiments_train,\n",
"    sliced_experiments_test,\n",
"    df_metadata_train,\n",
"    df_metadata_test,\n",
") = train_test_split(\n",
"    sliced_experiments,\n",
"    df_metadata,\n",
"    test_size=0.25,\n",
"    random_state=42,\n",
"    stratify=df_metadata[\"strain\"],\n",
")"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-12T10:19:26.528373600Z",
"start_time": "2024-03-12T10:19:26.499993100Z"
}
},
"id": "1a9d17e12743f2bc",
"execution_count": 4
},
{
"cell_type": "code",
"outputs": [],
"source": [
"# Write each split into its own folder (relative to the working directory).\n",
"# DataFrame.to_csv does not create missing directories, so make sure the\n",
"# target folder exists before writing into it.\n",
"split_outputs = {\n",
"    \"train\": (sliced_experiments_train, df_metadata_train),\n",
"    \"test\": (sliced_experiments_test, df_metadata_test),\n",
"}\n",
"for split_dir, (experiments_split, metadata_split) in split_outputs.items():\n",
"    os.makedirs(split_dir, exist_ok=True)\n",
"    # index=False: the row index is not written to the CSV files.\n",
"    experiments_split.to_csv(os.path.join(split_dir, \"experiments.csv\"), index=False)\n",
"    metadata_split.to_csv(os.path.join(split_dir, \"metadata.csv\"), index=False)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2024-03-12T10:19:28.144011700Z",
"start_time": "2024-03-12T10:19:27.344164Z"
}
},
"id": "824046363b5a355b",
"execution_count": 5
},
{
"cell_type": "code",
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
},
"id": "3fdc9d0125e836c5"
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}