Better random forests
.gitignore (vendored) | 2
@@ -1,5 +1,7 @@
 data_very_raw/
 zzz_raman_spectroscopy-main/
+presentations/
+to_ignore/
 
 # ---> JupyterNotebooks
 # gitignore template for Jupyter Notebooks
classifiers/__init__.py (new file) | 1
@@ -0,0 +1 @@
+from classifiers.evaluation import *
classifiers/evaluation.py (new file) | 35
@@ -0,0 +1,35 @@
+from data.data_processing import process_train_test
+from sklearn.model_selection import cross_validate, ParameterGrid
+
+
+def crossvalidate_params(classifier, params, experiments_train, metadata_train, y_train, cv=5):
+    process_params = {key: params[key] for key in ['baseline_lam', 'baseline_p', 'smooth_window_length', 'smooth_polyorder']}
+    classifier_params = {key: params[key] for key in params.keys() if key not in ['baseline_lam', 'baseline_p', 'smooth_window_length', 'smooth_polyorder']}
+    X_train, _ = process_train_test(process_params, experiments_train, metadata_train, scale=True)
+    clf = classifier(**classifier_params)
+    return cross_validate(clf, X_train, y_train.to_numpy().ravel(), cv=cv, return_estimator=True)
+
+
+def param_grid_search(classifier, param_grid, experiments_train, metadata_train, y_train, cv=5):
+    results = []
+    for params in ParameterGrid(param_grid):
+        try:
+            results.append([params, crossvalidate_params(classifier, params, experiments_train, metadata_train, y_train, cv=cv)])
+            print(results[-1])
+        except Exception as e:
+            pass  # print(params, e)
+    return results
+
+
+def evaluate_classifier_params(classifier, params, X_train, y_train, X_test, y_test, iters=10):
+    train_score_mean = 0
+    test_score_mean = 0
+    for i in range(iters):
+        clf = classifier(**params)
+        clf.fit(X_train, y_train.to_numpy().ravel())
+        train_score_mean += clf.score(X_train, y_train.to_numpy().ravel())
+        test_score_mean += clf.score(X_test, y_test.to_numpy().ravel())
+    return train_score_mean / iters, test_score_mean / iters
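A rough usage sketch (not part of the commit) of how param_grid_search can drive a random-forest run, which is what the commit title suggests. The grid values below are illustrative assumptions; the four preprocessing keys must be present in every candidate because crossvalidate_params indexes them directly.

# Illustrative sketch, not part of the commit. Assumes experiments_train,
# metadata_train and y_train were produced by the project's data-loading step.
from sklearn.ensemble import RandomForestClassifier
from classifiers.evaluation import param_grid_search

param_grid = {
    # preprocessing keys consumed by process_train_test (all four are required)
    'baseline_lam': [10, 100],
    'baseline_p': [1e-2],
    'smooth_window_length': [7, 11],
    'smooth_polyorder': [3],
    # everything else is forwarded to the classifier constructor
    'n_estimators': [100, 300],
    'max_depth': [None, 10],
}

results = param_grid_search(RandomForestClassifier, param_grid,
                            experiments_train, metadata_train, y_train, cv=5)
# each entry pairs a parameter dict with its cross_validate output
best_params, best_cv = max(results, key=lambda r: r[1]['test_score'].mean())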
File diff suppressed because one or more lines are too long
@@ -57,5 +57,5 @@ def load_raw_data() -> tuple[pd.DataFrame, list[pd.DataFrame]]:
 
 def load_data(name: str, path: os.path = os.path.join("data")) -> tuple[pd.DataFrame, pd.DataFrame]:
     metadata = pd.read_csv(os.path.join(path, name, "metadata.csv"))
-    experiments = pd.read_csv(os.path.join(path, name, "experiments.csv"))
+    experiments = pd.read_csv(os.path.join(path, name, "experiments.csv"), dtype=float)
     return metadata, experiments
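Background note (not part of the commit): passing dtype=float makes read_csv fail immediately on a malformed spectral value instead of silently returning an object-dtype column. A minimal sketch with made-up data:

# Illustrative only -- not part of the commit.
import io
import pandas as pd

text = "a,b\n1.0,2.0\n3.0,oops\n"

df = pd.read_csv(io.StringIO(text))       # column 'b' silently loads as object dtype
print(df.dtypes)

try:
    pd.read_csv(io.StringIO(text), dtype=float)
except ValueError as err:                 # fails fast on the non-numeric value
    print(err)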
File diff suppressed because one or more lines are too long
@@ -80,9 +80,26 @@ def categorize_metadata(metadata: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFr
     return truth, encoded
 
 
-def process_experiments(experiments: pd.DataFrame, baseline_lam=10, baseline_p=1e-2,
-                        smooth_window_length=7, smooth_polyorder=3) -> pd.DataFrame:
+def process_experiments(experiments: pd.DataFrame, baseline_lam: int = 10, baseline_p: float = 1e-2,
+                        smooth_window_length: int = 7, smooth_polyorder: int = 3) -> pd.DataFrame:
     experiments = adjust_all_baselines(experiments, lam=baseline_lam, p=baseline_p)
     experiments = scale_experiments(experiments)
     experiments = smooth_experiments(experiments, window_length=smooth_window_length, polyorder=smooth_polyorder)
     return experiments
+
+
+def process_train_test(params: dict, experiments_train: pd.DataFrame, metadata_train: pd.DataFrame, experiments_test: pd.DataFrame = None, metadata_test: pd.DataFrame = None, scale: bool=True) -> tuple[pd.DataFrame, pd.DataFrame]:
+    processed_train = process_experiments(experiments_train, **params)
+    X_train = pd.concat([metadata_train, processed_train], axis=1)
+    if experiments_test is not None:
+        processed_test = process_experiments(experiments_test, **params)
+        X_test = pd.concat([metadata_test, processed_test], axis=1)
+    else:
+        X_test = None
+    if scale:
+        scaler = StandardScaler()
+        scaler.fit(X_train)
+        X_train = scaler.transform(X_train)
+        if X_test is not None:
+            X_test = scaler.transform(X_test)
+    return X_train, X_test
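For orientation (not part of the commit), a minimal sketch of calling the new process_train_test; the parameter values and the train/test frames are illustrative assumptions taken from the function's own defaults and signature.

# Illustrative only -- not part of the commit.
from data.data_processing import process_train_test

params = {'baseline_lam': 10, 'baseline_p': 1e-2,
          'smooth_window_length': 7, 'smooth_polyorder': 3}

# The StandardScaler is fitted on the training matrix only and then applied
# to the test matrix, so no test-set statistics leak into training.
X_train, X_test = process_train_test(params, experiments_train, metadata_train,
                                     experiments_test, metadata_test, scale=True)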