KPLabs committed
Commit ed46d2f · verified · 1 Parent(s): bf2b411

Create train.py

Files changed (1): train.py +210 -0
train.py ADDED
@@ -0,0 +1,210 @@
from typing import List, Tuple
import json
import os

import fire
import joblib
import numpy as np
import pandas as pd
import pywt
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.utils import shuffle
from tqdm import tqdm

from utils import calculate_metrics, get_classes, CLASSES


TRAIN_SIZE = 1732
TEST_SIZE = 1154
TRAIN_DIR = "train_data_simulated/"
TEST_DIR = "test_data_simulated/"


def load_data() -> Tuple[List, pd.DataFrame, List, pd.DataFrame]:
    """Collect the .npz sample paths and ground-truth tables for both splits."""
    X_train = [os.path.join(TRAIN_DIR, f"{i}.npz") for i in range(TRAIN_SIZE)]
    X_test = [os.path.join(TEST_DIR, f"{i}.npz") for i in range(TEST_SIZE)]
    y_train = pd.read_csv("train_gt.csv")
    y_test = pd.read_csv("test_gt.csv")
    return X_train, y_train, X_test, y_test


class SpectralCurveFiltering:
    """Collapse the spatial dimensions of a (bands, H, W) cube into a single
    spectral curve of shape (bands,)."""

    def __init__(self, merge_function=np.mean):
        self.merge_function = merge_function

    def __call__(self, sample: np.ndarray) -> np.ndarray:
        return self.merge_function(sample, axis=(1, 2))


class BaselineRegressor:
    """Predict the per-target training mean; its MSE normalises the final score."""

    def __init__(self):
        self.mean = 0

    def fit(self, X_train: np.ndarray, y_train: np.ndarray):
        self.mean = np.mean(y_train, axis=0)
        self.classes_count = y_train.shape[1]
        return self

    def predict(self, X_test: np.ndarray) -> np.ndarray:
        # np.full broadcasts the (classes_count,) mean vector across all rows.
        return np.full((len(X_test), self.classes_count), self.mean)


def preprocess(samples_lst: List[str], features: List[str]) -> Tuple:
    def _shape_pad(data: np.ndarray) -> np.ndarray:
        """
        Pad every band to a square spatial shape. Not mandatory, but it
        eliminates the risk of numerical errors in the singular value
        decomposition. Wrap padding also improves performance slightly.
        """
        max_edge = np.max(data.shape[1:])
        shape = (max_edge, max_edge)
        padded = np.pad(
            data,
            ((0, 0), (0, (shape[0] - data.shape[1])), (0, (shape[1] - data.shape[2]))),
            "wrap",
        )
        return padded
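    # Example (hypothetical shapes): "wrap" padding reuses values from the
    # opposite edge, so e.g. a (150, 11, 7) cube becomes (150, 11, 11);
    # square slices keep np.linalg.svd below well-behaved across samples.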

    filtering = SpectralCurveFiltering()
    w1 = pywt.Wavelet("sym3")
    w2 = pywt.Wavelet("dmey")

    all_feature_names = []

    for sample_index, sample_path in tqdm(
        enumerate(samples_lst), total=len(samples_lst)
    ):
        with np.load(sample_path) as npz:
            data = np.ma.MaskedArray(**npz)
        data = _shape_pad(data)
        # Get the spatial features:
        s = np.linalg.svd(data, full_matrices=False, compute_uv=False)
        s0 = s[:, 0]
        s1 = s[:, 1]
        s2 = s[:, 2]
        s3 = s[:, 3]
        s4 = s[:, 4]
        dXds1 = s0 / (s1 + np.finfo(float).eps)
        ffts = np.fft.fft(s0)
        reals = np.real(ffts)
        imags = np.imag(ffts)
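        # Note: the five leading singular values summarise each band's spatial
        # structure, the eps-guarded s0/s1 ratio measures how close a band is
        # to rank one, and the FFT of s0 captures how the leading component
        # varies along the spectral axis.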

        # Get the specific spectral features:
        data = filtering(data)

        cA0, cD0 = pywt.dwt(data, wavelet=w2, mode="constant")
        cAx, cDx = pywt.dwt(cA0[12:92], wavelet=w2, mode="constant")
        cAy, cDy = pywt.dwt(cAx[15:55], wavelet=w2, mode="constant")
        cAz, cDz = pywt.dwt(cAy[15:35], wavelet=w2, mode="constant")
        cAw2 = np.concatenate((cA0[12:92], cAx[15:55], cAy[15:35], cAz[15:25]), -1)
        cDw2 = np.concatenate((cD0[12:92], cDx[15:55], cDy[15:35], cDz[15:25]), -1)

        cA0, cD0 = pywt.dwt(data, wavelet=w1, mode="constant")
        cAx, cDx = pywt.dwt(cA0[1:-1], wavelet=w1, mode="constant")
        cAy, cDy = pywt.dwt(cAx[1:-1], wavelet=w1, mode="constant")
        cAz, cDz = pywt.dwt(cAy[1:-1], wavelet=w1, mode="constant")
        cAw1 = np.concatenate((cA0, cAx, cAy, cAz), -1)
        cDw1 = np.concatenate((cD0, cDx, cDy, cDz), -1)
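        # Note: each cascade is a manual multilevel DWT, close in spirit to
        # pywt.wavedec(data, wavelet, mode="constant", level=4), except that
        # both approximation and detail coefficients are kept at every level,
        # with the dmey branch trimmed to hand-picked index windows. Only the
        # approximation stacks (cAw1, cAw2) are selected as "dwt" features
        # below; the detail stacks cDw1/cDw2 are computed but unused.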

        dXdl = np.gradient(data, axis=0)
        d2Xdl2 = np.gradient(dXdl, axis=0)
        d3Xdl3 = np.gradient(d2Xdl2, axis=0)

        fft = np.fft.fft(data)
        real = np.real(fft)
        imag = np.imag(fft)

        features_to_select = {
            "spatial": (dXds1, s0, s1, s2, s3, s4, reals, imags),
            "fft": (real, imag),
            "gradient": (dXdl, d2Xdl2, d3Xdl3),
            "mean": (data,),
            "dwt": (cAw1, cAw2),
        }

        # The best feature combination for Random-Forest-based regression:
        sample_features = []
        sample_feature_names = []
        for feature_name in features:
            sample_features.extend(features_to_select[feature_name])
            sample_feature_names.extend(
                [feature_name]
                * len(np.concatenate(features_to_select[feature_name]))
            )

        sample_features = np.concatenate(sample_features, -1)
        # Replace each path in-place with its extracted feature vector:
        samples_lst[sample_index] = sample_features
        all_feature_names.append(sample_feature_names)

    return np.vstack(samples_lst), all_feature_names


def runner(features: List[str] = "spatial,fft,dwt,gradient,mean".split(",")):
    X_train, y_train, X_test, y_test = load_data()

    X_train, train_feature_names = preprocess(X_train, features)
    X_test, test_feature_names = preprocess(X_test, features)

    X_train, y_train = shuffle(X_train, y_train, random_state=2023)

    model = RandomForestRegressor(random_state=2023)
    print(f"Training model on features of shape {X_train.shape}...")
    model = model.fit(X_train, y_train[CLASSES].values)

    joblib.dump(model, f"RF_model_{'-'.join(features)}.joblib")

    submission_df = pd.DataFrame(data=model.predict(X_test), columns=CLASSES)
    submission_df.to_csv(",".join(features) + ".csv", index_label="sample_index")

    baseline_reg = BaselineRegressor()
    baseline_reg = baseline_reg.fit(X_train, y_train[CLASSES].values)
    baselines_mse = np.mean(
        (y_test[CLASSES].values - baseline_reg.predict(X_test)) ** 2, axis=0
    )

    mse = np.mean((y_test[CLASSES].values - submission_df[CLASSES].values) ** 2, axis=0)
    scores = mse / baselines_mse
    final_score = np.mean(scores)
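    # Challenge-style score: per-target model MSE divided by the baseline
    # (mean-predictor) MSE, averaged over targets; below 1.0 beats the
    # baseline. E.g. scores == [0.8, 0.9, 1.0, 0.7] -> final_score == 0.85.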

    r2 = metrics.r2_score(
        y_true=y_test[CLASSES].values,
        y_pred=submission_df[CLASSES].values,
        multioutput="raw_values",
    )
    mse = metrics.mean_squared_error(
        y_true=y_test[CLASSES].values,
        y_pred=submission_df[CLASSES].values,
        multioutput="raw_values",
    )
    mae = metrics.mean_absolute_error(
        y_true=y_test[CLASSES].values,
        y_pred=submission_df[CLASSES].values,
        multioutput="raw_values",
    )
    all_metrics = calculate_metrics(
        y_pred=get_classes(submission_df[CLASSES]),
        y_true=get_classes(y_test[CLASSES]),
    )
    mse = {k + "_mse": v for k, v in zip(["P", "K", "Mg", "pH"], mse.tolist())}
    r2 = {k + "_r2": v for k, v in zip(["P", "K", "Mg", "pH"], r2.tolist())}
    mae = {k + "_mae": v for k, v in zip(["P", "K", "Mg", "pH"], mae.tolist())}

    all_metrics["custom"] = final_score
    all_metrics = pd.DataFrame.from_dict({**all_metrics, **r2, **mse, **mae})
    all_metrics.to_csv("all_metrics.csv", index=False)

    with open("all_metrics.json", "w", encoding="utf-8") as f:
        json.dump(all_metrics.to_dict(), f, ensure_ascii=True, indent=4)

    print(f"Custom score: {final_score}")
    return final_score


if __name__ == "__main__":
    fire.Fire(runner)
    # Sanity check: reload the model saved with the default feature set and
    # verify its type against the public RandomForestRegressor class imported
    # above (rather than the private sklearn.ensemble._forest path).
    model = joblib.load(
        f"RF_model_{'-'.join('spatial,fft,dwt,gradient,mean'.split(','))}.joblib"
    )
    assert isinstance(model, RandomForestRegressor)
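# Usage sketch (hypothetical invocations; fire maps CLI arguments onto
# runner's parameters):
#   python train.py                             # default feature set
#   python train.py --features='[spatial,fft]'  # fire parses the bracket list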