| import numpy as np |
| import util |
|
|
| |
# Placeholder character in output-path templates (e.g. 'posonly_X_pred.txt');
# main_posonly() substitutes 'true'/'naive'/'adjusted' for it.
WILDCARD = 'X'
|
|
def main_LogReg(train_path, valid_path, save_path):
    """Problem (1b): Logistic regression with Newton's Method.

    Args:
        train_path: Path to CSV file containing dataset for training.
        valid_path: Path to CSV file containing dataset for validation.
        save_path: Path to save predicted probabilities using np.savetxt().
    """
    # Fit a logistic-regression classifier on the training split.
    x_train, y_train = util.load_dataset(train_path, add_intercept=True)
    model = LogisticRegression()
    model.fit(x_train, y_train)

    # Plot the learned decision boundary on the validation split.
    x_eval, y_eval = util.load_dataset(valid_path, add_intercept=True)
    util.plot(x_eval, y_eval, model.theta, save_path.replace('.txt', '.png'))

    # Report accuracy at the 0.5 threshold and persist the probabilities.
    probs = model.predict(x_eval)
    yhat = probs > 0.5
    print('LR Accuracy: %.2f' % np.mean( (yhat == 1) == (y_eval == 1)))
    np.savetxt(save_path, probs)
|
|
class LogisticRegression:
    """Logistic regression with Newton's Method as the solver.

    Example usage:
        > clf = LogisticRegression()
        > clf.fit(x_train, y_train)
        > clf.predict(x_eval)
    """
    def __init__(self, step_size=0.01, max_iter=1000000, eps=1e-5,
                 theta_0=None, verbose=True):
        """
        Args:
            step_size: Step size for iterative solvers only.
            max_iter: Maximum number of iterations for the solver.
            eps: Threshold for determining convergence.
            theta_0: Initial guess for theta. If None, use the zero vector.
            verbose: Print loss values during training.
        """
        self.theta = theta_0
        self.step_size = step_size
        self.max_iter = max_iter
        self.eps = eps
        self.verbose = verbose

    def sigmoid(self, x):
        """Return h_theta(x) = 1 / (1 + exp(-x @ theta)).

        Args:
            x: Inputs of shape (n_examples, dim), intercept column included.

        Returns:
            Predicted probabilities, shape (n_examples,).
        """
        return 1 / (1 + np.exp(- x @ self.theta))

    def gradient(self, x, y):
        """Average gradient of J(theta): (1/n) * x^T (h(x) - y).

        Args:
            x: Inputs of shape (n_examples, dim).
            y: Labels in {0, 1}, shape (n_examples,).

        Returns:
            Gradient vector of shape (dim,).
        """
        n_examples, dim = x.shape
        probs = self.sigmoid(x)
        return 1 / n_examples * x.T @ (probs - y)

    def hessian(self, x, y):
        """Average Hessian of J(theta): (1/n) * x^T S x, S = diag(h(1-h)).

        Args:
            x: Inputs of shape (n_examples, dim).
            y: Labels, shape (n_examples,). Unused (the Hessian does not
               depend on y); kept for a signature parallel to gradient().

        Returns:
            Hessian matrix of shape (dim, dim).
        """
        n_examples, dim = x.shape
        probs = self.sigmoid(x)
        weights = np.diag(probs * (1 - probs))
        return 1 / n_examples * x.T @ weights @ x

    def loss(self, x, y):
        """Average negative log-likelihood (binary cross-entropy).

        Bug fix: the second term previously used (1 + y); the Bernoulli
        log-likelihood requires (1 - y).

        Args:
            x: Inputs of shape (n_examples, dim).
            y: Labels in {0, 1}, shape (n_examples,).

        Returns:
            Scalar loss value.
        """
        probs = self.sigmoid(x)
        return -np.mean(y * np.log(probs) + (1 - y) * np.log(1 - probs))

    def fit(self, x, y):
        """Run Newton's Method to minimize J(theta) for logistic regression.

        Iterates theta <- theta - step_size * H^{-1} g until the L1 change
        in theta falls below eps, or max_iter iterations elapse.

        Args:
            x: Training example inputs. Shape (n_examples, dim).
            y: Training example labels. Shape (n_examples,).
        """
        n_examples, dim = x.shape
        if self.theta is None:
            self.theta = np.zeros(dim)

        for i in range(self.max_iter):
            gradient = self.gradient(x, y)
            hessian = self.hessian(x, y)

            theta_prev = np.copy(self.theta)
            # Damped Newton step (step_size=1 would be the classic update).
            # Rebinding (rather than -=) avoids mutating a caller-supplied
            # theta_0 array in place.
            self.theta = self.theta - self.step_size * np.linalg.inv(hessian).dot(gradient)

            # Converged once the L1 change in theta is below eps.
            if np.sum(np.abs(theta_prev - self.theta)) < self.eps:
                break

    def predict(self, x):
        """Return predicted probabilities given new inputs x.

        Args:
            x: Inputs of shape (n_examples, dim).

        Returns:
            Outputs of shape (n_examples,).
        """
        return self.sigmoid(x)
| |
|
|
def main_GDA(train_path, valid_path, save_path):
    """Problem (1e): Gaussian discriminant analysis (GDA)

    Args:
        train_path: Path to CSV file containing dataset for training.
        valid_path: Path to CSV file containing dataset for validation.
        save_path: Path to save predicted probabilities using np.savetxt().
    """
    # GDA estimates class means/covariance directly -- no intercept column.
    x_train, y_train = util.load_dataset(train_path, add_intercept=False)
    model = GDA()
    model.fit(x_train, y_train)

    # Plot the decision boundary on the validation split.
    x_eval, y_eval = util.load_dataset(valid_path, add_intercept=False)
    util.plot(x_eval, y_eval, model.theta, save_path.replace('.txt', '.png'))

    # theta includes an intercept term, so add the intercept column
    # before computing probabilities.
    x_eval = util.add_intercept(x_eval)
    probs = model.predict(x_eval)
    yhat = probs > 0.5
    print('GDA Accuracy: %.2f' % np.mean( (yhat == 1) == (y_eval == 1)))
    np.savetxt(save_path, probs)
|
|
class GDA:
    """Gaussian Discriminant Analysis.

    Example usage:
        > clf = GDA()
        > clf.fit(x_train, y_train)
        > clf.predict(x_eval)
    """
    def __init__(self, step_size=0.01, max_iter=10000, eps=1e-5,
                 theta_0=None, verbose=True):
        """
        Args:
            step_size: Step size for iterative solvers only.
            max_iter: Maximum number of iterations for the solver.
            eps: Threshold for determining convergence.
            theta_0: Initial guess for theta. If None, use the zero vector.
            verbose: Print loss values during training.
        """
        self.theta = theta_0
        self.step_size = step_size
        self.max_iter = max_iter
        self.eps = eps
        self.verbose = verbose

    def sigmoid(self, x):
        """Posterior p(y=1 | x) under the learned linear boundary.

        Args:
            x: Inputs of shape (n_examples, dim + 1), intercept included.

        Returns:
            Probabilities of shape (n_examples,).
        """
        return 1 / (1 + np.exp(- x @ self.theta))

    def fit(self, x, y):
        """Fit a GDA model to training set given by x and y by updating
        self.theta.

        Closed-form MLE: class prior phi, per-class means mu_0/mu_1, shared
        covariance sigma; then map to the equivalent logistic parameters
        theta[1:] = sigma^{-1} (mu_1 - mu_0) and
        theta[0]  = (mu_0' sigma^{-1} mu_0 - mu_1' sigma^{-1} mu_1) / 2
                    - log((1 - phi) / phi).

        Args:
            x: Training example inputs. Shape (n_examples, dim).
            y: Training example labels. Shape (n_examples,).
        """
        m, d = x.shape
        neg = (y == 0)
        pos = (y == 1)

        # Maximum-likelihood estimates of the Gaussian parameters.
        phi = np.sum(pos) / m
        mu_0 = neg.dot(x) / np.sum(neg)
        mu_1 = pos.dot(x) / np.sum(pos)
        centered = x - np.where(neg[:, None], mu_0[None, :], mu_1[None, :])
        sigma = centered.T.dot(centered) / m

        # Convert to logistic-regression form (intercept in theta[0]).
        precision = np.linalg.inv(sigma)
        theta = np.zeros(d + 1)
        theta[1:] = -precision.dot(mu_0 - mu_1)
        quad_gap = mu_0.T.dot(precision).dot(mu_0) - mu_1.T.dot(precision).dot(mu_1)
        theta[0] = 1 / 2 * quad_gap - np.log((1 - phi) / phi)
        self.theta = theta

    def predict(self, x):
        """Make a prediction given new inputs x.

        Args:
            x: Inputs of shape (n_examples, dim).

        Returns:
            Outputs of shape (n_examples,).
        """
        return self.sigmoid(x)
| |
|
|
def main_posonly(train_path, valid_path, test_path, save_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Run under the following conditions:
        1. on t-labels,
        2. on y-labels,
        3. on y-labels with correction factor alpha.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        save_path: Path to save predictions.

    NOTE: You need to complete logreg implementation first (see class above)!!!
    """
    # Expand the wildcard into one prediction path per experiment.
    output_path_true = save_path.replace(WILDCARD, 'true')
    output_path_naive = save_path.replace(WILDCARD, 'naive')
    output_path_adjusted = save_path.replace(WILDCARD, 'adjusted')

    # Matching plot paths.
    plot_path = save_path.replace('.txt', '.png')
    plot_path_true = plot_path.replace(WILDCARD, 'true')
    plot_path_naive = plot_path.replace(WILDCARD, 'naive')
    plot_path_adjusted = plot_path.replace(WILDCARD, 'adjusted')

    # (2a) Train/evaluate on the fully observed t-labels.
    fully_observed_predictions(train_path, test_path, output_path_true, plot_path_true)

    # (2b) Train on the partial y-labels, evaluate against t-labels.
    naive_predictions, clf = naive_partial_labels_predictions(train_path, test_path, output_path_naive, plot_path_naive)

    # (2f) Estimate alpha on the validation set and correct the naive predictions.
    find_alpha_and_plot_correction(clf, valid_path, test_path, output_path_adjusted, plot_path_adjusted, naive_predictions)
|
|
def fully_observed_predictions(train_path, test_path, output_path_true, plot_path_true):
    """
    Problem (2a): Fully Observable Binary Classification Helper Function

    Args:
        train_path: Path to CSV file containing dataset for training.
        test_path: Path to CSV file containing dataset for testing.
        output_path_true: Path to save observed predictions
        plot_path_true: Path to save the plot using plot_posonly util function
    Return:
        full_predictions: tensor of predictions returned from applied LogReg classifier prediction
    """
    # Train on the true t-labels (the fully observed setting).
    x_train, t_train = util.load_dataset(train_path, label_col='t',
                                         add_intercept=True)
    model = LogisticRegression()
    model.fit(x_train, t_train)

    # Score the test split, persist the probabilities, and plot the boundary.
    x_test, t_test = util.load_dataset(test_path, label_col='t',
                                       add_intercept=True)
    full_predictions = model.predict(x_test)
    np.savetxt(output_path_true, full_predictions)
    util.plot(x_test, t_test, model.theta, plot_path_true)
    return full_predictions
|
|
def naive_partial_labels_predictions(train_path, test_path, output_path_naive, plot_path_naive):
    """
    Problem (2b): Naive Partial Labels Binary Classification Helper Function

    Args:
        train_path: Path to CSV file containing dataset for training.
        test_path: Path to CSV file containing dataset for testing.
        output_path_naive: Path to save observed predictions
        plot_path_naive: Path to save the plot using plot_posonly util function
    Return:
        naive_predictions: tensor of predictions returned from applied LogReg prediction
        clf: Logistic Regression classifier (will be reused for 2f)
    """
    # Train naively on the partial y-labels (positives only observed).
    x_train, y_train = util.load_dataset(train_path, label_col='y',
                                         add_intercept=True)
    model = LogisticRegression()
    model.fit(x_train, y_train)

    # Evaluate against the true t-labels on the test split.
    x_test, t_test = util.load_dataset(test_path, label_col='t',
                                       add_intercept=True)
    naive_predictions = model.predict(x_test)
    np.savetxt(output_path_naive, naive_predictions)
    util.plot(x_test, t_test, model.theta, plot_path_naive)
    return naive_predictions, model
|
|
def find_alpha_and_plot_correction(clf, valid_path, test_path, output_path_adjusted, plot_path_adjusted, naive_predictions):
    """
    Problem (2f): Alpha Correction Binary Classification Helper Function

    Args:
        clf: Logistic regression classifier from part 2b
        valid_path: Path to CSV file containing dataset for validation.
        test_path: Path to CSV file containing dataset for testing.
        output_path_adjusted: Path to save observed predictions
        plot_path_adjusted: Path to save the plot using plot_posonly util function
        naive_predictions: tensor of predictions returned from applied LogReg prediction from 2b
    Return:
        alpha: corrected alpha value
    """
    # alpha = E[h(x) | y = 1], estimated over the labeled-positive
    # validation examples.
    x_valid, y_valid = util.load_dataset(valid_path, label_col='y')
    labeled_positives = util.add_intercept(x_valid[y_valid == 1, :])
    alpha = np.mean(clf.predict(labeled_positives))
    print('Found alpha = {}'.format(alpha))

    # Rescale the naive probabilities by 1/alpha and plot the corrected
    # decision boundary on the test split.
    x_test, t_test = util.load_dataset(test_path, label_col='t',
                                       add_intercept=True)
    np.savetxt(output_path_adjusted, naive_predictions / alpha)
    util.plot(x_test, t_test, clf.theta, plot_path_adjusted, correction=alpha)
    return alpha
|
|
if __name__ == '__main__':
    # Problem 1: Linear Classifiers -- logistic regression, then GDA,
    # on both datasets.
    for train, valid, out in [('ds1_train.csv', 'ds1_valid.csv', 'logreg_pred_1.txt'),
                              ('ds2_train.csv', 'ds2_valid.csv', 'logreg_pred_2.txt')]:
        main_LogReg(train_path=train, valid_path=valid, save_path=out)

    for train, valid, out in [('ds1_train.csv', 'ds1_valid.csv', 'gda_pred_1.txt'),
                              ('ds2_train.csv', 'ds2_valid.csv', 'gda_pred_2.txt')]:
        main_GDA(train_path=train, valid_path=valid, save_path=out)

    # Problem 2: Incomplete, Positive-Only Labels.
    main_posonly(train_path='train.csv',
                 valid_path='valid.csv',
                 test_path='test.csv',
                 save_path='posonly_X_pred.txt')
|