Source code for cRedditscore.evaluation

# -*- coding: utf-8 -*-
'''
A module of tools to evaluate predictive models.
'''
import numpy as np
import pandas as pd
from sklearn import cross_validation
from sklearn import metrics as skmetrics


class EvalError(Exception):
    '''
    Error raised when a model cannot be evaluated, for example because
    it is missing a predict function.

    :param msg: description of the problem; kept on the ``msg``
        attribute and used as the string form of the exception
    '''

    def __init__(self, msg):
        # Initialise the base class too, so ``args``, ``repr`` and
        # pickling behave like any other exception.
        super(EvalError, self).__init__(msg)
        self.msg = msg

    def __str__(self):
        # ``__str__`` must return a string; coerce so that a non-string
        # message does not raise TypeError while being reported.
        return str(self.msg)
class GenModel(object):
    '''
    Minimal stand-in for a (binary classification) model, intended
    mainly for tests and worked examples.

    The three callables are stored unchanged as attributes of the same
    name; any that is not supplied is left as ``None``.

    :param function fit: the fit function of the model
    :param function predict: the prediction function of the model
    :param function predict_proba: the probability prediction function
        of the model, computes the probability that an observation
        belongs to the first class
    '''

    def __init__(self, fit=None, predict=None, predict_proba=None):
        self.fit, self.predict, self.predict_proba = (
            fit, predict, predict_proba
        )
class Evaluate(object):
    '''
    Evaluate a predictive binary classification model on a choice of
    metrics including accuracy, AUC, precision and recall.

    The model is assumed to have functions

    * *fit* (for cross validation): takes as input the training set
      features and responses and fits the model to the training set
    * *predict* (for accuracy, precision, recall and cross validation):
      takes as input a list of observations and outputs a list of
      predictions
    * *predict_proba* (for AUC and drawing the ROC curve): takes as
      input a list of observations and outputs a list of probabilities
      that the response belongs to the first class

    For example, we make a dummy test set and model:

    >>> model = GenModel(predict = lambda x : ['blue' for i in x])
    >>> test_features = range(20)
    >>> test_responses = np.array(
    ...     ['blue' if i%2==0 else 'green' for i in range(20)]
    ... )

    and make an `Evaluate` object to test its accuracy.

    >>> evaluator = Evaluate(model)
    >>> evaluator.accuracy(test_features=test_features,
    ...                    test_responses=test_responses)
    0.5

    :param model: the model to evaluate, described above
    :param array-like data_features: the features of the data to
        evaluate the model on, generally the training set features
    :param array-like data_responses: the responses of the data to
        evaluate the model on
    :param pos_label: the class to be considered positive for auc,
        precision and recall; if None and `data_responses` is given,
        the response of the first observation is used
    :raises EvalError: if `data_responses` holds more than two
        distinct values
    '''

    def __init__(self, model=None, data_features=None,
                 data_responses=None, pos_label=None):
        self.model = model
        self.data_features = data_features
        self.data_responses = data_responses
        # Always define these so later methods never hit an
        # AttributeError when no responses were supplied.
        self.classes = []
        self.pos_label = pos_label
        if data_responses is not None:
            self.classes = list(set(data_responses))
            if len(self.classes) > 2:
                raise EvalError('More than two classes!')
            if pos_label is None and len(data_responses) > 0:
                # Prefer positional access (`.iloc`) when available so
                # a pandas Series with a non-default index still
                # yields its first element; plain sequences and numpy
                # arrays fall back to ordinary indexing.
                positional = getattr(data_responses, 'iloc',
                                     data_responses)
                self.pos_label = positional[0]

    def cv_split(self, k=10):
        '''
        Make k folds of the data using stratified cross validation.

        Stores the number of folds on `self.k` and the fold generator
        on `self.cv`; both are read by `compute_scores` and
        `build_scores_df`.

        :param int k: The number of folds to divide the data into
        '''
        self.k = k
        # NOTE(review): `sklearn.cross_validation` is the legacy module;
        # newer scikit-learn releases expose StratifiedKFold under
        # `sklearn.model_selection` with a different call signature.
        # Confirm the pinned scikit-learn version before upgrading.
        self.cv = cross_validation.StratifiedKFold(
            self.data_responses, n_folds=k
        )

    def compute_scores(
        self,
        test_features=None,
        test_responses=None,
        metrics=None
    ):
        '''
        Compute the metric scores for the model on the cross-validation
        folds of the training data, storing the results in a dataframe.

        `cv_split` must have been called first: this method iterates
        over `self.cv` and refits the model on every fold.

        Adds the class attributes

        * *scores*, the results of the computations as a
          `pandas.core.frame.DataFrame` object
        * *roc*, a list of ``[fpr, tpr, thresholds]`` entries, one per
          fold (only when 'auc' is requested; entries accumulate over
          repeated calls)

        :param array-like test_features: The features of the test data.
            Currently not read by this method; scores are always
            computed on the cv folds.
        :param array-like test_responses: The responses of the test
            data. Currently not read by this method.
        :param metrics: a metric name or a list/tuple/set of metric
            names among 'accuracy', 'auc', 'precision', 'recall' and
            'f1'; if None, all five are computed
        '''
        if metrics is None:
            metrics = ['accuracy', 'auc', 'precision', 'recall', 'f1']
        elif isinstance(metrics, (list, tuple, set, frozenset)):
            metrics = list(metrics)
        else:
            # a single metric was passed on its own: wrap it in a list
            metrics = [metrics]

        # build the scores dataframe
        self.scores = self.build_scores_df()

        # class predictions are needed unless auc is the only metric
        needs_predictions = not metrics == ['auc']
        per_class_metrics = [
            ('precision', skmetrics.precision_score),
            ('recall', skmetrics.recall_score),
            ('f1', skmetrics.f1_score),
        ]

        # NOTE(review): the writes into `self.scores` below (and in
        # `_score_per_class`) use chained indexing, which only updates
        # the dataframe when pandas hands back a view rather than a
        # copy. Confirm against the pandas version in use.
        for fold, (train, test) in enumerate(self.cv):
            # fit the model on this fold's training part
            self.model.fit(
                self.data_features[train],
                self.data_responses[train]
            )
            fold_responses = self.data_responses[test]

            if needs_predictions:
                predictions = self.model.predict(self.data_features[test])
            if 'auc' in metrics:
                prob_preds = self.model.predict_proba(
                    self.data_features[test]
                )

            if 'accuracy' in metrics:
                acc = self.accuracy(
                    test_responses=fold_responses,
                    predictions=predictions
                )
                self.scores.loc['accuracy']['fold'][fold] = acc

            if 'auc' in metrics:
                fpr, tpr, thresh = skmetrics.roc_curve(
                    y_true=fold_responses,
                    y_score=prob_preds[:, 1],
                    pos_label=self.pos_label
                )
                # keep the curve of every fold; create the list lazily
                if not hasattr(self, 'roc'):
                    self.roc = []
                self.roc.append([fpr, tpr, thresh])
                self.scores.loc['auc']['fold'][fold] = skmetrics.auc(
                    fpr, tpr
                )

            for name, score_fn in per_class_metrics:
                if name in metrics:
                    self._score_per_class(
                        name, score_fn, fold_responses, predictions, fold
                    )

        # add min, max, mean, std over the folds of every metric row
        for row in self.scores.index:
            fold_scores = self.scores.xs(row)['fold']
            self.scores.xs(row)['min'] = np.min(fold_scores)
            self.scores.xs(row)['max'] = np.max(fold_scores)
            self.scores.xs(row)['mean'] = np.mean(fold_scores)
            self.scores.xs(row)['std'] = np.std(fold_scores)

    def _score_per_class(self, name, score_fn, y_true, y_pred, fold):
        '''
        Record `score_fn` for each class, and their mean, for one fold.

        :param str name: metric row name ('precision', 'recall', 'f1')
        :param function score_fn: callable accepting `y_true`, `y_pred`
            and `pos_label` keyword arguments
        :param array-like y_true: true responses of the fold
        :param array-like y_pred: predicted responses of the fold
        :param int fold: index of the fold column to fill
        '''
        total = 0.0
        for cl in self.classes:
            score = score_fn(y_true=y_true, y_pred=y_pred, pos_label=cl)
            total += score
            ind = (name, 'class {}'.format(cl))
            self.scores.xs(ind)['fold'][fold] = score
        # average over the classes actually present rather than
        # assuming there are exactly two
        self.scores.xs((name, 'avg'))['fold'][fold] = (
            total / len(self.classes)
        )

    def build_scores_df(self):
        '''
        Build the (empty) scores dataframe.

        Rows are 'accuracy' and 'auc' plus, for each of 'precision',
        'recall' and 'f1', one row per class and an 'avg' row. Columns
        are one ('fold', i) column per fold in `self.k` followed by the
        aggregates 'min', 'max', 'mean' and 'std'.

        :returns: a `pandas.core.frame.DataFrame` with no values set
        '''
        metrics_level_1 = pd.MultiIndex.from_tuples(
            [('accuracy',), ('auc',)]
        )
        metrics_level_2 = pd.MultiIndex.from_product([
            ['precision', 'recall', 'f1'],
            ['class {}'.format(cl) for cl in self.classes] + ['avg']
        ])
        # `range` instead of `xrange` so this also runs on Python 3
        fold_columns = pd.MultiIndex.from_tuples(
            [('fold', i) for i in range(self.k)]
        )
        agg_columns = pd.MultiIndex.from_tuples(
            [(agg, '') for agg in ['min', 'max', 'mean', 'std']]
        )
        # Spell the set union out with `.union()`: the `|` operator on
        # an index is not a set operation in current pandas.
        # NOTE(review): the row union joins a one-level and a two-level
        # index; confirm the resulting labels on the pandas version in
        # use.
        scores = pd.DataFrame(
            columns=fold_columns.union(agg_columns),
            index=metrics_level_1.union(metrics_level_2)
        )
        return scores

    def accuracy(self, test_responses, test_features=None,
                 predictions=None, train=None):
        '''
        Find the accuracy of the model on a given test set.

        :param array-like test_responses: the responses of the test set
        :param array-like test_features: the features of the test set
            to predict on. If None, use the pre-made `predictions`
        :param array-like predictions: the predictions to test. If
            None, use `test_features` to predict on
        :param array-like train: the data set to train the model on
            (optional); passed as the single argument to the model's
            fit function
        :returns: the fraction of predictions equal to the
            corresponding test response
        :raises EvalError: if there is no model, the model lacks a
            needed predict/fit function, or the numbers of predictions
            and test responses differ
        '''
        # check that the model and its predict function exist; compare
        # against None so models defining __len__/__bool__ are safe
        if self.model is None:
            raise EvalError('Accuracy: no model to evaluate!')
        predict = getattr(self.model, 'predict', None)
        if predict is None:
            raise EvalError('Accuracy: model has no predict function!')

        # train the model if a non-empty training set is given; a
        # plain truth test would be ambiguous for numpy arrays
        if train is not None and len(train) > 0:
            fit = getattr(self.model, 'fit', None)
            if fit is None:
                raise EvalError('Accuracy: training set'
                                + ' given but model has no fit function!')
            fit(train)

        if predictions is None:
            # make the prediction
            predictions = predict(test_features)

        # make sure the lengths of predictions and test_responses match
        if not len(predictions) == len(test_responses):
            raise EvalError('The numbers of predictions'
                            + ' and test responses do not match up!')

        # Compare as arrays: `==` between two plain lists would give a
        # single bool instead of an element-wise result.
        matches = np.asarray(test_responses) == np.asarray(predictions)
        return np.mean(matches)

    def compute_curves(self):
        '''Plot ROC and precision recall curves (not implemented yet).'''
        pass