# Source code for cRedditscore.cRedditscore

# -*- coding: utf-8 -*-
'''
A module of tools to train and package
predictive models for the quality of comments on reddit.
'''

import pandas as pd
from sklearn import cross_validation
from sklearn import pipeline
from sklearn.feature_extraction import text
from sklearn import naive_bayes
import pickle as pk


def get_quality(score, low_thresh=0, high_thresh=15):
    '''
    Classify a comment score as good, bad or neutral.

    A score strictly greater than `high_thresh` is ``'good'``, a score
    strictly less than `low_thresh` is ``'bad'``, and every other score
    (including one equal to either threshold) is ``'neutral'``.
    The high threshold is tested first, so when the thresholds are
    passed in an overlapping order a score above `high_thresh` is
    reported as ``'good'`` even if it is also below `low_thresh`.

    For example,

    >>> get_quality(score=15)
    'neutral'
    >>> get_quality(score=2, low_thresh=5, high_thresh=20)
    'bad'
    >>> get_quality(score=2, low_thresh=-10, high_thresh=1)
    'good'

    :param int score: the score of the comment
    :param int low_thresh: the low threshold of a neutral comment
    :param int high_thresh: the high threshold of a neutral comment
    :returns: the quality of the comment (good, bad or neutral)
    :rtype: string
    '''
    if score > high_thresh:
        return 'good'
    if score < low_thresh:
        return 'bad'
    # Anything within [low_thresh, high_thresh] is neither good nor bad.
    return 'neutral'


class TermFreqModel(object):
    '''
    A Naive Bayes model predicting the quality of
    comments on Reddit. The data is input as a pandas dataframe
    with columns

    * *comment_id*: a unique identifier for each comment
    * *score* (int): the score of the comment
    * *content*: the comment itself

    For example, we make a small comments data set

    >>> import pandas as pd
    >>> test_df = pd.DataFrame([
    ...     [1, 1, "That's cool", 1],
    ...     [2, -3, 'boo you', 2],
    ...     [3, 4, 'I love you', 2],
    ...     [3, 16, 'I love you', 4],
    ...     ], columns=['comment_id', 'score', 'content', 'timestamp'])

    and build a `TermFreqModel` object from it.

    >>> tfm = TermFreqModel(comments_df = test_df)

    Constructing the object immediately prepares the data
    (see :meth:`setup_data`); splitting, building and fitting the model
    are separate, explicit steps (:meth:`train_test`, :meth:`make_model`,
    :meth:`fit`).

    :param pandas.core.frame.DataFrame comments_df:
        the dataframe containing the comment data
    :param int low_thresh:
        the lower bound for the score of a neutral comment.
        Anything lower is considered a bad comment
    :param int high_thresh:
        the upper bound for the score of a neutral comment.
        Anything higher is considered a good comment

    '''

    def __init__(self, comments_df, low_thresh=0, high_thresh=15):
        self.comments_df = comments_df
        self.high_thresh = high_thresh
        self.low_thresh = low_thresh

        # Derive the de-duplicated, labelled data sets right away so the
        # object is ready for train_test() as soon as it is constructed.
        self.setup_data()

    def setup_data(self):
        '''
        Set up the data for model training.
        In detail:

        * Remove all but the most recent observation for each comment
        * Add the quality feature to the data, which will be our outcome
          variable
        * Separate out the good and bad comments. This will be the data
          we train the model on.

        This function adds new instance attributes

        * *comments_data_set*:
            one row per comment (the last observation of each),
            with the added `qual` column
        * *good_df* and *bad_df*:
            the rows of *comments_data_set* labelled good and bad
        * *good_bad_df*:
            the dataframe containing only
            the most recent observations of the
            good and bad comments
        '''
        # Pick only the most recent observation of each comment
        self.comments_data_set = self.most_recent_obs(self.comments_df)

        # Add the quality feature to the data (modifies the frame in place)
        self.add_qual_feature(self.comments_data_set)

        # Get the good and bad comments
        self.good_df, self.bad_df = self.get_good_bad(self.comments_data_set)

        # Combine good and bad comments; neutral comments are left out
        self.good_bad_df = pd.concat([self.good_df, self.bad_df])

    def train_test(self, test_size=0.2):
        '''
        Split the data into train and test sets.

        The comment text (`content`) is the feature and the quality
        label (`qual`) is the outcome. The split uses a fixed
        `random_state` of 0, so it is reproducible.

        This function adds new instance attributes

        * *X_train* and *X_test*,
            The features of the training and test parts of the data set
        * *y_train* and *y_test*,
            The outcomes of the training and test parts of the data set

        :param float test_size:
            the share of data points to hold out for testing;
            passed unchanged to `train_test_split`
        '''

        train_test = cross_validation.train_test_split(
            self.good_bad_df.content,
            self.good_bad_df.qual,
            test_size=test_size, random_state=0
            )

        self.X_train, self.X_test, self.y_train, self.y_test = train_test

    def make_model(self,
                   test_size=0.2,
                   ngram_range=(1, 4),
                   max_features=1000
                   ):
        '''
        Make new instance attributes

        * *cvec*:
            the `CountVectorizer` used as the first pipeline step
        * *model*:
            the Naive Bayes model as an *sklearn.pipeline.Pipeline* object

        The model is only constructed here, not trained;
        call :meth:`fit` afterwards.

        For example, we make a small comments data set,

        >>> import pandas as pd
        >>> test_df = pd.DataFrame([
        ...     [1, 1, "That's cool", 1],
        ...     [2, -3, 'boo you', 2],
        ...     [3, 4, 'I love you', 2],
        ...     [3, 16, 'I love you', 4],
        ...     ], columns=['comment_id', 'score', 'content', 'timestamp'])

        build a `TermFreqModel` object from it,

        >>> tfm = TermFreqModel(test_df)

        and train a model on it predictive of the quality of a comment.
        Only two of these comments are labelled good or bad, so half of
        them are held out to leave one training and one test sample.

        >>> tfm.train_test(test_size=0.5)
        >>> tfm.make_model(ngram_range=(1, 3), max_features=10)
        >>> tfm.fit()
        >>> prediction = tfm.model.predict(['Thanks for a great post!'])
        >>> prediction in ['good', 'bad']
        True

        :param float test_size:
            unused by this method; kept so existing callers that pass
            it keep working. Use :meth:`train_test` to split the data
        :param tuple ngram_range:
            the range of n for ngrams to include as features
        :param int max_features:
            the maximum number of features to include
        '''

        # Make the count vectorizer
        self.cvec = text.CountVectorizer(
            ngram_range=ngram_range,
            stop_words='english',
            max_features=max_features
            )

        # Make the pipeline: counts -> term frequencies (no idf) -> NB
        self.model = pipeline.Pipeline([
            ('vect', self.cvec),
            ('tfidf', text.TfidfTransformer(use_idf=False)),
            ('gnb', naive_bayes.MultinomialNB()),
            ])

    def fit(self):
        '''
        Fit the model on the training split.

        Requires :meth:`train_test` and :meth:`make_model` to have been
        called first, since it reads *X_train*, *y_train* and *model*.
        '''
        self.model.fit(self.X_train, self.y_train)

    def dump_model(self, pickle_name='text_mnb_model'):
        '''
        Dump the model object to file with `pickle`.

        The file is opened in binary mode, as `pickle` requires, and is
        closed again even if pickling fails.

        :param string pickle_name:
            the name of the file to dump the object to
        '''
        # NOTE: pickle files can execute code when loaded; only load
        # model files that come from a trusted source.
        with open(pickle_name, 'wb') as write_file:
            pk.dump(self.model, write_file)

    def get_good_bad(self, df):
        '''
        Get the good and bad comments in a data set.

        :param pandas.core.frame.DataFrame df:
            the comments data set; must already have the `qual` column
            (see :meth:`add_qual_feature`)

        :return:
            the dataframes containing
            only the good and bad comments from `df`
        :rtype: pandas.core.frame.DataFrame, pandas.core.frame.DataFrame
        '''
        # Filter the frame that was passed in, not the instance's own
        # data set, so the method honours its documented parameter.
        good_df = df[df.qual == 'good']
        bad_df = df[df.qual == 'bad']

        return good_df, bad_df

    def add_qual_feature(self, df):
        '''
        Add the comment quality feature to the data
        as a new column named `qual`.
        This will be our outcome variable.

        The column is written into `df` itself; nothing is returned.
        Each score is labelled with `get_quality` using this object's
        `low_thresh` and `high_thresh`.

        :param pandas.core.frame.DataFrame df:
            the data frame to add the `qual` column to
        '''

        def label_score(score):
            # Label one score with the thresholds chosen for this model.
            return get_quality(
                score=score,
                low_thresh=self.low_thresh,
                high_thresh=self.high_thresh)

        df['qual'] = df.score.apply(label_score)

    def most_recent_obs(self, df):
        '''
        Select the most recent observation of each comment in
        a data set.

        "Most recent" means the last row of each `comment_id` group in
        the order the rows appear in `df`; no timestamp column is
        consulted, so `df` is expected to be in chronological order.
        NOTE(review): pandas' `GroupBy.last` is documented as taking the
        last non-null value of each column, so a missing value in the
        final row may be filled from an earlier one -- confirm this is
        acceptable for the data being loaded.

        :param pandas.core.frame.DataFrame df:
            The data set of comments

        :returns:
            the data set containing only the most
            recent observation of each comment in `df`
        :rtype: pandas.core.frame.DataFrame
        '''

        return df.groupby(df.comment_id).last()

    def get_data(self):
        '''
        Get the full data set of the model.

        This is the de-duplicated, labelled frame built by
        :meth:`setup_data`, including neutral comments.

        :returns:
            the full data set underlying the model
        :rtype: pandas.core.frame.DataFrame
        '''

        return self.comments_data_set