September 20, 2014

How to Visualise Email Traffic

Tonight I went back to some earlier data wanking I'd done on my email -- namely, "what hour do I receive the most email?" This time, though, there aren't two separate scripts: the entire program is 57 lines of code, a lot of which is logging (I like to see what's going on, ok?):

#!python
import argparse
import email
import ggplot
import pyimgur
import logging
import os
import pandas as pd
import poplib
import tempfile

HOURS_PER_DAY = 24


def hour_from_date_header(date_header):
    """Return the hour (0-23) written in an RFC 2822 ``Date`` header value.

    The hour is taken exactly as it appears in the header, i.e. in the
    sender's UTC offset; it is not converted to the local time zone.

    Returns ``None`` when the header is missing, cannot be parsed, or carries
    an hour outside 0-23, so callers can skip the message instead of crashing.
    """
    # Local import: a bare ``import email`` does not guarantee that the
    # ``email.utils`` submodule has been loaded.
    from email.utils import parsedate_tz

    if not date_header:
        return None
    parsed = parsedate_tz(date_header)
    if parsed is None:
        return None
    hour = parsed[3]
    if not 0 <= hour < HOURS_PER_DAY:
        return None
    return hour


def empty_hour_counts():
    """Return a frame with one zeroed row per hour; row index == hour (0-23)."""
    return pd.DataFrame(
        {
            'Hour of Day': list(range(HOURS_PER_DAY)),
            'Number of Messages': [0] * HOURS_PER_DAY,
        },
        columns=['Hour of Day', 'Number of Messages'],
    )


def record_hour(dates, hour):
    """Increment, in place, the message count of ``dates`` for ``hour``."""
    # ``.loc`` writes to the frame itself; assigning through ``xs(...)[...]``
    # is chained assignment and may update a temporary copy instead.
    dates.loc[hour, 'Number of Messages'] += 1


def parse_message(raw_lines):
    """Build an ``email`` message object from the lines returned by ``retr``."""
    # poplib yields ``str`` lines on Python 2 and ``bytes`` lines on Python 3;
    # ``b'\n'`` joins both, and ``message_from_bytes`` only exists on Python 3.
    parser = getattr(email, 'message_from_bytes', email.message_from_string)
    return parser(b'\n'.join(raw_lines))


def main():
    """Count mailbox messages per hour of day, plot them and upload the plot.

    Connects to a POP3-over-SSL server, retrieves every message, tallies the
    hour found in each ``Date`` header, renders a bar plot with ggplot and
    uploads the resulting PNG to Imgur, printing the link.
    """
    argparser = argparse.ArgumentParser(description='Which hours does one receive gmail?')
    argparser.add_argument('-s', '--server', action='store', help='POP3 SSL server, defaults to pop.gmail.com', default='pop.gmail.com')
    # Required: without a user the POP3 login fails with an opaque error.
    argparser.add_argument('-u', '--user', action='store', required=True, help='Username, fully qualified with domain')
    argparser.add_argument('-p', '--password', action='store', help='Password for username, not stored')
    argparser.add_argument('-v', '--verbose', help='Enable debugging', action='store_true')
    arguments = argparser.parse_args()

    if arguments.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.FATAL)

    password = arguments.password
    if password is None:
        # Prompt rather than crash; this also keeps the password out of the
        # shell history and the process list.
        import getpass
        password = getpass.getpass('Password for {}: '.format(arguments.user))

    dates = empty_hour_counts()
    logging.debug('data frame initalised')

    # connect to host using SSL
    pop3_mail = poplib.POP3_SSL(arguments.server)
    try:
        # print the response message from server
        print(pop3_mail.getwelcome())

        pop3_mail.user(arguments.user)
        pop3_mail.pass_(password)
        logging.debug('logged in to mail')

        total_messages = pop3_mail.stat()[0]
        logging.debug('All selected')
        logging.debug('{} total messages'.format(total_messages))
        logging.debug('iterating through messages')

        # POP3 numbers messages 1..total inclusive, hence the ``+ 1``.
        for message_number in range(1, total_messages + 1):
            raw_email = pop3_mail.retr(message_number)[1]
            logging.debug('Email: {}'.format(raw_email))
            message = parse_message(raw_email)
            hour = hour_from_date_header(message['Date'])
            logging.debug('Hour: {}'.format(hour))
            if hour is None:
                logging.debug('Message {} has no usable Date header, skipped'.format(message_number))
                continue
            record_hour(dates, hour)
    finally:
        # Always end the session so the server releases the mailbox lock.
        pop3_mail.quit()

    logging.debug(dates)
    # NOTE(review): the plot maps x to 'Number of Messages', which looks like it
    # bins the counts rather than drawing one bar per hour -- confirm against
    # the installed ggplot version before changing the aesthetics.
    plot = (
        ggplot.ggplot(dates, ggplot.aes(x='Number of Messages'))
        + ggplot.geom_bar()
        + ggplot.ggtitle('Hour of Day for emails to {}'.format(arguments.user))
        + ggplot.xlab('Message Count / hour')
        + ggplot.theme_seaborn()
    )
    logging.debug('generated plot')

    # The context manager closes (and thereby deletes) the temporary PNG once
    # the upload has finished.
    with tempfile.NamedTemporaryFile(prefix='mail', suffix='.png') as out:
        ggplot.ggsave(filename=out.name, plot=plot)
        im = pyimgur.Imgur(u'72a4e1b18bf0d6b')
        uploaded_image = im.upload_image(out.name, title="Uploaded with PyImgur")
    print('Plot available at {}'.format(uploaded_image.link))


if __name__ == '__main__':
    main()

A sample run results in the following image:

No comments:

Post a Comment