July 11, 2014

How to Track Multiple Websites For Changes

Just enabled multiple-page support for the poor man's web-page delta tracker:


#!/Users/hdiwan/.virtualenvs/globetrekker/bin/python
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
import argparse
import hashlib
import json
import logging
import pprint
import smtplib


def get_globetrekker_page(site):
    """Open *site* in a fresh Chrome session and return the live driver.

    The caller owns the returned driver and is responsible for
    calling .quit() on it when finished.
    """
    driver = webdriver.Chrome()
    driver.get(site)
    return driver


def send_mail(msg, user, password):
    """Email *msg* to *user* (self-addressed) through Gmail's SMTP relay.

    *msg* must already contain any headers it needs, e.g.
    'Subject: ...\r\n\r\nbody'.  Authenticates as *user*/*password* over
    STARTTLS on port 587.

    Fix: the SMTP connection was never closed, leaking a socket per call
    (and leaving it open when login/send raised).  The connection is now
    always shut down via try/finally.
    """
    server = smtplib.SMTP('smtp.gmail.com', 587)
    try:
        server.ehlo()
        server.starttls()
        server.ehlo()  # second EHLO required after STARTTLS
        server.login(user, password)
        server.sendmail(user, user, msg)
    finally:
        server.quit()


if __name__ == '__main__':
    argparser = argparse.ArgumentParser(description='Check a website for changes')
    argparser.add_argument('-n', '--url', type=str, default=None, help='Add URL to watcher',  action='store')
    argparser.add_argument('-l', '--list', action='store_true')
    argparser.add_argument('-u', '--user', type=str, default='hd1@jsc.d8u.us', help='Your username',  action='store')
    argparser.add_argument('-p', '--password', type=str, help='Your password', action='store')
    # -v enables debug logging.  store_true + a direct test replaces the
    # old store_false/double-negative; command-line behavior is identical.
    argparser.add_argument('-v', '--verbose', action='store_true')
    parsed = argparser.parse_args()

    logging.basicConfig(level=logging.DEBUG if parsed.verbose else logging.FATAL)

    # State file: a JSON list of one-entry {url: sha1-hex-or-0} dicts.
    state_path = '/var/tmp/.globetrekker.txt'

    if parsed.url:
        new_entry = {parsed.url: 0}
        logging.debug(json.dumps(new_entry))
        try:
            with open(state_path, 'r') as fin:
                data = json.load(fin)
        except IOError:
            # First run: no state file yet.
            data = []
        # BUG FIX: the appended entry was never written back to disk, so
        # only the very first URL ever persisted.  Always rewrite the file.
        data.append(new_entry)
        with open(state_path, 'w') as fout:
            json.dump(data, fout)
        exit()

    with open(state_path, 'r') as fin:
        stored_hash = json.load(fin)
    logging.debug(stored_hash)

    if parsed.list:
        for entry in stored_hash:
            print(entry)
        exit()

    for stored_hash_ in stored_hash:
        for url in stored_hash_.keys():
            logging.debug('{} is our URL'.format(url))
            try:
                browser = get_globetrekker_page(url)
            except WebDriverException as e:
                # Unreachable/broken site: skip it, keep checking the rest.
                continue
            try:
                text = browser.find_element_by_tag_name('html').text
            finally:
                # BUG FIX: quit every browser instance.  The original only
                # quit the last one after the loop (leaking one Chrome per
                # URL) and raised NameError if no page ever loaded.
                browser.quit()
            encoded = text.encode('ascii', errors='replace')
            logging.debug(encoded)
            # Hash the ascii bytes directly.  Hashing the re-decoded text
            # relied on Python 2's implicit ascii encode inside hashlib
            # and breaks on Python 3; the digest is unchanged.
            new_hash = hashlib.sha1(encoded).hexdigest()
            logging.debug('Calculated hash code: {}'.format(new_hash))
            logging.debug('Stored hash: {}'.format(stored_hash_[url]))
            if new_hash != stored_hash_[url]:
                logging.debug('{} changed'.format(url))
                send_mail(u'Subject: {} Change detected\r\n\r\n--H'.format(url), parsed.user, parsed.password)
                stored_hash_[url] = new_hash

    with open(state_path, 'w') as fout:
        json.dump(stored_hash, fout)

No comments:

Post a Comment