September 24, 2014

How to Script the Web

A tall order, but this is the first step: converting HTML to JSON — using Python, naturally:

#!/usr/bin/python
from HTMLParser import HTMLParser
import logging
import sys
import urllib2


class HTMLtoJSONParser(HTMLParser):
    """Build a nested dict (JSON-like) representation of an HTML document.

    Layout of the resulting structure:

    * every element becomes a dict stored in its parent under the tag name;
      when a tag occurs more than once under the same parent the value
      becomes a list of dicts, in document order;
    * attributes are stored under ``"#" + attribute_name``;
    * the element's own text is accumulated under the empty-string key ``""``
      and stripped of surrounding whitespace when the element is closed
      (the key is removed when no text remains).

    While an element is open its dict carries a ``"__parent__"`` back
    reference; it is removed when the element is closed.

    Args:
        raise_exception: when true, an end tag that does not match the
            innermost open element raises ``Exception``.  When false the
            problem is logged and the parser recovers.
    """

    # ``str`` plus ``unicode`` on Python 2; collapses to ``str`` on Python 3.
    _STRING_TYPES = (str, type(u""))

    def __init__(self, raise_exception=True):
        HTMLParser.__init__(self)
        self.doc = {}        # root of the generated structure
        self.path = []       # tag names of the currently open elements
        self.cur = self.doc  # dict of the innermost open element
        self.line = 0        # number of newlines seen in text data so far
        self.raise_exception = raise_exception

    @property
    def json(self):
        """Return the root dict built so far (a dict, not a JSON string)."""
        return self.doc

    @staticmethod
    def to_json(content, raise_exception=False):
        """Parse *content* and return the resulting nested dict.

        Args:
            content: the HTML text to parse.
            raise_exception: forwarded to the parser constructor.

        Returns:
            The root dict of the parsed document.
        """
        parser = HTMLtoJSONParser(raise_exception=raise_exception)
        parser.feed(content)
        # Flush text the base parser may still be buffering and finalize
        # elements that were never closed, so no "__parent__" cycle leaks
        # into the returned structure.
        parser.close()
        return parser.json

    def close(self):
        """Flush buffered input, then close every element still open."""
        HTMLParser.close(self)
        while self.path:
            self._close_current()

    def handle_starttag(self, tag, attrs):
        """Open a new element under the current one and descend into it."""
        self.path.append(tag)
        node = {"__parent__": self.cur}
        if tag in self.cur:
            existing = self.cur[tag]
            if isinstance(existing, list):
                existing.append(node)
            else:
                # Second occurrence of this tag: promote the entry to a list.
                self.cur[tag] = [existing, node]
        else:
            self.cur[tag] = node
        self.cur = node

        # A repeated attribute name keeps its last value.
        for name, value in attrs:
            node["#" + name] = value
        node[""] = ""

    def handle_endtag(self, tag):
        """Close the element named *tag*.

        Raises:
            Exception: if *tag* does not match the innermost open element
                and ``raise_exception`` is true.
        """
        if not self.path or tag != self.path[-1]:
            logging.warning(
                "unexpected </%s> on line %d (innermost open element: %s)",
                tag, self.line, self.path[-1] if self.path else None)
            if self.raise_exception:
                raise Exception("html is malformed around line: {0} (it might be because of a tag <br>, <hr>, <img .. > not closed)".format(self.line))
            if tag not in self.path:
                # Stray end tag with no matching open element: ignore it.
                return
            # Implicitly close everything opened after the matching element
            # so the tree stays in sync with the markup.
            while self.path[-1] != tag:
                self._close_current()
        self._close_current()

    def handle_data(self, data):
        """Append text to the current element and count its newlines."""
        self.line += data.count("\n")
        # The root dict has no text slot, so top-level text is dropped.
        if "" in self.cur:
            self.cur[""] += data

    def _close_current(self):
        """Pop the innermost open element and tidy up its dict."""
        del self.path[-1]
        finished = self.cur
        self.cur = finished["__parent__"]
        self.clean(finished)

    def clean(self, values):
        """Strip string values of *values* in place and drop the parent link.

        String values are stripped of surrounding whitespace.  The text key
        ``""`` is removed whenever nothing is left; other keys are removed
        only when stripping emptied them, so an attribute written with a
        genuinely empty value (``alt=""``) is preserved.
        """
        for key in list(values):
            value = values[key]
            if not isinstance(value, self._STRING_TYPES):
                continue
            logging.debug("clean: %s, %s", key, [value])
            stripped = value.strip(" \n\r\t")
            if stripped:
                values[key] = stripped
            elif key == "" or stripped != value:
                del values[key]
        values.pop("__parent__", None)


if __name__ == '__main__':
    # Command-line entry point: fetch the URL given as the first argument
    # and print the structure parsed from its HTML.
    from contextlib import closing

    logging.basicConfig(level=logging.FATAL)
    logging.debug(sys.argv[0])
    if len(sys.argv) < 2:
        # Fail with a usage hint instead of an IndexError traceback.
        sys.exit("usage: {0} URL".format(sys.argv[0]))
    # closing() guarantees the HTTP response is released even if read() fails.
    with closing(urllib2.urlopen(sys.argv[1])) as url:
        content = url.read()
    print(HTMLtoJSONParser.to_json(content))

No comments:

Post a Comment