A tall order, but this is the first step: converting HTML to JSON, using Python, naturally:
#!/usr/bin/python
from HTMLParser import HTMLParser
import logging
import sys
import urllib2
class HTMLtoJSONParser(HTMLParser):
    """Turn an HTML document into a tree of nested dictionaries.

    Layout of the resulting tree:

    * every element becomes a dictionary stored in its parent under the
      tag name; when a tag occurs several times under the same parent the
      entry becomes a list of dictionaries, in document order;
    * every attribute is stored under ``"#" + attribute_name``;
    * the text of an element is accumulated under the empty key ``""``.

    While an element is open its dictionary carries a ``"__parent__"``
    back-reference; it is removed (and string values are stripped) as soon
    as the element is closed.
    """

    # Elements that never take an end tag in HTML.  They are closed as soon
    # as they are opened so that a bare ``<br>`` does not swallow the rest
    # of its parent or unbalance the stack of open elements.
    VOID_ELEMENTS = frozenset([
        "area", "base", "br", "col", "embed", "hr", "img", "input",
        "link", "meta", "param", "source", "track", "wbr",
    ])

    # ``str`` plus the unicode text type (they are the same type on Python 3).
    _TEXT_TYPES = (str, type(u""))

    def __init__(self, raise_exception=True):
        """Create a parser.

        :param raise_exception: when true, a mismatched or stray end tag
            raises ``Exception``; otherwise the problem is reported with
            ``print`` and the parser recovers as best it can.
        """
        # Explicit base-class call: on Python 2 HTMLParser is an old-style
        # class, so super() cannot be used here.
        HTMLParser.__init__(self)
        self.doc = {}            # root of the resulting tree
        self.path = []           # tag names of the currently open elements
        self.cur = self.doc      # dictionary of the innermost open element
        self.line = 0            # newlines seen in text data (approximate)
        self.raise_exception = raise_exception

    @property
    def json(self):
        """The root dictionary built so far."""
        return self.doc

    @staticmethod
    def to_json(content, raise_exception=False):
        """Parse *content* (an HTML string) and return the resulting tree.

        :param content: HTML text handed to ``feed``.
        :param raise_exception: forwarded to the constructor.
        :return: the root dictionary.
        """
        parser = HTMLtoJSONParser(raise_exception=raise_exception)
        parser.feed(content)
        # Flush buffered input and close elements left open at end of input.
        parser.close()
        return parser.json

    def close(self):
        """Finish parsing and close every element that is still open.

        Without this, elements missing their end tag would keep their
        ``"__parent__"`` back-reference, leaving cycles in the result.
        """
        HTMLParser.close(self)
        while self.path:
            self._close_current()

    def handle_starttag(self, tag, attrs):
        """Open a new element below the current one and make it current."""
        self.path.append(tag)
        node = {"__parent__": self.cur}
        if tag in self.cur:
            siblings = self.cur[tag]
            if not isinstance(siblings, list):
                # Second occurrence of this tag: promote the entry to a list.
                siblings = self.cur[tag] = [siblings]
            siblings.append(node)
        else:
            self.cur[tag] = node
        self.cur = node
        # For a repeated attribute the last value wins.
        for name, value in attrs:
            node["#" + name] = value
        node[""] = ""
        if tag in self.VOID_ELEMENTS:
            self._close_current()

    def handle_endtag(self, tag):
        """Close the current element, recovering from mismatched end tags.

        :raises Exception: if the end tag does not match the innermost open
            element and ``raise_exception`` is true.
        """
        if tag in self.VOID_ELEMENTS:
            # Already closed by handle_starttag (covers <br/> and </br>).
            return
        if not self.path or tag != self.path[-1]:
            print('{0} not closed, on line {1}'.format(tag, self.line))
            if self.raise_exception:
                raise Exception("html is malformed around line: {0} (it might be because of a tag <br>, <hr>, <img .. > not closed)".format(self.line))
            if tag not in self.path:
                # Stray end tag: there is nothing to close.
                return
            # Implicitly close everything opened after the matching start tag.
            while self.path[-1] != tag:
                self._close_current()
        self._close_current()

    def handle_data(self, data):
        """Append text to the current element; text at the root is dropped."""
        self.line += data.count("\n")
        if "" in self.cur:
            self.cur[""] += data

    def clean(self, values):
        """Strip the string values of a closed element, in place.

        A value that becomes empty once stripped is removed; a value that is
        already stripped (including ``""``) is left untouched.  The
        ``"__parent__"`` back-reference is removed as well.
        """
        # Iterate over a snapshot of the keys: entries may be deleted.
        for key in list(values.keys()):
            value = values[key]
            if not isinstance(value, self._TEXT_TYPES):
                continue
            logging.debug("clean: %s, %s", key, [value])
            stripped = value.strip(" \n\r\t")
            if stripped == value:
                continue
            if stripped:
                values[key] = stripped
            else:
                del values[key]
        values.pop("__parent__", None)

    def _close_current(self):
        """Pop the innermost open element and finalise its dictionary."""
        del self.path[-1]
        closed = self.cur
        self.cur = closed["__parent__"]
        self.clean(closed)
# Command-line entry point: fetch the URL given as the first argument and
# print the tree built from its HTML.
if __name__ == '__main__':
    logging.basicConfig(level=logging.FATAL)
    logging.debug(sys.argv[0])
    if len(sys.argv) < 2:
        # Exit with a usage message instead of an IndexError traceback.
        sys.exit("usage: {0} URL".format(sys.argv[0]))
    url = urllib2.urlopen(sys.argv[1])
    try:
        content = url.read()
    finally:
        # Release the connection even if reading fails.
        url.close()
    print(HTMLtoJSONParser.to_json(content))
September 24, 2014
How to Script the Web
Subscribe to:
Post Comments (Atom)
No comments:
Post a Comment