diff --git a/import/importjargon.py b/import/importjargon.py index 9945be5..173e976 100644 --- a/import/importjargon.py +++ b/import/importjargon.py @@ -1,20 +1,40 @@ import os -import HTMLParser +import HTMLParser, urllib, urlparse + +class JargonParser(HTMLParser.HTMLParser): + def __init__ (self): + HTMLParser.HTMLParser.__init__ (self) + self.seen = {} + self.currentSection='' + self.title = '' + def handle_data(self, data): + if self.currentSection is not '': + if "head" in self.currentSection: + # store the title + self.title = data + print "Title: " + self.title + else: + print self.currentSection + ": " + data + def handle_endtag(self, tag): + if "head" in self.currentSection or "body" in self.currentSection: + currentSection = ''; + def handle_starttag(self, tag, attributes): + if "head" in tag or "body" in tag: + self.currentSection = tag; + #print "Tag: " + tag def jargonReadFile(filename): inFile = open(filename) buffer = "" for line in inFile: buffer = buffer + line - parser = HTMLParser.HTMLParser() + parser = JargonParser() parser.feed(buffer) def jargonImport(rootDir): for dirName, subdirList, fileList in os.walk(rootDir): - print('Found directory: %s' % dirName) for filename in fileList: - print('\t%s' % filename) jargonReadFile(dirName + '/' + filename) if __name__ == "__main__": - jargonImport('original') + jargonImport('../original')