2014-04-06 11:06:23 +01:00
|
|
|
import os
|
2014-04-26 14:04:57 +01:00
|
|
|
import string
|
2014-04-06 13:23:46 +01:00
|
|
|
import HTMLParser, urllib, urlparse
|
|
|
|
|
|
2014-04-06 16:16:39 +01:00
|
|
|
class JargonFile(dict):
    """A dict that prints a trace line for every get, set and update.

    Item reads print ``GET <key>``, item writes print ``SET <key> <val>``
    and ``update()`` prints its arguments before storing each pair through
    ``__setitem__`` (so every stored pair is traced as a ``SET`` as well).
    """

    def __init__(self, *arg, **kw):
        # The original named a non-existent class (CustomDictOne) here,
        # which raised NameError on every instantiation.
        super(JargonFile, self).__init__(*arg, **kw)

    def __getitem__(self, key):
        """Return the value stored under key, printing a GET trace line."""
        val = dict.__getitem__(self, key)
        # single-argument print() behaves the same on Python 2 and 3
        print('GET %s' % (key,))
        return val

    def __setitem__(self, key, val):
        """Store val under key, printing a SET trace line first."""
        print('SET %s %s' % (key, val))
        dict.__setitem__(self, key, val)

    def __repr__(self):
        """Return ``ClassName({...})`` using the plain dict representation."""
        dictrepr = dict.__repr__(self)
        return '%s(%s)' % (type(self).__name__, dictrepr)

    def update(self, *args, **kwargs):
        """Merge the given mapping/pairs/keywords, tracing each assignment."""
        print('update %s %s' % (args, kwargs))
        # items() exists on both Python 2 and 3 (iteritems() is 2-only)
        for k, v in dict(*args, **kwargs).items():
            self[k] = v
|
|
|
|
|
|
2014-04-06 13:23:46 +01:00
|
|
|
class JargonParser(HTMLParser.HTMLParser):
    """HTML parser that collects the title and body text of a page.

    After ``feed()``:

    * ``title`` is the stripped text of the most recent data chunk seen
      while the current section is ``head``;
    * ``bodyText`` is the concatenation of every data chunk seen while the
      current section is ``body``, each with its whitespace runs collapsed
      to single spaces and followed by one trailing space.
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self.seen = {}
        # name of the last <head> or <body> tag opened ('' before either)
        self.currentSection = ''
        self.title = ''
        self.bodyText = ''

    def handle_data(self, data):
        """Record a chunk of text according to the section it appears in."""
        if self.currentSection == "head":
            # store the title; a new head chunk restarts the body text
            self.title = data.strip()
            self.bodyText = ''
        elif self.currentSection == "body":
            # Collapse tabs, newlines and repeated spaces into single
            # spaces and trim both ends.
            data = ' '.join(data.split())
            self.bodyText = self.bodyText + data + ' '

    def handle_starttag(self, tag, attributes):
        """Track whether the parser is inside <head> or <body>."""
        # Exact match only: a substring test would let <thead>, <header>
        # or <tbody> switch the section and corrupt the title.
        if tag in ("head", "body"):
            self.currentSection = tag
|
2014-04-06 15:42:45 +01:00
|
|
|
|
2014-04-26 14:04:57 +01:00
|
|
|
# Further sanitise the returned text
|
|
|
|
|
def jargonSaneText(title, text):
    """Extract and tidy the description part of an entry's body text.

    The text is split on the first separator that occurs in it, trying in
    order ``' : '``, ``'] '`` and ``' adj. '``; the description is the
    segment that follows the first occurrence of that separator (up to the
    next occurrence, if any). If none of them occurs, the text following
    the second occurrence of ``title`` is used instead.

    Everything after the last full stop is then dropped, spacing around
    full stops is tidied, repeated spaces are collapsed, characters outside
    ``string.printable`` are removed and the result is stripped.

    Returns '' when ``text`` is shorter than two characters or when no
    description could be located.
    """
    if len(text) < 2:
        return ''

    # usually in the format (title : text)
    initsplit = text.split(' : ')
    if len(initsplit) < 2:
        # sometimes in the format (title[blurb] text)
        initsplit = text.split('] ')
    if len(initsplit) < 2:
        # sometimes in the format (title adj. text)
        initsplit = text.split(' adj. ')

    # if all else fails look for the second instance of the title text
    # (an empty title cannot be used as a separator, so skip it)
    if len(initsplit) < 2 and title:
        testsplit = text.split(title)
        if len(testsplit) >= 3:
            # Pieces after the second occurrence are rejoined with single
            # spaces; a piece that is exactly one space stands for the
            # title itself.
            pieces = [title if piece == ' ' else piece
                      for piece in testsplit[2:]]
            initsplit = [testsplit[0], ' '.join(pieces)]

    if len(initsplit) < 2:
        return ''

    # get the second part of the split array (i.e. the description text)
    text = initsplit[1]

    # keep only complete sentences: cut after the last full stop
    lastStop = text.rfind('.')
    if lastStop != -1:
        text = text[:lastStop + 1]

    text = text.replace(' . ', '. ')
    text = text.replace(' .', '. ')
    # collapse runs of spaces down to one
    while '  ' in text:
        text = text.replace('  ', ' ')
    # join() keeps the result a string on Python 3, where filter() on a
    # string returns an iterator with no strip() method
    text = ''.join([ch for ch in text if ch in string.printable])

    return text.strip()
|
2014-04-06 11:06:23 +01:00
|
|
|
|
2014-04-26 14:04:57 +01:00
|
|
|
def validTitle(title):
    """Return True when title may be used for an entry.

    A title is rejected when it is empty (or None), contains the character
    '\\xc2', or starts with "Letters", "Comments" or "Glossary".
    """
    # Truthiness instead of `is ''`: identity against a string literal
    # only works by accident of interning.
    if not title:
        return False

    if '\xc2' in title:
        return False

    if title.startswith(("Letters", "Comments", "Glossary")):
        return False

    return True
|
|
|
|
|
|
|
|
|
|
# remove any invalid characters from an entry title
|
|
|
|
|
# so that it can be saved in a filename
|
|
|
|
|
def jargonSaneTitle(title):
    """Return title with every '/' swapped for '-'."""
    return title.replace('/', '-')
|
|
|
|
|
|
2014-04-26 16:54:15 +01:00
|
|
|
# limit line lengths so that entries are more readable
|
|
|
|
|
def jargonPageColumns(text, columns):
    """Re-wrap text so that lines stay below the given column limit.

    The text is split on single spaces and the words are packed greedily
    onto lines; a new line is started before a word whenever the running
    length (each word counted as its length plus one) would reach
    ``columns``. A word that is itself wider than the limit is placed on a
    line of its own. Every line of the result, including the last, ends
    with a newline.
    """
    lines = []
    line = ''
    lineLen = 0
    for word in text.split(' '):
        if lineLen + len(word) + 1 >= columns:
            # Only flush a line that has content, otherwise an over-long
            # word arriving on an empty line would emit a blank line.
            if line:
                lines.append(line)
            line = ''
            lineLen = 0
        lineLen = lineLen + len(word) + 1
        if line == '':
            line = word
        else:
            line = line + ' ' + word

    lines.append(line)
    return '\n'.join(lines) + '\n'
|
|
|
|
|
|
2014-04-26 14:04:57 +01:00
|
|
|
def jargonCreateEntry(title, text, outputDir):
    """Write one entry to ``<outputDir>/<sanitised title>.txt``.

    The file contains the title, a blank line, and the text wrapped by
    jargonPageColumns with a limit of 78. An existing file is never
    overwritten.

    Returns the path of the file written, or '' when the file already
    existed and nothing was written.
    """
    # create the filename for the entry
    filename = outputDir
    if not outputDir.endswith('/'):
        filename = filename + '/'
    filename = filename + jargonSaneTitle(title) + '.txt'

    # don't overwrite existing files
    if os.path.isfile(filename):
        return ''

    text = jargonPageColumns(text, 78)

    # `with` guarantees the close; the original wrote `fp.close` without
    # parentheses, which never actually closed the file.
    with open(filename, 'w') as fp:
        fp.write(title + '\n\n' + text + '\n')
    return filename
|
|
|
|
|
|
2014-04-26 15:11:49 +01:00
|
|
|
def jargonReadFile(filename, exclusions, outputDir):
    """Parse one HTML file and, if it is a usable entry, write it out.

    The file is fed through JargonParser. When the parsed title passes
    validTitle, is longer than one character, the body text is non-empty,
    and neither the title nor its sanitised form is listed in
    ``exclusions``, the entry is written with jargonCreateEntry and the
    resulting filename is printed (nothing is printed when the entry file
    already existed).
    """
    # read the whole file and make sure the handle is closed again
    with open(filename) as inFile:
        html = inFile.read()

    parser = JargonParser()
    parser.feed(html)

    # equality, not identity: `is not ''` depends on string interning
    if not validTitle(parser.title) or \
       parser.bodyText == '' or \
       len(parser.title) <= 1:
        return

    if (jargonSaneTitle(parser.title) in exclusions) or \
       (parser.title in exclusions):
        return

    saneBodyText = jargonSaneText(parser.title, parser.bodyText)
    entryFilename = jargonCreateEntry(parser.title, saneBodyText, outputDir)
    if entryFilename != '':
        # single-argument print() behaves the same on Python 2 and 3
        print(entryFilename)
|
2014-04-26 15:11:49 +01:00
|
|
|
|
|
|
|
|
# read original jargon file entries to be excluded
|
|
|
|
|
def jargonReadExclusions(filename):
    """Return the lines of the exclusions file without line endings.

    Returns an empty list when ``filename`` is empty (or None) or does not
    name an existing file.
    """
    if not filename or not os.path.isfile(filename):
        return []

    # The with-block closes the file; no explicit close() is needed.
    with open(filename) as fp:
        # rstrip('\r\n') also removes the '\r' left by CRLF line endings
        return [line.rstrip('\r\n') for line in fp]
|
2014-04-06 11:06:23 +01:00
|
|
|
|
2014-04-26 14:04:57 +01:00
|
|
|
def jargonImport(rootDir, excludeEntriesFilename, outputDir):
    """Walk rootDir and hand every file found to jargonReadFile.

    The exclusion list is loaded once from excludeEntriesFilename and
    passed, together with outputDir, to each jargonReadFile call.
    """
    excluded = jargonReadExclusions(excludeEntriesFilename)

    for folder, _subdirs, names in os.walk(rootDir):
        for name in names:
            path = folder + '/' + name
            jargonReadFile(path, excluded, outputDir)
|
2014-04-06 11:06:23 +01:00
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: import from ../original into ../entries,
    # skipping the titles listed in exclusions.txt.
    jargonImport(
        rootDir='../original',
        excludeEntriesFilename='exclusions.txt',
        outputDir='../entries'
    )
|