Files
JargonFile/import/importjargon.py

210 lines
5.8 KiB
Python
Raw Normal View History

2014-04-06 11:06:23 +01:00
import os
2014-04-26 14:04:57 +01:00
import string
2014-04-06 13:23:46 +01:00
import HTMLParser, urllib, urlparse
2014-04-06 16:16:39 +01:00
class JargonFile(dict):
    """A dict that reports every item read, item write and update on stdout.

    Behaves like a plain dict otherwise; ``repr`` is prefixed with the
    class name.
    """

    def __init__(self, *arg, **kw):
        # The original called super(CustomDictOne, self), a class that does
        # not exist in this file, so every construction raised NameError.
        super(JargonFile, self).__init__(*arg, **kw)

    def __getitem__(self, key):
        """Return the value stored under key, logging the access."""
        val = dict.__getitem__(self, key)
        # Single-argument print(...) produces the same output on
        # Python 2 and Python 3.
        print('GET %s' % (key,))
        return val

    def __setitem__(self, key, val):
        """Store val under key, logging the assignment."""
        print('SET %s %s' % (key, val))
        dict.__setitem__(self, key, val)

    def __repr__(self):
        dictrepr = dict.__repr__(self)
        return '%s(%s)' % (type(self).__name__, dictrepr)

    def update(self, *args, **kwargs):
        """Merge the given mapping/keywords, routing each item through
        __setitem__ so that every assignment is logged."""
        print('update %s %s' % (args, kwargs))
        for k, v in dict(*args, **kwargs).items():
            self[k] = v
2014-04-06 13:23:46 +01:00
class JargonParser(HTMLParser.HTMLParser):
    """HTML parser that collects a page's title and its body text.

    After feed():
      title    -- the last non-blank text chunk seen inside <head>
      bodyText -- every text chunk seen inside <body>, whitespace
                  normalised, each chunk followed by a single space
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self.seen = {}
        # '' until a <head> or <body> start tag is seen, then that tag name
        self.currentSection = ''
        self.title = ''
        self.bodyText = ''

    def handle_data(self, data):
        """Route a chunk of character data to the title or the body text."""
        if self.currentSection == 'head':
            heading = data.strip()
            # Ignore whitespace-only chunks (e.g. the newline before
            # </head>); previously they overwrote the title with ''.
            if heading:
                # store the title
                self.title = heading
                self.bodyText = ''
        elif self.currentSection == 'body':
            # Collapse tabs, newlines and runs of spaces into single spaces.
            # The original replacement list held ' ' -> ' ' entries that
            # did nothing, so runs of spaces were never collapsed.
            self.bodyText = self.bodyText + ' '.join(data.split()) + ' '

    def handle_starttag(self, tag, attributes):
        """Remember whether the parser is currently inside <head> or <body>."""
        # Exact match: the original substring test ("head" in tag) also
        # fired for tags such as <thead> or <header>, which switched the
        # parser back to title mode in the middle of the body.
        if tag in ('head', 'body'):
            self.currentSection = tag
2014-04-06 15:42:45 +01:00
2014-04-26 14:04:57 +01:00
# Further sanitise the returned text
def jargonSaneText(title, text):
    """Extract the description part of an entry's body text.

    title -- the entry title, used as a last-resort separator
    text  -- the raw body text of the entry

    Returns the cleaned description, or '' when text is shorter than two
    characters or no separator between title and description is found.
    """
    if len(text) < 2:
        return ''

    description = None
    # usually in the format (title : text),
    # sometimes in the format (title[blurb] text),
    # sometimes in the format (title adj. text)
    for separator in (' : ', '] ', ' adj. '):
        # Cut at the FIRST occurrence only and keep everything after it.
        # The original used an unbounded split() and kept element 1,
        # which truncated the description at the second occurrence.
        _, found, remainder = text.partition(separator)
        if found:
            description = remainder
            break

    # if all else fails look for the second instance of the title text
    # (an empty title cannot be used as a separator: split('') raises)
    if description is None and title:
        pieces = text.split(title, 2)
        if len(pieces) >= 3:
            # Everything after the second occurrence, with any later
            # occurrences of the title left intact. The original
            # re-joined the pieces with spaces and lost the title word.
            description = pieces[2]

    if description is None:
        return ''

    # drop whatever follows the final full stop
    lastStop = description.rfind('.')
    if lastStop != -1:
        description = description[:lastStop + 1]

    description = description.replace(' . ', '. ')
    description = description.replace(' .', '. ')
    # keep printable characters only; a join over a generator works on both
    # Python 2 and 3, whereas filter() only returns a str on Python 2
    description = ''.join(ch for ch in description if ch in string.printable)
    # collapse runs of spaces (the original replace(' ', ' ') was a no-op)
    description = ' '.join(word for word in description.split(' ') if word)
    return description.strip()
2014-04-06 11:06:23 +01:00
2014-04-26 14:04:57 +01:00
def validTitle(title):
    """Return True when title should be imported as an entry.

    Rejected: empty titles, titles containing '\\xc2', and titles that
    start with "Letters", "Comments" or "Glossary".
    """
    # Truthiness instead of `title is ''`: identity comparison against a
    # string literal only works by accident of interning.
    if not title:
        return False
    # NOTE(review): '\xc2' is presumably the lead byte of UTF-8 encoded
    # non-ASCII text in Python 2 byte strings -- confirm.
    if '\xc2' in title:
        return False
    return not title.startswith(("Letters", "Comments", "Glossary"))
# remove any invalid characters from an entry title
# so that it can be saved in a filename
def jargonSaneTitle(title):
    """Return title with every '/' replaced by '-'."""
    return title.replace('/', '-')
2014-04-26 16:54:15 +01:00
# limit line lengths so that entries are more readable
def jargonPageColumns(text, columns):
    """Re-wrap text so that lines break before reaching `columns`.

    text    -- the text to wrap; words are separated by single spaces
    columns -- a word starts a new line when appending it (plus its
               separating space) would reach this count

    Returns the wrapped text; every line, including the last, ends
    with a newline. A single word longer than the limit is placed on a
    line of its own.
    """
    finished = []
    line = ''
    lineLen = 0
    for word in text.split(' '):
        if lineLen + len(word) + 1 >= columns:
            # Only flush a line that actually holds something: the
            # original emitted a blank line whenever an over-long word
            # arrived while the current line was still empty.
            if line:
                finished.append(line)
            line = ''
            lineLen = 0
        lineLen = lineLen + len(word) + 1
        if line == '':
            line = word
        else:
            line = line + ' ' + word
    finished.append(line)
    return '\n'.join(finished) + '\n'
2014-04-26 14:04:57 +01:00
def jargonCreateEntry(title, text, outputDir):
    """Write one entry to <outputDir>/<sanitised title>.txt.

    title     -- entry title; written as the first line of the file
    text      -- entry description; wrapped with jargonPageColumns(text, 78)
    outputDir -- directory that receives the file

    Returns the path of the file written, or '' when a file with that
    name already exists (existing files are never overwritten).
    """
    # create the filename for the entry
    filename = outputDir
    if not outputDir.endswith('/'):
        filename = filename + '/'
    filename = filename + jargonSaneTitle(title) + '.txt'

    # don't overwrite existing files
    if os.path.isfile(filename):
        return ''

    text = jargonPageColumns(text, 78)

    # The original ended with `fp.close` (no parentheses), which never
    # closed the file; the with-block guarantees it is closed.
    with open(filename, 'w') as fp:
        fp.write(title + '\n\n' + text + '\n')
    return filename
2014-04-26 15:11:49 +01:00
def jargonReadFile(filename, exclusions, outputDir):
    """Parse one HTML file and, if it holds a usable entry, save it.

    filename   -- path of the HTML file to read
    exclusions -- collection of titles (raw or sanitised) to skip
    outputDir  -- directory passed on to jargonCreateEntry

    Prints the path of each entry file that gets created.
    """
    # Read the file in one go and close it; the original never closed
    # the handle and concatenated the content line by line.
    with open(filename) as inFile:
        contents = inFile.read()

    parser = JargonParser()
    parser.feed(contents)

    title = parser.title
    # Compare with == / != : the original used `is not ''`, an identity
    # test against a string literal.
    if not validTitle(title) or parser.bodyText == '' or len(title) <= 1:
        return
    if jargonSaneTitle(title) in exclusions or title in exclusions:
        return

    saneBodyText = jargonSaneText(title, parser.bodyText)
    entryFilename = jargonCreateEntry(title, saneBodyText, outputDir)
    if entryFilename != '':
        print(entryFilename)
2014-04-26 15:11:49 +01:00
# read original jargon file entries to be excluded
def jargonReadExclusions(filename):
    """Return the lines of the exclusions file without their newlines.

    An empty filename, or a path that is not an existing file, yields
    an empty list.
    """
    if len(filename) == 0 or not os.path.isfile(filename):
        return []
    with open(filename) as handle:
        return [entry.strip('\n') for entry in handle]
2014-04-06 11:06:23 +01:00
2014-04-26 14:04:57 +01:00
def jargonImport(rootDir, excludeEntriesFilename, outputDir):
    """Walk rootDir and feed every file found to jargonReadFile.

    rootDir                -- top of the directory tree to walk
    excludeEntriesFilename -- file read by jargonReadExclusions
    outputDir              -- directory handed to jargonReadFile
    """
    skipped = jargonReadExclusions(excludeEntriesFilename)
    for folder, _subfolders, names in os.walk(rootDir):
        for name in names:
            jargonReadFile('/'.join((folder, name)), skipped, outputDir)
2014-04-06 11:06:23 +01:00
# when run as a script, import using the hard-coded relative paths below
if __name__ == "__main__":
    jargonImport(rootDir='../original',
                 excludeEntriesFilename='exclusions.txt',
                 outputDir='../entries')