# import/importjargon.py

import os
import string
import HTMLParser, urllib, urlparse

class JargonFile(dict):
    """Dictionary that traces item reads, item writes and updates on stdout.

    Every ``obj[key]`` read prints ``GET <key>``, every ``obj[key] = val``
    prints ``SET <key> <val>`` and every ``update()`` call prints its
    arguments before storing the items one by one through ``__setitem__``.
    Items passed to the constructor are stored by ``dict.__init__`` directly.
    """

    def __init__(self, *arg, **kw):
        # super() has to name this class; any other name is undefined here
        # and would raise NameError on every instantiation.
        super(JargonFile, self).__init__(*arg, **kw)

    def __getitem__(self, key):
        """Return the value stored under key, printing a GET trace line."""
        val = dict.__getitem__(self, key)
        # A single pre-formatted argument prints the same text whether
        # print is a statement (Python 2) or a function (Python 3).
        print('GET %s' % (key,))
        return val

    def __setitem__(self, key, val):
        """Store val under key, printing a SET trace line first."""
        print('SET %s %s' % (key, val))
        dict.__setitem__(self, key, val)

    def __repr__(self):
        """Return '<ClassName>(<plain dict repr>)'."""
        dictrepr = dict.__repr__(self)
        return '%s(%s)' % (type(self).__name__, dictrepr)

    def update(self, *args, **kwargs):
        """Update like dict.update(), but route each item via __setitem__."""
        print('update %s %s' % (args, kwargs))
        for k, v in dict(*args, **kwargs).items():
            self[k] = v

class JargonParser(HTMLParser.HTMLParser):
    """HTML parser that collects a page title and its flattened body text.

    After feed(), ``title`` holds the stripped text of the last data chunk
    seen while the current section contained "head", and ``bodyText`` holds
    the data chunks seen while the current section contained "body", each
    whitespace-normalised and followed by a single space.
    """

    def __init__(self):
        # Explicit base-class call, exactly as the parser is initialised
        # everywhere else in this file.
        HTMLParser.HTMLParser.__init__(self)
        self.title = ''
        self.bodyText = ''
        self.currentSection = ''
        self.seen = {}

    def handle_starttag(self, tag, attributes):
        """Remember the tag as the current section if it names head or body.

        The test is a substring match, so any tag whose name contains
        "head" or "body" switches the section.
        """
        if any(marker in tag for marker in ("head", "body")):
            self.currentSection = tag

    def handle_data(self, data):
        """Record a chunk of text according to the current section."""
        section = self.currentSection

        if "head" in section:
            # store the title; a new title also starts a fresh body
            self.title = data.strip()
            self.bodyText = ''
            return

        if "body" not in section:
            return

        # Replace runs of spaces and control whitespace with one space;
        # the order (longest run first) matters for the result.
        for clutter in ('    ', '   ', '  ', '\t', '\r', '\n'):
            data = data.replace(clutter, ' ')
        self.bodyText += data.strip() + ' '

# Further sanitise the returned text
def jargonSaneText(title, text):
    """Extract the description of an entry from its flattened page text.

    title -- entry title, used as a last-resort separator
    text  -- page text to search

    Returns the description cut after its last full stop, with the spacing
    around full stops tidied and non-printable characters removed. Returns
    '' when text is shorter than two characters or no separator is found.
    """
    if len(text) < 2:
        return ''

    description = None

    # usually in the format (title : text), sometimes (title[blurb] text)
    # or (title adj. text); the first separator that occurs in text is used
    for separator in (' : ', '] ', ' adj. '):
        pieces = text.split(separator)
        if len(pieces) >= 2:
            # only the piece between the first and second occurrence of the
            # separator is kept
            description = pieces[1]
            break

    if description is None:
        # if all else fails look for the second instance of the title text
        pieces = text.split(title)
        if len(pieces) < 3:
            return ''
        # Keep everything after the second occurrence: the remaining pieces
        # are joined with a space, and a piece that is a lone space is
        # replaced by the title itself.
        description = ' '.join(
            [title if piece == ' ' else piece for piece in pieces[2:]])

    # drop whatever trails the last full stop
    lastStop = description.rfind('.')
    if lastStop != -1:
        description = description[:lastStop + 1]

    description = description.replace(' . ', '. ')
    description = description.replace(' .', '. ')
    description = description.replace('  ', ' ')

    # join() yields a string on both Python 2 and Python 3; a bare filter()
    # returns an iterator on Python 3, which has no strip() method.
    description = ''.join(
        [ch for ch in description if ch in string.printable])

    return description.strip()

def validTitle(title):
    """Return True if title should be imported as an entry.

    A title is rejected when it is empty, when it contains the character
    '\\xc2', or when it starts with "Letters", "Comments" or "Glossary".
    """
    # Truthiness instead of "is ''": identity against a string literal only
    # works by accident of interning.
    if not title:
        return False

    # NOTE(review): presumably this catches titles holding UTF-8 encoded
    # non-ASCII characters (0xC2 is a common lead byte) - confirm.
    if '\xc2' in title:
        return False

    return not title.startswith(("Letters", "Comments", "Glossary"))

# remove any invalid characters from an entry title
# so that it can be saved in a filename
def jargonSaneTitle(title):
    """Return title with every '/' replaced by '-'."""
    return title.replace('/', '-')

# limit line lengths so that entries are more readable
def jargonPageColumns(text, columns):
    """Re-wrap text so that lines stay below the given column count.

    text    -- text to wrap; it is split on single spaces
    columns -- column limit; a line is broken before the word that would
               bring its running length (each word counted with one extra
               character) up to this limit

    Returns the wrapped text, with every line terminated by a newline.
    A word that is longer than the limit is placed on a line of its own.
    """
    wrapped = []
    line = ''
    lineLen = 0
    for word in text.split(' '):
        # Only break when there is something to flush; otherwise an
        # over-long first word would produce a spurious empty line.
        if line and lineLen + len(word) + 1 >= columns:
            wrapped.append(line)
            line = ''
            lineLen = 0
        lineLen += len(word) + 1
        if line == '':
            line = word
        else:
            line = line + ' ' + word

    wrapped.append(line)
    return '\n'.join(wrapped) + '\n'

def jargonCreateEntry(title, text, outputDir):
    """Write an entry to a text file inside outputDir.

    title     -- entry title; also used (sanitised) as the file name
    text      -- entry description; wrapped to 78 columns before writing
    outputDir -- directory that receives the file

    Returns the name of the file written, or '' if a file with that name
    already exists (existing entries are never overwritten).
    """
    # create the filename for the entry
    filename = outputDir
    if not outputDir.endswith('/'):
        filename = filename + '/'
    filename = filename + jargonSaneTitle(title) + '.txt'

    # don't overwrite existing files
    if os.path.isfile(filename):
        return ''

    text = jargonPageColumns(text, 78)

    # The with block guarantees the file is flushed and closed, even if
    # the write fails.
    with open(filename, 'w') as fp:
        fp.write(title + '\n\n' + text + '\n')
    return filename

def jargonReadFile(filename, exclusions, outputDir):
    """Parse one HTML page and, if it is a usable entry, write it out.

    filename   -- path of the HTML file to read
    exclusions -- collection of titles (raw or sanitised) to skip
    outputDir  -- directory that receives the generated entry file

    Prints the name of the entry file when one is created. Nothing is
    written when the title is invalid or a single character, the page has
    no body text, the title is excluded, or the entry file already exists.
    """
    # Read the whole page in one go and release the handle straight away;
    # this function is called once per file of a directory walk.
    with open(filename) as inFile:
        pageSource = inFile.read()

    parser = JargonParser()
    parser.feed(pageSource)

    title = parser.title
    if not validTitle(title) or parser.bodyText == '' or len(title) <= 1:
        return

    # Check exclusions before the text clean-up so that excluded pages
    # cost nothing further.
    if jargonSaneTitle(title) in exclusions or title in exclusions:
        return

    saneBodyText = jargonSaneText(title, parser.bodyText)
    entryFilename = jargonCreateEntry(title, saneBodyText, outputDir)
    if entryFilename != '':
        # single-argument form prints identically on Python 2 and 3
        print(entryFilename)

# read original jargon file entries to be excluded
def jargonReadExclusions(filename):
    """Return the lines of the exclusions file as a list of titles.

    filename -- path of the exclusions file, one title per line

    Returns [] when filename is empty or does not name an existing file.
    Only newline characters are stripped from the ends of each line.
    """
    if len(filename) == 0 or not os.path.isfile(filename):
        return []

    with open(filename) as fp:
        rawLines = fp.readlines()

    return [rawLine.strip('\n') for rawLine in rawLines]

def jargonImport(rootDir, excludeEntriesFilename, outputDir):
    """Import every file found below rootDir as an entry in outputDir.

    rootDir                -- directory tree walked for input files
    excludeEntriesFilename -- file listing titles to skip, one per line
    outputDir              -- directory that receives the entry files
    """
    skipTitles = jargonReadExclusions(excludeEntriesFilename)

    for folder, _subfolders, names in os.walk(rootDir):
        for name in names:
            jargonReadFile(folder + '/' + name, skipTitles, outputDir)

# When run as a script: import the files under ../original into ../entries,
# skipping the titles listed in exclusions.txt.
if __name__ == "__main__":
   jargonImport('../original','exclusions.txt','../entries')
Import html 2014-04-06 11:06:23 +01:00			`import os`
More parsing 2014-04-26 14:04:57 +01:00			`import string`
Beginning of jargon parser 2014-04-06 13:23:46 +01:00			`import HTMLParser, urllib, urlparse`

Beginning of dictionary 2014-04-06 16:16:39 +01:00			`class JargonFile(dict):`
			`def __init__(self,arg,*kw):`
			`super(CustomDictOne, self).__init__(arg, *kw)`

			`def __getitem__(self, key):`
			`val = dict.__getitem__(self, key)`
			`print 'GET', key`
			`return val`

			`def __setitem__(self, key, val):`
			`print 'SET', key, val`
			`dict.__setitem__(self, key, val)`

			`def __repr__(self):`
			`dictrepr = dict.__repr__(self)`
			`return '%s(%s)' % (type(self).__name__, dictrepr)`

			`def update(self, args, *kwargs):`
			`print 'update', args, kwargs`
			`for k, v in dict(args, *kwargs).iteritems():`
			`self[k] = v`

Beginning of jargon parser 2014-04-06 13:23:46 +01:00			`class JargonParser(HTMLParser.HTMLParser):`
			`def __init__ (self):`
			`HTMLParser.HTMLParser.__init__ (self)`
			`self.seen = {}`
			`self.currentSection=''`
			`self.title = ''`
More text sanitation 2014-04-06 15:42:45 +01:00			`self.bodyText = ''`
Beginning of jargon parser 2014-04-06 13:23:46 +01:00			`def handle_data(self, data):`
More text sanitation 2014-04-06 15:42:45 +01:00			`if "head" in self.currentSection:`
			`# store the title`
More parsing 2014-04-26 14:04:57 +01:00			`self.title = data.strip()`
More text sanitation 2014-04-06 15:42:45 +01:00			`self.bodyText = '';`
			`elif "body" in self.currentSection:`
			`replacements = [' ',' ',' ','\t','\r','\n']`
			`for rep in replacements:`
			`data = data.replace(rep,' ')`
			`data = data.strip()`
			`self.bodyText = self.bodyText + data + ' '`
Beginning of jargon parser 2014-04-06 13:23:46 +01:00			`def handle_starttag(self, tag, attributes):`
			`if "head" in tag or "body" in tag:`
			`self.currentSection = tag;`
More text sanitation 2014-04-06 15:42:45 +01:00
More parsing 2014-04-26 14:04:57 +01:00			`# Further sanitise the returned text`
			`def jargonSaneText(title, text):`
More text sanitation 2014-04-06 15:42:45 +01:00			`if len(text) < 2:`
			`return ''`

More parsing 2014-04-26 14:04:57 +01:00			`# usually in the format (title : text)`
More text sanitation 2014-04-06 15:42:45 +01:00			`initsplit = text.split(' : ')`
			`if len(initsplit) < 2:`
More parsing 2014-04-26 14:04:57 +01:00			`# sometimes in the format (title[blurb] text)`
			`initsplit = text.split('] ')`
			`if len(initsplit) < 2:`
			`# sometimes in the format (title adj. text)`
			`initsplit = text.split(' adj. ')`

			`# is all else fails look for the second instance of the title text`
			`if len(initsplit) < 2:`
			`testsplit = text.split(title)`
			`if len(testsplit) >= 3:`
			`initsplit = testsplit`
			`initsplit[1] = ''`
			`testsplitctr = 0`
			`for txt in testsplit:`
			`if txt == ' ':`
			`txt = title`
			`if testsplitctr >= 2:`
			`if testsplitctr >= 3:`
			`initsplit[1] = initsplit[1] + ' '`
			`initsplit[1] = initsplit[1] + txt`
			`testsplitctr = testsplitctr + 1`

			`if len(initsplit) < 2:`
			`return ''`
More text sanitation 2014-04-06 15:42:45 +01:00
More parsing 2014-04-26 14:04:57 +01:00			`# get the second part of the split array (i.e. the description text)`
			`text = initsplit[1]`
More text sanitation 2014-04-06 15:42:45 +01:00
			`sentsplit = text.split('.')`
			`if len(sentsplit) > 1:`
			`ctr = 0`
			`newtext = ''`
			`for sent in sentsplit:`
			`if ctr < len(sentsplit)-1:`
			`newtext = newtext + sent + '.'`
			`ctr = ctr + 1`
			`text = newtext`

Beginning of dictionary 2014-04-06 16:16:39 +01:00			`text = text.replace(' . ','. ')`
			`text = text.replace(' .','. ')`
			`text = text.replace(' ',' ')`
More parsing 2014-04-26 14:04:57 +01:00			`text = filter(lambda x: x in string.printable, text)`
Beginning of dictionary 2014-04-06 16:16:39 +01:00
			`return text.strip()`
Import html 2014-04-06 11:06:23 +01:00
More parsing 2014-04-26 14:04:57 +01:00			`def validTitle(title):`
			`if title is '':`
			`return False`

			`if '\xc2' in title:`
			`return False`

			`if title.startswith("Letters"):`
			`return False`

			`if title.startswith("Comments"):`
			`return False`

			`if title.startswith("Glossary"):`
			`return False`

			`return True`

			`# remove any invalid characters from an entry title`
			`# so thst it can be saved in a filename`
			`def jargonSaneTitle(title):`
			`if '/' in title:`
			`title = title.replace('/','-')`
			`return title`

Limit line lengths 2014-04-26 16:54:15 +01:00			`# limit line lengths so that entries are more readable`
			`def jargonPageColumns(text, columns):`
			`words = text.split(' ')`
			`lineLen = 0`
			`line = ''`
			`result = ''`
			`for word in words:`
			`if lineLen + len(word) + 1 >= columns:`
			`result = result + line + '\n'`
			`line = ''`
			`lineLen = 0`
			`lineLen = lineLen + len(word) + 1`
			`if line == '':`
			`line = word`
			`else:`
			`line = line + ' ' + word`

			`return result + line + '\n'`

More parsing 2014-04-26 14:04:57 +01:00			`def jargonCreateEntry(title, text, outputDir):`
			`# create the filename for the entry`
			`filename = outputDir`
			`if not outputDir.endswith('/'):`
			`filename = filename + '/'`
Fix exclusions 2014-04-26 15:27:41 +01:00			`filename = filename + jargonSaneTitle(title) + '.txt'`
More parsing 2014-04-26 14:04:57 +01:00
			`# don't overwrite existing files`
			`if os.path.isfile(filename):`
			`return ''`

Limit line lengths 2014-04-26 16:54:15 +01:00			`text = jargonPageColumns(text, 78)`

More parsing 2014-04-26 14:04:57 +01:00			`fp = open(filename, 'w')`
			`fp.write(title + '\n\n' + text + '\n')`
			`fp.close`
			`return filename`

Allow an exclusions file 2014-04-26 15:11:49 +01:00			`def jargonReadFile(filename, exclusions, outputDir):`
Import html 2014-04-06 11:06:23 +01:00			`inFile = open(filename)`
			`buffer = ""`
			`for line in inFile:`
			`buffer = buffer + line`
Beginning of jargon parser 2014-04-06 13:23:46 +01:00			`parser = JargonParser()`
Import html 2014-04-06 11:06:23 +01:00			`parser.feed(buffer)`
More parsing 2014-04-26 14:04:57 +01:00			`if validTitle(parser.title) and \`
More text sanitation 2014-04-06 15:42:45 +01:00			`parser.bodyText is not '' and \`
			`len(parser.title) > 1:`
More parsing 2014-04-26 14:04:57 +01:00			`saneBodyText = jargonSaneText(parser.title, parser.bodyText)`
Fix exclusions 2014-04-26 15:27:41 +01:00			`if not ((jargonSaneTitle(parser.title) in exclusions) or \`
			`(parser.title in exclusions)):`
Only display filename if imported 2014-04-26 15:14:16 +01:00			`entryFilename = jargonCreateEntry(parser.title, saneBodyText, outputDir)`
			`if entryFilename is not '':`
			`print entryFilename`
Allow an exclusions file 2014-04-26 15:11:49 +01:00
			`# read original jargon file entries to be excluded`
			`def jargonReadExclusions(filename):`
			`if len(filename) == 0:`
			`return []`

			`if not os.path.isfile(filename):`
			`return []`

			`exclusions = []`
			`with open(filename) as fp:`
			`exclusions = fp.readlines()`
			`fp.close()`

Fix exclusions 2014-04-26 15:27:41 +01:00			`tempExclusions = []`
			`for i in range(len(exclusions)):`
			`tempExclusions.append(exclusions[i].strip('\n'))`
			`exclusions = tempExclusions`

Allow an exclusions file 2014-04-26 15:11:49 +01:00			`return exclusions`
Import html 2014-04-06 11:06:23 +01:00
More parsing 2014-04-26 14:04:57 +01:00			`def jargonImport(rootDir, excludeEntriesFilename, outputDir):`
Allow an exclusions file 2014-04-26 15:11:49 +01:00			`exclusions = jargonReadExclusions(excludeEntriesFilename)`

			`for dirName, subdirList, fileList in os.walk(rootDir):`
			`for filename in fileList:`
			`jargonReadFile(dirName + '/' + filename, exclusions, outputDir)`
Import html 2014-04-06 11:06:23 +01:00
			`if __name__ == "__main__":`
Fix exclusions 2014-04-26 15:27:41 +01:00			`jargonImport('../original','exclusions.txt','../entries')`