Skip to content

Instantly share code, notes, and snippets.

@hudsonsferreira
Forked from waldofe/eisparser.py
Created September 13, 2012 19:14
Show Gist options
  • Select an option

  • Save hudsonsferreira/3716868 to your computer and use it in GitHub Desktop.

Select an option

Save hudsonsferreira/3716868 to your computer and use it in GitHub Desktop.
A tool to parse eis pattern content from XML documents.
import re
import zipfile
from os import system
from os.path import join, basename, dirname

from nltk.corpus.reader import PlaintextCorpusReader
from nltk.util import clean_html
class EisParser(object):
    """Extract readable text from the ``content.xml`` member of an ODT file.

    The document is treated as a zip archive: ``content.xml`` is extracted
    next to the document, its markup is stripped with ``clean_html`` and
    every "<word> <ISO timestamp> " fragment is removed from the result.
    """

    # One word, a space, a ``YYYY-MM-DDTHH:MM:SS`` timestamp and a trailing
    # space. Presumably annotation/change metadata -- confirm with real input.
    _STAMP_PATTERN = r'\w+ \d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2} '

    def __init__(self, path):
        """Remember the file name and the directory of the document at *path*."""
        self._name = basename(path)
        self._path = dirname(path)

    def _workdir(self):
        """Return the document's directory, or ``'.'`` for a bare file name."""
        # dirname('doc.odt') is '', which must mean "current directory"
        # rather than being glued into '/doc.odt'.
        return self._path or '.'

    def _unzip_odt(self):
        """Extract ``content.xml`` from the document into its own directory.

        Uses the standard-library ``zipfile`` module instead of shelling out
        to ``unzip``: no quoting problems with spaces or shell metacharacters
        in the path, no interactive overwrite prompt, and failures are raised
        instead of being silently ignored.

        Raises whatever ``zipfile`` raises when the archive is missing, is
        not a zip file, or has no ``content.xml`` member.
        """
        archive = join(self._path, self._name)
        with zipfile.ZipFile(archive) as odt:
            # An existing content.xml from a previous run is overwritten.
            odt.extract('content.xml', self._workdir())

    def legible_text(self):
        """Return the document text with markup and timestamp fragments removed.

        Side effects: writes ``content.xml`` into the document's directory and
        stores the intermediate results on ``_raw_content_text``,
        ``_cleaned_text`` and ``content``.
        """
        self._unzip_odt()
        reader = PlaintextCorpusReader(self._workdir(), 'content.xml')
        self._raw_content_text = reader.raw()
        # NOTE(review): nltk.util.clean_html is reported to raise
        # NotImplementedError in NLTK 3.x -- confirm the NLTK version in use.
        self._cleaned_text = clean_html(self._raw_content_text)
        self.content = re.sub(self._STAMP_PATTERN, '', self._cleaned_text)
        return self.content
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment