bulbil · July 28, 2021 18:36
diff --git a/process.py b/process.py
 # -*- coding: utf-8 -*-

 """
 - processes Panopto automated text transcription output .txt files to timestamped html
 - for Omeka theme-foundation-ecr (@swat-ds) designed for Swarthmore Civil Rights 1960-1966 oral history project site

 How to use?

 1. Copy script inside target folder and navigate into target folder
 2. Ensure script has execute permissions
 3. Adjust the target file suffix (`docx.txt`) in the script if needed
 4. Execute `python process.py` 

 Example source Panopto file format:

 `
 10
 00:00:57,200 --> 00:01:04,070
 Do you remember that event? Oh, yes, yes. I drove down there with maybe four other people.

 11
 00:01:04,070 --> 00:01:09,980
 They worked from Swarthmore. I was sort of the representative from from spek.
 `

 Example Omeka theme compatible HTML fragment output:

 `
 <p data-timestart="2409450" data-timestop="2415600">Well, thank you so much. This is this has been great. Thank you for your time and for sharing your memories with us.</p>
 <p data-timestart="2415600" data-timestop="2420610">This is really well. And I enjoyed talking to you. And thanks for asking good questions.</p>
 <p data-timestart="2420610" data-timestop="2424120">Thank you, Tom. Thank you very much again for your time. Okay. Thank you.</p>
 `

 """

 import os, re

 # Suffix that marks a Panopto transcript export in the working folder.
 # Change this one value to target differently named files.
 TRANSCRIPT_SUFFIX = "docx.txt"

 # A cue-number line holds digits only. The digit count is not capped, so
 # transcripts with 1000 or more cues are still split correctly.
 _CUE_NUMBER = re.compile(r"\d+")

 # One "HH:MM:SS,mmm" timestamp; "." is also accepted before the milliseconds.
 _TIMESTAMP = re.compile(r"(\d+):(\d+):(\d+)[,.](\d+)")


 def getMilliseconds(timestamp):
     """Convert an (hours, minutes, seconds, milliseconds) sequence to ms.

     Each item may be an int or a numeric string such as "07"; only the
     first four items are used.
     """
     hours, minutes, seconds, millis = (int(part) for part in timestamp[:4])
     return ((hours * 60 + minutes) * 60 + seconds) * 1000 + millis


 def _parse_sections(lines):
     """Split stripped transcript lines into timed sections.

     A cue starts at a digits-only line that is followed, one or two lines
     later, by a line holding two timestamps. Looking for the timing line
     instead of assuming a fixed offset lets both single-spaced exports and
     exports with a blank line after every line be read.

     Returns a list of dicts with the keys 'timestart', 'timestop' (both in
     milliseconds) and 'text'.
     """
     cues = []  # (cue-number line, timing line, start ms, stop ms)
     for position, line in enumerate(lines):
         stamps = _TIMESTAMP.findall(line)
         if len(stamps) < 2:
             continue
         # The nearest preceding digits-only line is the cue number.
         for number_line in (position - 1, position - 2):
             if number_line >= 0 and _CUE_NUMBER.fullmatch(lines[number_line]):
                 cues.append((number_line, position,
                              getMilliseconds(stamps[0]),
                              getMilliseconds(stamps[1])))
                 break

     sections = []
     for order, (_, timing_line, start, stop) in enumerate(cues):
         # A cue's text runs until the next cue number, or to the end of file.
         end = cues[order + 1][0] if order + 1 < len(cues) else len(lines)
         pieces = [text.replace('\\', ' ')
                   for text in lines[timing_line + 1:end] if text]
         # Join with a space so words on separate lines do not run together.
         sections.append({'timestart': start,
                          'timestop': stop,
                          'text': ' '.join(pieces)})
     return sections


 def _render_html(sections):
     """Return one <p> element per section, each on its own line.

     Attribute values are quoted, as in the documented output fragment.
     """
     template = '<p data-timestart="{timestart}" data-timestop="{timestop}">{text}</p>\n'
     return ''.join(template.format(**section) for section in sections)


 def main():
     """Convert every matching transcript in the current directory to HTML."""
     for name in os.listdir():
         if not name.endswith(TRANSCRIPT_SUFFIX):
             continue
         with open(name, 'r') as source:
             lines = [raw.strip() for raw in source]
         # Drop the suffix plus the one separator character in front of it
         # ("talk.docx.txt" -> "talk"), which keeps existing output names.
         stem = name[:-(len(TRANSCRIPT_SUFFIX) + 1)]
         with open(stem + '.html', 'w') as target:
             target.write(_render_html(_parse_sections(lines)))


 if __name__ == "__main__":
     main()
	# -*- coding: utf-8 -*-

	"""
	- processes Panopto automated text transcription output .txt files to timestamped html
	- for Omeka theme-foundation-ecr (@swat-ds) designed for Swarthmore Civil Rights 1960-1966 oral history project site

	How to use?

	1. Copy script inside target folder and navigate into target folder
	2. Ensure script has execute permissions
	3. Adjust the target file suffix (`docx.txt`) in the script if needed
	4. Execute `python process.py`

	Example source Panopto file format:

	`
	10
	00:00:57,200 --> 00:01:04,070
	Do you remember that event? Oh, yes, yes. I drove down there with maybe four other people.

	11
	00:01:04,070 --> 00:01:09,980
	They worked from Swarthmore. I was sort of the representative from from spek.
	`

	Example Omeka theme compatible HTML fragment output:

	`
	<p data-timestart="2409450" data-timestop="2415600">Well, thank you so much. This is this has been great. Thank you for your time and for sharing your memories with us.</p>
	<p data-timestart="2415600" data-timestop="2420610">This is really well. And I enjoyed talking to you. And thanks for asking good questions.</p>
	<p data-timestart="2420610" data-timestop="2424120">Thank you, Tom. Thank you very much again for your time. Okay. Thank you.</p>
	`

	"""

	import os, re

	# Suffix that marks a Panopto transcript export in the working folder.
	# Change this one value to target differently named files.
	TRANSCRIPT_SUFFIX = "docx.txt"

	# A cue-number line holds digits only. The digit count is not capped, so
	# transcripts with 1000 or more cues are still split correctly.
	_CUE_NUMBER = re.compile(r"\d+")

	# One "HH:MM:SS,mmm" timestamp; "." is also accepted before the milliseconds.
	_TIMESTAMP = re.compile(r"(\d+):(\d+):(\d+)[,.](\d+)")


	def getMilliseconds(timestamp):
	    """Convert an (hours, minutes, seconds, milliseconds) sequence to ms.

	    Each item may be an int or a numeric string such as "07"; only the
	    first four items are used.
	    """
	    hours, minutes, seconds, millis = (int(part) for part in timestamp[:4])
	    return ((hours * 60 + minutes) * 60 + seconds) * 1000 + millis


	def _parse_sections(lines):
	    """Split stripped transcript lines into timed sections.

	    A cue starts at a digits-only line that is followed, one or two lines
	    later, by a line holding two timestamps. Looking for the timing line
	    instead of assuming a fixed offset lets both single-spaced exports and
	    exports with a blank line after every line be read.

	    Returns a list of dicts with the keys 'timestart', 'timestop' (both in
	    milliseconds) and 'text'.
	    """
	    cues = []  # (cue-number line, timing line, start ms, stop ms)
	    for position, line in enumerate(lines):
	        stamps = _TIMESTAMP.findall(line)
	        if len(stamps) < 2:
	            continue
	        # The nearest preceding digits-only line is the cue number.
	        for number_line in (position - 1, position - 2):
	            if number_line >= 0 and _CUE_NUMBER.fullmatch(lines[number_line]):
	                cues.append((number_line, position,
	                             getMilliseconds(stamps[0]),
	                             getMilliseconds(stamps[1])))
	                break

	    sections = []
	    for order, (_, timing_line, start, stop) in enumerate(cues):
	        # A cue's text runs until the next cue number, or to the end of file.
	        end = cues[order + 1][0] if order + 1 < len(cues) else len(lines)
	        pieces = [text.replace('\\', ' ')
	                  for text in lines[timing_line + 1:end] if text]
	        # Join with a space so words on separate lines do not run together.
	        sections.append({'timestart': start,
	                         'timestop': stop,
	                         'text': ' '.join(pieces)})
	    return sections


	def _render_html(sections):
	    """Return one <p> element per section, each on its own line.

	    Attribute values are quoted, as in the documented output fragment.
	    """
	    template = '<p data-timestart="{timestart}" data-timestop="{timestop}">{text}</p>\n'
	    return ''.join(template.format(**section) for section in sections)


	def main():
	    """Convert every matching transcript in the current directory to HTML."""
	    for name in os.listdir():
	        if not name.endswith(TRANSCRIPT_SUFFIX):
	            continue
	        with open(name, 'r') as source:
	            lines = [raw.strip() for raw in source]
	        # Drop the suffix plus the one separator character in front of it
	        # ("talk.docx.txt" -> "talk"), which keeps existing output names.
	        stem = name[:-(len(TRANSCRIPT_SUFFIX) + 1)]
	        with open(stem + '.html', 'w') as target:
	            target.write(_render_html(_parse_sections(lines)))


	if __name__ == "__main__":
	    main()
No results found