-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathparse_transcript.py
More file actions
executable file
·84 lines (68 loc) · 2.48 KB
/
parse_transcript.py
File metadata and controls
executable file
·84 lines (68 loc) · 2.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/bin/env python
"""
Parse the transcript file for a specified character's lines and write sentence/response pairs, in preparation for generating data to feed to the RNN models.
"""
import os
import re
import unicodedata
import codecs
from io import open
import itertools
import csv
#Matches one bracketed stage direction such as "[laughs]" (non-greedy, within a single line)
_STAGE_DIRECTION = re.compile(r"\[.*?\]")


#Strip surrounding whitespace and bracketed stage directions from one piece of dialog
def _cleanDialog(text):
    return _STAGE_DIRECTION.sub("", text.strip()).strip()


#Function to extract sentence/response pairs for a specified character
def loadLinePairs(filename, character, fields, character_lines):
    """Build sentence/response pairs from a transcript and write them as tab-separated rows.

    Each transcript line is expected to look like ``<speaker>: <text>``. Lines
    are split on ":" into ``fields`` (which must include "characterID" and
    "text"); lines without a colon, or with fewer parts than ``fields``, are
    skipped.

    Every pair of consecutive parsed lines is written once as
    ``[sentence, response]``. When the response was spoken by ``character``
    (or by the hard-coded alias "Squidward Tentacles"), the same pair is
    written a second time so that character's replies are over-represented.

    Args:
        filename: Path of the transcript file to read.
        character: Speaker name whose responses are duplicated in the output.
        fields: Names assigned, in order, to the colon-separated parts of a line.
        character_lines: Path of the tab-delimited output file to (over)write.

    Returns:
        None. The result is the file written at ``character_lines``.
    """
    # Split at most len(fields) - 1 times so the last field keeps the rest of
    # the line; colons inside the spoken text (e.g. "3:00") are then preserved
    # instead of truncating the dialog.
    maxSplits = max(len(fields) - 1, 1)

    #Read all lines from transcript file
    lines = []
    with open(filename, 'r') as f:
        for rawLine in f:
            values = rawLine.split(":", maxSplits)
            # No colon at all, or not enough parts to fill every field
            if len(values) < 2 or len(values) < len(fields):
                continue
            lines.append(dict(zip(fields, values)))

    #Parse specified character responses from file and format
    linePairs = []
    for firstLine, secondLine in zip(lines, lines[1:]):
        pair = [_cleanDialog(firstLine["text"]), _cleanDialog(secondLine["text"])]
        #Adding all character dialog lines to aid in training
        #Remove this line to only train with specified character's lines
        linePairs.append(pair)
        #Append Squidward (or specified character) lines a second time
        if secondLine["characterID"].strip() in (character, "Squidward Tentacles"):
            linePairs.append(list(pair))

    #Write sentence response pairs to csv file
    # newline='' lets the csv writer control line endings, as the csv docs require
    with open(character_lines, 'w', newline='') as f:
        writer = csv.writer(f, delimiter='\t', lineterminator='\n')
        writer.writerows(linePairs)
def printLines(file, n=10):
    """Print the first ``n`` lines of ``file`` as raw bytes objects.

    The file is opened in binary mode, so each line is printed as a bytes
    repr (``b'...'``) including its trailing newline byte.
    """
    with open(file, 'rb') as handle:
        head = handle.readlines()[:n]
    for rawLine in head:
        print(rawLine)
#Entry point: extract Squidward's sentence/response pairs into squid_linePairs.txt and preview them
def main():
    """Run the transcript parsing pipeline with the script's fixed settings.

    Reads "compiled_transcripts.txt", writes the sentence/response pairs to
    "squid_linePairs.txt" via loadLinePairs, then prints the first lines of
    that output file via printLines.
    """
    transcriptPath = os.path.join("compiled_transcripts.txt")
    outputPath = "squid_linePairs.txt"
    pairFields = ["characterID", "text"]
    loadLinePairs(transcriptPath, "Squidward", pairFields, outputPath)
    printLines(outputPath)
#Run the pipeline only when this file is executed as a script, not when it is imported
if __name__ == "__main__":
    main()