-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_clone_preprocessing.py
More file actions
44 lines (38 loc) · 1.47 KB
/
data_clone_preprocessing.py
File metadata and controls
44 lines (38 loc) · 1.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import json
import os

import git
import pandas as pd
# Fetch the raw BEGIN and HaluEval repositories into data/origin_data/.
# The target directory is created first, and a repository whose folder is
# already present is skipped, so re-running the script does not fail on
# `git clone` refusing to write into an existing destination.
# NOTE(review): an interrupted clone leaves a folder behind and will be
# skipped too; delete the folder to force a fresh clone.
origin_data_dir = "data/origin_data/"
os.makedirs(origin_data_dir, exist_ok=True)
for repo_url in (
    "https://github.com/google/BEGIN-dataset.git",
    "https://github.com/RUCAIBox/HaluEval.git",
):
    # `git clone` names the checkout after the URL's last component
    # without the ".git" suffix.
    repo_name = os.path.splitext(os.path.basename(repo_url))[0]
    if not os.path.isdir(os.path.join(origin_data_dir, repo_name)):
        git.Git(origin_data_dir).clone(repo_url)
# Locations of the BEGIN dataset TSV files inside the cloned repository.
# Every subset sits in its own folder and uses a short tag in its file name.
_BEGIN_ROOT = "data/origin_data/BEGIN-dataset"
_BEGIN_SUBSETS = (
    ("topicalchat", "tc"),
    ("cmu-dog", "cmu"),
    ("wow", "wow"),
)

# Development split: one TSV per subset.
begin_dev_paths = [
    f"{_BEGIN_ROOT}/{folder}/begin_dev_{tag}.tsv"
    for folder, tag in _BEGIN_SUBSETS
]

# Test split: one TSV per subset, in the same subset order.
begin_test_paths = [
    f"{_BEGIN_ROOT}/{folder}/begin_test_{tag}.tsv"
    for folder, tag in _BEGIN_SUBSETS
]
def load_begin_csv(paths):
    """Load BEGIN-style TSV files into one flat list of example dicts.

    Args:
        paths: Iterable of paths to tab-separated files whose header row
            contains at least the columns ``knowledge``, ``message``,
            ``response`` and ``begin_label``.

    Returns:
        A list with one dict per data row, in file order and then row
        order. Each dict has the keys ``input`` (message),
        ``system_output`` (response), ``grounding`` (knowledge),
        ``label`` (begin_label) and ``index`` (0-based position of the
        example in the returned list, counted across all files).

    Raises:
        KeyError: If a file lacks one of the required columns.
    """
    data = []
    for path in paths:
        print(path)
        # Missing cells are parsed as NaN; turn them into empty strings.
        # NOTE(review): pandas' default NA parsing also treats literal cell
        # text such as "NA" or "null" as missing, so those become '' too.
        # Pass keep_default_na=False to read_csv if that text must survive.
        df = pd.read_csv(path, sep='\t', header=0).fillna('')
        # Walk the four columns in lockstep instead of building a row Series
        # via df.iloc[i] for every single field access.
        rows = zip(
            df["knowledge"], df["message"], df["response"], df["begin_label"]
        )
        for knowledge, message, response, label in rows:
            data.append({
                "input": message,
                "system_output": response,
                "grounding": knowledge,
                "label": label,
                # Running index over all files, not per file.
                "index": len(data),
            })
    return data
# Convert both BEGIN splits and store them as tab-indented JSON files.
begin_dev = load_begin_csv(begin_dev_paths)
begin_test = load_begin_csv(begin_test_paths)

# Context managers make sure each output file is flushed and closed as soon
# as it has been written, instead of relying on garbage collection.
with open("data/begin_dev.json", "w") as dev_file:
    json.dump(begin_dev, dev_file, indent="\t")
with open("data/begin_test.json", "w") as test_file:
    json.dump(begin_test, test_file, indent="\t")