-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathdata_preprocessor.py
More file actions
94 lines (82 loc) · 3.31 KB
/
data_preprocessor.py
File metadata and controls
94 lines (82 loc) · 3.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
""" Preprocessor for DuIE and SciERC!
"""
from easonsi.util.leetcode import *
from easonsi import utils
from datasets import Dataset
import pandas as pd
import os, sys, json
class Processor:
    """Normalize DuIE 2.0 and SciERC datasets into a shared SPO format.

    Every sample is rewritten so its ``spo_list`` holds flat
    ``{"subject", "predicate", "object"}`` triples, and each dataset's schema
    is rewritten to matching ``{"object_type", "predicate", "subject_type"}``
    records. Outputs are JSONL files named ``std_<original name>`` placed next
    to the inputs; existing outputs are not regenerated.
    """

    @staticmethod
    def f_process_duie_sample(sample: dict) -> dict:
        """Flatten one DuIE sample: unwrap each object's nested '@value'.

        Args:
            sample: record with a 'spo_list' whose 'object' fields are
                dicts keyed by '@value' (DuIE 2.0 raw format).

        Returns:
            Partial record with the flattened 'spo_list'; `Dataset.map`
            merges it back into the sample.
        """
        spo_list_new = [
            {
                "subject": spo['subject'],
                "predicate": spo['predicate'],
                "object": spo['object']['@value'],
            }
            for spo in sample['spo_list']
        ]
        return {'spo_list': spo_list_new}

    @staticmethod
    def f_process_duie_schema(schema: dict) -> dict:
        """Flatten one DuIE schema record: unwrap object_type's '@value'.

        Note: mutates `schema` in place and returns it (as `Dataset.map`
        expects). Fixed: was annotated ``-> None`` despite returning the dict.
        """
        schema['object_type'] = schema['object_type']['@value']
        return schema

    def process_duie(self, ddir: str = "/home/ubuntu/work/agent/AgentIE/data/DuIE2.0") -> None:
        """Convert the DuIE sample/dev splits and schema to the standard format.

        Args:
            ddir: directory holding the raw DuIE json files; outputs are
                written alongside them as ``std_<name>``. Defaults to the
                original hard-coded path for backward compatibility.
        """
        for fn in ['duie_sample.json', 'duie_dev.json']:
            ofn = f"{ddir}/std_{fn}"
            if os.path.exists(ofn):
                continue  # already converted; skip
            # Dataset.from_json fails on this file ("has bug!!!" per the
            # original author) — load via pandas instead.
            df = pd.read_json(f"{ddir}/{fn}", lines=True)
            ds = Dataset.from_pandas(df)
            ds_processed = ds.map(self.f_process_duie_sample)
            ds_processed.to_json(ofn, orient="records", lines=True, force_ascii=False)
            print(f"Saved to {ofn}")
        fn_schema = "duie_schema.json"  # fixed: was an f-string with no placeholders
        ds_schema = Dataset.from_json(f"{ddir}/{fn_schema}")
        ds_schema_processed = ds_schema.map(self.f_process_duie_schema)
        ds_schema_processed.to_json(f"{ddir}/std_{fn_schema}", orient="records", lines=True, force_ascii=False)
        print(f"Saved to {ddir}/std_{fn_schema}")

    @staticmethod
    def f_process_scierc_sample(sample: dict) -> dict:
        """Flatten one SciERC sample: map head/type/tail relations to SPO.

        Args:
            sample: record whose 'spo_list' holds relations shaped as
                ``{'head': {'name': ...}, 'type': ..., 'tail': {'name': ...}}``.

        Returns:
            Partial record with the flattened 'spo_list' for `Dataset.map`.
        """
        spo_list_new = [
            {
                "subject": spo['head']['name'],
                "predicate": spo['type'],
                "object": spo['tail']['name'],
            }
            for spo in sample['spo_list']
        ]
        return {'spo_list': spo_list_new}

    @staticmethod
    def f_process_scierc_schema(schema: dict) -> dict:
        """Build a DuIE-style schema record from a SciERC predicate name.

        SciERC has no typed arguments, so subject/object types are "Any".
        Fixed: was annotated ``-> None`` despite returning the new dict.
        """
        return {
            "object_type": "Any",
            "predicate": schema['predicate'],
            "subject_type": "Any",
        }

    def process_sciERC(self, ddir: str = "/home/ubuntu/work/agent/AgentIE/data/SciERC_sample_10000") -> None:
        """Convert the SciERC train/test splits and label list to the standard format.

        Args:
            ddir: directory holding the raw SciERC json files; outputs are
                written alongside them as ``std_<name>`` / ``std_schema.json``.
                Defaults to the original hard-coded path.
        """
        for fn in ['test.json', 'train.json']:
            ofn = f"{ddir}/std_{fn}"
            if os.path.exists(ofn):
                continue  # already converted; skip
            d_list = utils.LoadJson(f"{ddir}/{fn}")
            ds = Dataset.from_dict({
                "text": [d['sentence'] for d in d_list],
                "spo_list": [d['relations'] for d in d_list],
            })
            ds_processed = ds.map(self.f_process_scierc_sample)
            ds_processed.to_json(ofn, orient="records", lines=True, force_ascii=False)
            print(f"Saved to {ofn}")
        # labels.json is a flat list of predicate names.
        schema_name_list = utils.LoadJson(f"{ddir}/labels.json")
        ds_schema = Dataset.from_dict({
            "predicate": schema_name_list,
        })
        ds_schema_processed = ds_schema.map(self.f_process_scierc_schema)
        ds_schema_processed.to_json(f"{ddir}/std_schema.json", orient="records", lines=True, force_ascii=False)
        print(f"Saved to {ddir}/std_schema.json")
# Entry point: guard side effects so importing this module does not trigger
# dataset conversion (idiom fix — previously ran unconditionally at import).
if __name__ == "__main__":
    processor = Processor()
    # processor.process_duie()  # uncomment to also regenerate DuIE outputs
    processor.process_sciERC()
    print("Done!")