-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprocess_gervetetal.py
More file actions
57 lines (42 loc) · 1.77 KB
/
process_gervetetal.py
File metadata and controls
57 lines (42 loc) · 1.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import numpy as np
import pandas as pd
import glob
import split_dataset
import os
from collections import defaultdict
# Make sure both output directories exist before anything is written.
for out_dir in ('./data/datasets', './data/splits'):
    os.makedirs(out_dir, exist_ok=True)

# Column names applied (by position) to every exported dataset.
STANDARD_COLUMNS = ['student', 'problem', 'timestamp', 'correct', 'skill']

# Stage 1: for every entry under the upstream data directory, export its
# preprocessed interactions with standard column names and store the splits
# computed by split_dataset.main.
for dataset in glob.glob("../learner-performance-prediction/data/*"):
    dataset_name = os.path.basename(dataset)
    source_csv = "%s/preprocessed_data.csv" % dataset

    # The upstream file is tab-separated despite its .csv extension.
    df = pd.read_csv(source_csv, sep='\t')

    # Move a leftover 'Unnamed: 0' column into the index so that it is
    # neither renamed below nor written out (to_csv uses index=False).
    if 'Unnamed: 0' in df.columns:
        df = df.set_index('Unnamed: 0')

    print(df.columns)
    print(dataset_name)

    # Positional rename: assumes exactly five remaining columns in this
    # order -- TODO confirm against the upstream preprocessing output.
    df.columns = STANDARD_COLUMNS
    df.to_csv("data/datasets/gervetetal_%s.csv" % dataset_name, index=False)

    full_splits = split_dataset.main(df)
    np.save("data/splits/gervetetal_%s.npy" % dataset_name, full_splits)
def multi_kc_warnings(problems, skills):
    """Return a warning message for every item tagged with several skills.

    Args:
        problems: Iterable of item identifiers, one per interaction.
        skills: Iterable of skill (KC) identifiers aligned with ``problems``.

    Returns:
        A list of warning strings, ordered by each item's first appearance
        in ``problems``. Empty when every item maps to a single skill.
    """
    items_to_kc = defaultdict(set)
    for item, skill in zip(problems, skills):
        items_to_kc[item].add(skill)
    # %s instead of %d: item identifiers are not guaranteed to be integers,
    # and %d raises TypeError on e.g. string IDs. Integer IDs print the same.
    return ["Warning: Item %s has more than one KC" % (item,)
            for item, kcs in items_to_kc.items() if len(kcs) > 1]


# Stage 2: reload each exported dataset together with its saved splits and
# print two sanity checks: how many test items never occur in training, and
# which items are associated with more than one skill (KC).
datasets = glob.glob("../learner-performance-prediction/data/*")
for dataset in datasets:
    dataset_name = os.path.basename(dataset)
    print(dataset_name)
    df = pd.read_csv("data/datasets/gervetetal_%s.csv" % dataset_name)
    splits = np.load("data/splits/gervetetal_%s.npy" % dataset_name)

    # Only row 0 of the saved array is inspected (presumably the first of
    # several splits -- verify against split_dataset). It is used as a
    # per-row label for df: 2 selects training rows, 0 selects test rows
    # (1 marks validation rows, which this check does not need).
    train_ix = splits[0, :] == 2
    test_ix = splits[0, :] == 0
    train_items = set(df[train_ix]['problem'])
    test_items = set(df[test_ix]['problem'])
    print("Items in testing not in training: %d (total items: %d)" %
          (len(test_items - train_items), len(set(df['problem']))))

    for warning in multi_kc_warnings(df['problem'], df['skill']):
        print(warning)