-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsplit.py
More file actions
29 lines (21 loc) · 937 Bytes
/
split.py
File metadata and controls
29 lines (21 loc) · 937 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import pandas as pd
from sklearn.model_selection import StratifiedKFold
import os

# Load the full training set. The columns 'text' and 'sentiment' must be
# present, since both are referenced when building the folds below.
data = pd.read_csv('data/train.csv')

# Number of cross-validation folds to generate.
n_splits = 5

# Shuffle before splitting, with a fixed seed so that re-running the script
# reproduces exactly the same folds.
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Output directory for the per-fold CSV files; exist_ok makes re-runs safe.
os.makedirs('splits', exist_ok=True)

# Stratify on the 'sentiment' label so every fold keeps roughly the same
# class distribution as the full dataset. The first argument only needs to
# have the right number of rows.
fold_indices = list(skf.split(data['text'], data['sentiment']))

# Write one train/validation CSV pair per fold.
for fold, (train_idx, val_idx) in enumerate(fold_indices):
    # split() yields positional row indices, hence iloc rather than loc.
    train_data = data.iloc[train_idx]
    val_data = data.iloc[val_idx]

    # NOTE: file names use the 0-based fold index, while the progress message
    # below reports a 1-based fold number.
    train_data.to_csv(f'splits/train_fold_{fold}.csv', index=False)
    val_data.to_csv(f'splits/validation_fold_{fold}.csv', index=False)

    print(f"Fold {fold + 1}: Train samples = {len(train_data)}, Validation samples = {len(val_data)}")