DecodingViewerEmotions/setup_data.py at main · gmontana/DecodingViewerEmotions · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
"""
setup_data.py

Prepares data downloaded from HuggingFace for use with the TSAM codebase.

Usage:
    python setup_data.py --input ./adcumen-data

This script:
    1. Unzips 5-second_MP4_Clips.zip into a videos/ directory
    2. Extracts frames from each MP4 at 10fps into data/frames_fps_10/{clip_id}/
    3. Extracts audio from each MP4 into data/audios/{clip_id}.wav
    4. Generates split files (training_0, valid_0_p1, valid_0_p2) from CSVs
    5. Builds VDB.pickle from CSV labels
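    6. Copies backbone_weights.tar -> net_weigths/resnet50_miil_21k.pth
       (the .tar is a PyTorch weight file, not an archive to unpack)
    7. Copies tsam_weights.tar -> weights/checkpoint/balanced.ckpt.pth.tar
       and writes a default weights/args.json if none exists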
"""

import argparse
import csv
import os
import subprocess
import sys
import tarfile
import zipfile
import multiprocessing
from collections import Counter
from pathlib import Path


def extract_clips(input_dir, videos_dir):
    """Unpack ``5-second_MP4_Clips.zip`` from ``input_dir`` into ``videos_dir``.

    The process exits with status 1 when the archive is missing. If
    ``videos_dir`` already holds more than 1000 ``.mp4`` files the archive
    is assumed to be unpacked and nothing is extracted.
    """
    archive = os.path.join(input_dir, "5-second_MP4_Clips.zip")
    if not os.path.exists(archive):
        print(f"ERROR: {archive} not found")
        sys.exit(1)

    # Count clips left over from a previous run, if the directory exists.
    already_there = 0
    if os.path.isdir(videos_dir):
        already_there = len(find_mp4_files(videos_dir))
    if already_there > 1000:
        print(f"Clips already extracted ({already_there} found), skipping.")
        return

    print(f"Extracting clips from {archive}...")
    os.makedirs(videos_dir, exist_ok=True)
    with zipfile.ZipFile(archive, 'r') as bundle:
        bundle.extractall(videos_dir)
    print(f"Extracted clips to {videos_dir}")


def find_mp4_files(videos_dir):
    """Return ``{clip_name: path}`` for every ``.mp4`` file under ``videos_dir``.

    The tree is walked recursively, so clips nested inside sub-directories
    created by the zip are found too. The clip name is the file name with
    every ``".mp4"`` occurrence removed. When two files yield the same clip
    name, the one visited last wins.
    """
    found = {}
    for folder, _subdirs, filenames in os.walk(videos_dir):
        for filename in filenames:
            if not filename.endswith(".mp4"):
                continue
            found[filename.replace(".mp4", "")] = os.path.join(folder, filename)
    return found


def extract_frames_worker(args):
    """Dump the frames of one clip as numbered JPEGs via ffmpeg.

    ``args`` is a ``(clip_name, mp4_path, frames_dir, fps)`` tuple, packed
    that way so the function can be used with ``Pool.map``. Frames are
    written to ``frames_dir/clip_name/%06d.jpg``.

    Returns ``clip_name`` when ffmpeg exits with a non-zero status and
    ``None`` on success.
    """
    clip_name, mp4_path, frames_dir, fps = args
    target_dir = os.path.join(frames_dir, clip_name)
    os.makedirs(target_dir, exist_ok=True)

    frame_pattern = os.path.join(target_dir, "%06d.jpg")
    video_filter = f"scale=-1:256,fps={fps}"
    ffmpeg_call = [
        "ffmpeg", "-loglevel", "panic",
        "-i", mp4_path,
        "-vf", video_filter,
        "-q:v", "0",
        frame_pattern,
    ]
    try:
        subprocess.run(ffmpeg_call, check=True, capture_output=True)
    except subprocess.CalledProcessError as err:
        print(f"ERROR extracting frames for {clip_name}: {err}")
        return clip_name
    else:
        return None


def extract_audio_worker(args):
    """Write the audio track of one clip to a ``.wav`` file via ffmpeg.

    ``args`` is a ``(clip_name, mp4_path, audios_dir)`` tuple, packed that
    way so the function can be used with ``Pool.map``. The output goes to
    ``audios_dir/clip_name.wav`` and is overwritten if present (``-y``).

    Returns ``clip_name`` when ffmpeg exits with a non-zero status and
    ``None`` on success.
    """
    clip_name, mp4_path, audios_dir = args
    destination = os.path.join(audios_dir, f"{clip_name}.wav")

    ffmpeg_call = [
        "ffmpeg", "-loglevel", "panic",
        "-i", mp4_path,
        "-y", destination,
    ]
    try:
        subprocess.run(ffmpeg_call, check=True, capture_output=True)
    except subprocess.CalledProcessError as err:
        print(f"ERROR extracting audio for {clip_name}: {err}")
        return clip_name
    else:
        return None


def extract_frames(mp4_files, frames_dir, fps=10, workers=4):
    """Extract frames for every clip in ``mp4_files`` using a process pool.

    ``mp4_files`` maps clip name -> MP4 path. A clip is skipped when
    ``frames_dir/clip_name`` already exists and is non-empty, which makes
    the step resumable.

    Returns the list of clip names whose extraction failed (empty when
    there was nothing to do).
    """
    os.makedirs(frames_dir, exist_ok=True)

    def _has_frames(clip_name):
        # A non-empty per-clip directory counts as "already extracted".
        clip_dir = os.path.join(frames_dir, clip_name)
        return os.path.isdir(clip_dir) and len(os.listdir(clip_dir)) > 0

    pending = [
        (clip_name, mp4_path, frames_dir, fps)
        for clip_name, mp4_path in mp4_files.items()
        if not _has_frames(clip_name)
    ]
    if not pending:
        print("All frames already extracted, skipping.")
        return []

    print(f"Extracting frames from {len(pending)} clips using {workers} workers...")
    with multiprocessing.Pool(workers) as pool:
        outcomes = pool.map(extract_frames_worker, pending)

    # Workers return None on success and the clip name on failure.
    failed = [name for name in outcomes if name is not None]
    print(f"Frame extraction complete. Errors: {len(failed)}")
    return failed


def extract_audio(mp4_files, audios_dir, workers=4):
    """Extract a ``.wav`` file for every clip in ``mp4_files`` using a process pool.

    ``mp4_files`` maps clip name -> MP4 path. A clip is skipped when
    ``audios_dir/clip_name.wav`` already exists, which makes the step
    resumable.

    Returns the list of clip names whose extraction failed (empty when
    there was nothing to do).
    """
    os.makedirs(audios_dir, exist_ok=True)

    pending = [
        (clip_name, mp4_path, audios_dir)
        for clip_name, mp4_path in mp4_files.items()
        if not os.path.isfile(os.path.join(audios_dir, f"{clip_name}.wav"))
    ]
    if not pending:
        print("All audio already extracted, skipping.")
        return []

    print(f"Extracting audio from {len(pending)} clips using {workers} workers...")
    with multiprocessing.Pool(workers) as pool:
        outcomes = pool.map(extract_audio_worker, pending)

    # Workers return None on success and the clip name on failure.
    failed = [name for name in outcomes if name is not None]
    print(f"Audio extraction complete. Errors: {len(failed)}")
    return failed


def read_csv(csv_path):
    """Read a HuggingFace CSV file and return its rows as a list of dicts.

    Each row maps the header names to that row's string values, as produced
    by ``csv.DictReader``. A file containing only a header (or nothing)
    yields an empty list.

    The file is opened with ``newline=''`` as the ``csv`` module requires,
    so line breaks inside quoted fields are passed through unchanged. It is
    decoded as ``utf-8-sig`` so that a leading UTF-8 byte-order mark does
    not end up glued to the first column name (which would make lookups
    such as ``row["Clips_Name"]`` fail).
    """
    with open(csv_path, 'r', newline='', encoding='utf-8-sig') as f:
        return list(csv.DictReader(f))


def generate_split_files(input_dir, output_dir):
    """Write the split list files derived from the HuggingFace CSVs.

    For each of ``training.csv``, ``validation.csv`` and ``testing.csv``
    found in ``input_dir``, a text file is written to ``output_dir``
    (``training_0``, ``valid_0_p2`` and ``valid_0_p1`` respectively)
    holding one ``Clips_Name`` per line with ``".mp4"`` removed
    (e.g. ``6mWfuH7p``, ``6mWfuH7p_2``). Missing CSVs are reported and
    skipped.

    Returns a dict mapping split key (``"training"``, ``"validation"``,
    ``"test"``) to the list of CSV row dicts for every CSV that was found.
    """
    os.makedirs(output_dir, exist_ok=True)

    # (source CSV, split file to write, key in the returned dict)
    plan = (
        ("training.csv", "training_0", "training"),
        ("validation.csv", "valid_0_p2", "validation"),
        ("testing.csv", "valid_0_p1", "test"),
    )

    splits = {}
    for csv_name, split_file, split_key in plan:
        csv_path = os.path.join(input_dir, csv_name)
        if not os.path.exists(csv_path):
            print(f"WARNING: {csv_path} not found, skipping {split_key} split")
            continue

        rows = read_csv(csv_path)
        splits[split_key] = rows

        split_path = os.path.join(output_dir, split_file)
        with open(split_path, 'w') as out:
            out.writelines(
                row["Clips_Name"].replace(".mp4", "") + "\n" for row in rows
            )

        print(f"Generated {split_path} with {len(rows)} clips")

    return splits


def build_vdb(splits, output_path, clip_length=5):
    """Build VDB.pickle from CSV data.

    Each 5-second clip is treated as its own video entry with t=0.
    Clips_Name is the video ID, frames are at frames_fps_10/{Clips_Name}/.

    CSV Label column: 0-7 numeric -> eid 1-8.

    Args:
        splits: dict mapping split name -> list of CSV row dicts; each row
            must provide "Clips_Name" and "Label". Rows from all splits are
            merged; if a clip name occurs more than once, the last
            occurrence determines its label.
        output_path: where the pickled VideoDB object is written.
        clip_length: value stored as each entry's Duration, and the time
            index at which the labeled emotion is set to 1.0.

    Raises:
        KeyError: if a row lacks "Clips_Name"/"Label", or if a label maps
            to an eid outside 1-8 (only those keys exist in the profile).
        ValueError: if a "Label" value cannot be parsed by int().
    """
    # Project imports are done inside the function — presumably so the other
    # steps of this script can run without mvlib being importable; confirm.
    from mvlib.mvideo_lib import VideoDB, Video
    from mvlib.utils import save_pickle

    # Display names for eid 1..8, used only for the summary printed at the end.
    emotion_list = ["Anger", "Contempt", "Disgust", "Fear",
                    "Happiness", "Neutral", "Sadness", "Surprise"]

    # Collect all clips
    # Maps clip name (".mp4" removed) -> eid; later rows overwrite earlier ones.
    all_clips = {}
    for split_name, rows in splits.items():
        for row in rows:
            clip_name = row["Clips_Name"].replace(".mp4", "")
            label = int(row["Label"])
            eid = label + 1  # CSV 0-7 -> code 1-8
            all_clips[clip_name] = eid

    print(f"Building VDB from {len(all_clips)} clips...")

    # Create VDB object
    # __new__ creates the instance without running VideoDB.__init__; the
    # attributes below are then assigned by hand.
    vdb = VideoDB.__new__(VideoDB)
    vdb.fileDescriptionVideos = ""
    vdb.fileIndividualProfiles = ""
    vdb.dirVideos = None
    vdb.positive_ID_filtr = {}

    # Each clip is its own "video" with Duration=clip_length
    # VDB maps clip name -> Video; VMAP maps the running index -> clip name.
    vdb.VDB = {}
    vdb.VMAP = {}
    for i, clip_name in enumerate(all_clips):
        # As above, Video.__init__ is bypassed and fields are set directly.
        v = Video.__new__(Video)
        v.VID = clip_name
        v.AID = i
        v.Duration = clip_length
        v.StarRating = 0.0
        v.MarketId = "826"
        v.Title = ""
        vdb.VDB[clip_name] = v
        vdb.VMAP[i] = clip_name

    # Build APTV: for each clip, set labeled emotion to jump at t=0->t=clip_length
    # APTV[clip][eid] is a Counter keyed by t = 0..clip_length, all zero
    # except the labeled emotion's entry at t = clip_length.
    vdb.APTV = {}
    vdb.IndividualProfiles = {}
    for clip_name, eid in all_clips.items():
        vdb.APTV[clip_name] = {}
        for e in range(1, 9):
            vdb.APTV[clip_name][e] = Counter()
            for t in range(clip_length + 1):
                vdb.APTV[clip_name][e][t] = 0
        # Labeled emotion: value 1.0 is stored at t=clip_length (t=0 stays 0),
        # so the profile goes from 0 to 1.0 across the clip.
        # NOTE(review): an earlier comment here said "jump of 1.0 at t=0";
        # confirm that the end-of-clip index is what downstream code expects.
        vdb.APTV[clip_name][eid][clip_length] = 1.0
        vdb.IndividualProfiles[clip_name] = []

    # add_Emotions() is defined in mvlib and not visible here; presumably it
    # fills in emotion metadata on the VideoDB before saving — verify.
    vdb.add_Emotions()
    save_pickle(output_path, vdb)
    print(f"Saved VDB.pickle to {output_path}")

    # Print label distribution
    label_counts = Counter(all_clips.values())
    print("Label distribution:")
    for i, name in enumerate(emotion_list):
        eid = i + 1
        print(f"  {name} (eid={eid}): {label_counts.get(eid, 0)}")


def extract_weights(input_dir, repo_root):
    """Copy the backbone and TSAM weight files to where the code expects them.

    Despite the ``.tar`` extension, the HuggingFace files are treated as
    PyTorch weight files and copied as-is rather than unpacked:

    * ``backbone_weights.tar`` -> ``net_weigths/resnet50_miil_21k.pth``
    * ``tsam_weights.tar`` -> ``weights/checkpoint/balanced.ckpt.pth.tar``

    When the TSAM checkpoint is copied and ``weights/args.json`` does not
    exist yet, a default one is generated. A missing source file only
    produces a warning.
    """
    import shutil

    # --- Backbone weights: copied straight to the .pth path ---
    backbone_source = os.path.join(input_dir, "backbone_weights.tar")
    backbone_folder = os.path.join(repo_root, "net_weigths")
    backbone_target = os.path.join(backbone_folder, "resnet50_miil_21k.pth")

    if not os.path.exists(backbone_source):
        print(f"WARNING: {backbone_source} not found")
    else:
        os.makedirs(backbone_folder, exist_ok=True)
        shutil.copy2(backbone_source, backbone_target)
        print(f"Backbone weights -> {backbone_target}")

    # --- TSAM weights: copied straight to the checkpoint path ---
    tsam_source = os.path.join(input_dir, "tsam_weights.tar")
    weights_folder = os.path.join(repo_root, "weights")
    checkpoint_folder = os.path.join(weights_folder, "checkpoint")
    checkpoint_target = os.path.join(checkpoint_folder, "balanced.ckpt.pth.tar")
    config_path = os.path.join(weights_folder, "args.json")

    if not os.path.exists(tsam_source):
        print(f"WARNING: {tsam_source} not found")
        return

    os.makedirs(checkpoint_folder, exist_ok=True)
    shutil.copy2(tsam_source, checkpoint_target)
    print(f"TSAM checkpoint  -> {checkpoint_target}")

    # Only create the model config when the user has not supplied one.
    if not os.path.exists(config_path):
        _generate_default_args_json(config_path)
        print(f"Model config     -> {config_path}")


def _generate_default_args_json(args_json):
    """Generate default args.json for the TSAM model."""
    import json
    default_args = {
        "emotion_jumps": {
            "emotion_ids": [1, 2, 3, 4, 5, 6, 7, 8],
            "clip_length": 5, "jump": 0.5, "porog": 0.2, "background_size": -1
        },
        "dataset": {
            "name": "adcumen", "data_dir": "./data",
            "dir_videos": "videos", "dir_frames": "frames_fps_10", "dir_audios": "audios",
            "fileVDB": "DataAdcumen/VDB.pickle",
            "file_train_list": "DataAdcumen/training_0",
            "file_val_list": "DataAdcumen/valid_0_p2",
            "file_test_list": "DataAdcumen/valid_0_p1",
            "video_img_param": {"image_tmpl": "{:06d}.jpg", "img_input_size": 256, "img_output_size": 224},
            "video_augmentation": {"RandomHorizontalFlip": True, "scales": [1, 0.875, 0.75, 0.66],
                                   "Adjust_sharpness": 2.0, "ColorJitter": False, "RandomGrayscale": 0.0,
                                   "GaussianBlur": False},
            "audio_img_param": {"window_sizes": [25, 50, 100], "hop_sizes": [10, 25, 50],
                                "n_mels": 224, "eps": 1e-6, "spec_size": [3, 224, 224],
                                "num_segments": 1, "m_segments": 1},
            "audio_augmentation": {"status": True, "random_shift_waveform": [0.1, 0.1]},
            "fps": 10
        },
        "TSM": {
            "video_segments": 12, "audio_segments": 1, "motion": False, "num_class": 8,
            "main": {"arch": "resnet50_timm", "pretrain": "imagenet", "dropout": 0.5,
                     "last_pool": 1, "input_mode": 2,
                     "backbone_weights": "net_weigths/resnet50_miil_21k.pth"},
            "shift_temporal": {"status": True, "f_div": 8, "shift_depth": 1, "n_insert": 2, "m_insert": 0},
            "shift_temporal_modality": {"status": False, "f_div": 8, "n_insert": 2, "m_insert": 1},
            "shift_spatial": {"status": False, "f_div": 8, "n_insert": 2, "m_insert": 1},
            "motion_param": {"k_frames": 5, "sharpen_cycles": 1, "HW_conv_kernel": 9,
                             "HW_conv_sigma": 1.1, "normadd": 0}
        },
        "net_run_param": {"epochs": 10, "batch_size": 8, "num_workers": 4},
        "net_optim_param": {"lr": 0.1, "lr_decay": [0.1, 3, 0.01, 6, 0.001, 8],
                            "momentum": 0.9, "gd": 20, "weight_decay": 1e-4},
        "save_epoch": [], "root_folder": "logs"
    }
    with open(args_json, 'w') as f:
        json.dump(default_args, f, indent=4)
    print(f"  Generated {args_json}")


def main():
    """Command-line entry point: run every data-preparation step in order.

    Steps: unzip clips, extract frames and audio (only when the clip zip is
    present in ``--input``), write split files, build ``VDB.pickle`` (only
    when at least one split CSV was found), place the weight files, and
    print a short usage summary.
    """
    parser = argparse.ArgumentParser(
        description="Prepare HuggingFace data for use with TSAM codebase")
    parser.add_argument("--input", type=str, required=True,
                        help="Path to HuggingFace download directory")
    parser.add_argument("--data-dir", type=str, default="./data",
                        help="Output directory for processed data (default: ./data)")
    parser.add_argument("--workers", type=int, default=4,
                        help="Number of parallel workers for frame/audio extraction")
    parser.add_argument("--fps", type=int, default=10,
                        help="Frames per second for frame extraction (default: 10)")
    parser.add_argument("--skip-frames", action="store_true",
                        help="Skip frame extraction")
    parser.add_argument("--skip-audio", action="store_true",
                        help="Skip audio extraction")
    options = parser.parse_args()

    input_dir = os.path.abspath(options.input)
    data_dir = os.path.abspath(options.data_dir)
    # Split files, VDB.pickle and weights live next to this script.
    repo_root = os.path.dirname(os.path.abspath(__file__))

    print(f"Input directory: {input_dir}")
    print(f"Data directory: {data_dir}")
    print(f"Repository root: {repo_root}")
    print()

    # Steps 1-3: clips, frames and audio all depend on the clip archive.
    clip_archive = os.path.join(input_dir, "5-second_MP4_Clips.zip")
    videos_dir = os.path.join(data_dir, "videos")
    frames_dir = os.path.join(data_dir, "frames_fps_10")
    audios_dir = os.path.join(data_dir, "audios")

    if not os.path.exists(clip_archive):
        print(f"NOTE: {clip_archive} not found, skipping video/frame/audio extraction.")
        print("If you already have frames and audio extracted, set --data-dir to point there.")
        print()
    else:
        # Step 1: unzip the clips
        extract_clips(input_dir, videos_dir)
        print()

        clips = find_mp4_files(videos_dir)
        print(f"Found {len(clips)} MP4 clips")
        print()

        # Step 2: frames (optional)
        if not options.skip_frames:
            extract_frames(clips, frames_dir, fps=options.fps, workers=options.workers)
        print()

        # Step 3: audio (optional)
        if not options.skip_audio:
            extract_audio(clips, audios_dir, workers=options.workers)
        print()

    # Step 4: split files from the CSVs
    split_dir = os.path.join(repo_root, "DataAdcumen")
    splits = generate_split_files(input_dir, split_dir)
    print()

    # Step 5: VDB.pickle, only when at least one CSV was read
    if splits:
        build_vdb(splits, os.path.join(split_dir, "VDB.pickle"))
    print()

    # Step 6: weight files
    extract_weights(input_dir, repo_root)
    print()

    # Closing summary with an example inference command.
    banner = "=" * 60
    summary = (
        banner,
        "Setup complete!",
        "",
        "To run inference on the test set:",
        "",
        "  python predict.py \\",
        "    --data config/default.json \\",
        "    --model weights \\",
        "    --type test \\",
        "    --id test_run",
        "",
        "Results will be saved to ./data/predicted/test_run/",
        banner,
    )
    for line in summary:
        print(line)


# Run the pipeline only when this file is executed as a script. The guard also
# matters for the multiprocessing pools used above: under start methods that
# re-import the main module in child processes (e.g. "spawn"), it prevents
# each worker from running main() again.
if __name__ == "__main__":
    main()