Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ It supports seven models, four features (video and audio features), and six data
Furthermore, Lighthouse supports [audio moment retrieval](https://h-munakata.github.io/Language-based-Audio-Moment-Retrieval/), a task to identify relevant moments from an audio input based on a given text query.

## News
- [2025/11/20] [Version 1.2](https://github.com/line/lighthouse/releases/tag/v1.2) Our work ["CASTELLA: Long Audio Dataset with Captions and Temporal Boundaries"](https://arxiv.org/abs/2511.15131) has been released. This update adds support for a new AMR dataset called CASTELLA.
- [2025/06/04] [Version 1.1](https://github.com/line/lighthouse/releases/tag/v1.1) has been released. It includes API changes, an AMR Gradio demo, and Hugging Face wrappers for the audio moment retrieval models and the Clotho dataset.
- [2024/12/24] Our work ["Language-based audio moment retrieval"](https://arxiv.org/abs/2409.15672) has been accepted at ICASSP 2025.
- [2024/10/22] [Version 1.0](https://github.com/line/lighthouse/releases/tag/v1.0) has been released.
Expand Down Expand Up @@ -142,6 +143,7 @@ Audio moment retrieval
### Pre-trained weights
Pre-trained weights can be downloaded from [here](https://drive.google.com/file/d/1jxs_bvwttXTF9Lk3aKLohkqfYOonLyrO/view?usp=sharing).
Download and unzip on the home directory.
AMR models trained on CASTELLA and Clotho-Moment are available [here](https://zenodo.org/uploads/17422909).

### Datasets
Due to copyright issues, we distribute only the feature files here.
Expand All @@ -158,6 +160,7 @@ To extract features from videos, we use [HERO_Video_Feature_Extractor](https://g
For [AMR](https://h-munakata.github.io/Language-based-Audio-Moment-Retrieval/), download features from here.

- [Clotho Moment/TUT2017/UnAV100-subset](https://zenodo.org/records/13806234)
- [CASTELLA](https://zenodo.org/records/17412176) [[Mirror on HF]](https://huggingface.co/datasets/lighthouse-emnlp2024/CASTELLA_CLAP_features)

The whole directory should look like this:
```
Expand Down
1 change: 1 addition & 0 deletions configs/base.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ eval_bsz: 100
grad_clip: 0.1
max_q_l: 32
max_v_l: 75
max_a_l: 75
max_windows: 5
clip_length: 1
eval_epoch_interval: 1
Expand Down
7 changes: 7 additions & 0 deletions configs/dataset/castella.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
dset_name: castella
clip_length: 1
train_path: data/castella/castella_train_release.jsonl
eval_path: data/castella/castella_val_release.jsonl

max_a_l: 300
max_v_l: 300
1,347 changes: 1,347 additions & 0 deletions data/castella/castella_test_release.jsonl

Large diffs are not rendered by default.

2,182 changes: 2,182 additions & 0 deletions data/castella/castella_train_release.jsonl

Large diffs are not rendered by default.

352 changes: 352 additions & 0 deletions data/castella/castella_val_release.jsonl

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions training/cg_detr_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ def __getitem__(self, index):
else:
model_inputs["saliency_pos_labels"], model_inputs["saliency_neg_labels"], model_inputs["saliency_all_labels"] = \
self.get_saliency_labels_all(meta["relevant_clip_ids"], meta["saliency_scores"], ctx_l)
elif self.dset_name in ['charades', 'tacos', 'activitynet', 'clotho-moment', 'unav100-subset', 'tut2017']: ## charades, tacos, nlq
elif self.dset_name in ['charades', 'tacos', 'activitynet', 'clotho-moment', 'unav100-subset', 'tut2017', 'castella']: ## charades, tacos, nlq
model_inputs["saliency_pos_labels"], model_inputs["saliency_neg_labels"], model_inputs["saliency_all_labels"] = \
self.get_saliency_labels_sub_as_query(meta["relevant_windows"][0], meta["duration"], ctx_l) # only one gt
else:
Expand Down Expand Up @@ -458,7 +458,7 @@ def _get_audio_feat_by_vid(self, vid):
raise NotImplementedError
_feat = l2_normalize_np_array(_feat) # normalize?
a_feat_list.append(_feat)
elif self.dset_name in ['clotho-moment', 'unav100-subset', 'tut2017']:
elif self.dset_name in ['clotho-moment', 'unav100-subset', 'tut2017', 'castella']:
if self.a_feat_types == "clap":
_feat_path = join(_feat_dir, f"{vid}.npz")
_feat = np.load(_feat_path)["features"][:self.max_a_l].astype(np.float32)
Expand Down
4 changes: 2 additions & 2 deletions training/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ def __getitem__(self, index):
model_inputs["saliency_pos_labels"], model_inputs["saliency_neg_labels"], model_inputs["saliency_all_labels"] = \
self.get_saliency_labels_all(meta["relevant_clip_ids"], meta["saliency_scores"], ctx_l)

elif self.dset_name in ['charades', 'tacos', 'activitynet', 'clotho-moment', 'unav100-subset', 'tut2017']:
elif self.dset_name in ['charades', 'tacos', 'activitynet', 'clotho-moment', 'unav100-subset', 'tut2017', 'castella']:
model_inputs["saliency_pos_labels"], model_inputs["saliency_neg_labels"], model_inputs["saliency_all_labels"] = \
self.get_saliency_labels_sub_as_query(meta["relevant_windows"][0], ctx_l)
else:
Expand Down Expand Up @@ -480,7 +480,7 @@ def _get_audio_feat_by_vid(self, vid):
raise NotImplementedError
_feat = l2_normalize_np_array(_feat) # normalize?
a_feat_list.append(_feat)
elif self.dset_name in ['clotho-moment', 'unav100-subset', 'tut2017']:
elif self.dset_name in ['clotho-moment', 'unav100-subset', 'tut2017', 'castella']:
if self.a_feat_types == "clap":
_feat_path = join(_feat_dir, f"{vid}.npz")
_feat = np.load(_feat_path)["features"][:self.max_a_l].astype(np.float32)
Expand Down
15 changes: 12 additions & 3 deletions training/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,12 @@ def compute_mr_results(epoch_i, model, eval_loader, opt, criterion=None):
min_w_l=2, max_w_l=60, move_window_method="left",
process_func_names=("clip_ts", "round_multiple")
)
elif opt.dset_name in ['castella']:
post_processor = PostProcessorDETR(
clip_length=opt.clip_length, min_ts_val=0, max_ts_val=300,
min_w_l=1, max_w_l=300, move_window_method="left",
process_func_names=("clip_ts", "round_multiple")
)
elif opt.dset_name in ['tacos', 'activitynet', 'youtube_highlight']:
post_processor = PostProcessorDETR(
clip_length=opt.clip_length, min_ts_val=0, max_ts_val=50000,
Expand Down Expand Up @@ -367,6 +373,7 @@ def start_inference(opt, domain=None):
a_feat_types=opt.a_feat_types,
max_q_l=opt.max_q_l,
max_v_l=opt.max_v_l,
max_a_l=opt.max_a_l,
clip_len=opt.clip_length,
max_windows=opt.max_windows,
span_loss_type=opt.span_loss_type,
Expand All @@ -375,7 +382,7 @@ def start_inference(opt, domain=None):

eval_dataset = CGDETR_StartEndDataset(**dataset_config) if opt.model_name == 'cg_detr' else StartEndDataset(**dataset_config)
model, criterion, _, _ = setup_model(opt)
checkpoint = torch.load(opt.model_path)
checkpoint = torch.load(opt.model_path, weights_only=False)
model.load_state_dict(checkpoint["model"])
logger.info("Model checkpoint: {}".format(opt.model_path))
if not load_labels:
Expand All @@ -402,6 +409,8 @@ def check_valid_combination(dataset, feature, domain):
'tvsum': ['resnet_glove', 'clip', 'clip_slowfast', 'i3d_clip'],
'youtube_highlight': ['clip', 'clip_slowfast'],
'clotho-moment': ['clap'],
'unav100-subset': ['clap'],
'castella': ['clap'],
}

domain_map = {
Expand All @@ -421,8 +430,8 @@ def check_valid_combination(dataset, feature, domain):
choices=['moment_detr', 'qd_detr', 'eatr', 'cg_detr', 'uvcom', 'tr_detr', 'taskweave_hd2mr', 'taskweave_mr2hd'],
help='model name. select from [moment_detr, qd_detr, eatr, cg_detr, uvcom, tr_detr, taskweave_hd2mr, taskweave_mr2hd]')
parser.add_argument('--dataset', '-d', type=str, required=True,
choices=['activitynet', 'charades', 'qvhighlight', 'qvhighlight_pretrain', 'tacos', 'tvsum', 'youtube_highlight', 'clotho-moment', 'unav100-subset', 'tut2017'],
help='dataset name. select from [activitynet, charades, qvhighlight, qvhighlight_pretrain, tacos, tvsum, youtube_highlight, clotho-moment, unav100-subset, tut2017]')
choices=['activitynet', 'charades', 'qvhighlight', 'qvhighlight_pretrain', 'tacos', 'tvsum', 'youtube_highlight', 'clotho-moment', 'unav100-subset', 'tut2017', 'castella'],
help='dataset name. select from [activitynet, charades, qvhighlight, qvhighlight_pretrain, tacos, tvsum, youtube_highlight, clotho-moment, unav100-subset, tut2017, castella]')
parser.add_argument('--feature', '-f', type=str, required=True,
choices=['resnet_glove', 'clip', 'clip_slowfast', 'clip_slowfast_pann', 'i3d_clip', 'clap'],
help='feature name. select from [resnet_glove, clip, clip_slowfast, clip_slowfast_pann, i3d_clip, clap].'
Expand Down
9 changes: 5 additions & 4 deletions training/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,6 @@ def train_epoch(model, criterion, train_loader, optimizer, opt, epoch_i):
losses.backward()
else:
outputs = model(**model_inputs, targets=targets) if opt.model_name == 'cg_detr' else model(**model_inputs)

loss_dict = criterion(outputs, targets)
losses = sum(loss_dict[k] * criterion.weight_dict[k] for k in loss_dict.keys() if k in criterion.weight_dict)

Expand Down Expand Up @@ -228,6 +227,7 @@ def main(opt, resume=None, domain=None):
a_feat_types=opt.a_feat_types,
max_q_l=opt.max_q_l,
max_v_l=opt.max_v_l,
max_a_l=opt.max_a_l,
clip_len=opt.clip_length,
max_windows=opt.max_windows,
span_loss_type=opt.span_loss_type,
Expand All @@ -246,7 +246,7 @@ def main(opt, resume=None, domain=None):

# load checkpoint for QVHighlight pretrain -> finetune
if resume is not None:
checkpoint = torch.load(resume)
checkpoint = torch.load(resume, weights_only=False)
model.load_state_dict(checkpoint["model"])
logger.info("Loaded model checkpoint: {}".format(resume))

Expand All @@ -267,6 +267,7 @@ def check_valid_combination(dataset, feature, domain):
'tvsum': ['resnet_glove', 'clip', 'clip_slowfast', 'i3d_clip'],
'youtube_highlight': ['clip', 'clip_slowfast'],
'clotho-moment': ['clap'],
'castella': ['clap'],
}

domain_map = {
Expand All @@ -286,8 +287,8 @@ def check_valid_combination(dataset, feature, domain):
choices=['moment_detr', 'qd_detr', 'eatr', 'cg_detr', 'uvcom', 'tr_detr', 'taskweave_hd2mr', 'taskweave_mr2hd'],
help='model name. select from [moment_detr, qd_detr, eatr, cg_detr, uvcom, tr_detr, taskweave_hd2mr, taskweave_mr2hd]')
parser.add_argument('--dataset', '-d', type=str, required=True,
choices=['activitynet', 'charades', 'qvhighlight', 'qvhighlight_pretrain', 'tacos', 'tvsum', 'youtube_highlight', 'clotho-moment'],
help='dataset name. select from [activitynet, charades, qvhighlight, qvhighlight_pretrain, tacos, tvsum, youtube_highlight, clotho-moment]')
choices=['activitynet', 'charades', 'qvhighlight', 'qvhighlight_pretrain', 'tacos', 'tvsum', 'youtube_highlight', 'clotho-moment', 'castella'],
help='dataset name. select from [activitynet, charades, qvhighlight, qvhighlight_pretrain, tacos, tvsum, youtube_highlight, clotho-moment, castella]')
parser.add_argument('--feature', '-f', type=str, required=True,
choices=['resnet_glove', 'clip', 'clip_slowfast', 'clip_slowfast_pann', 'i3d_clip', 'clap'],
help='feature name. select from [resnet_glove, clip, clip_slowfast, clip_slowfast_pann, i3d_clip, clap].'
Expand Down