Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
ea0e47b
Added ability to import config defined fbi_index and fbi_annotations …
m-haines Jul 17, 2025
a0d80bc
Removed ceda-elasticsearch-client as it isn't needed and is making up…
m-haines Jul 29, 2025
080703b
Changed how conf.py worked, function call rather than passing variabl…
m-haines Jul 29, 2025
650df60
Further work to make fbi-core work with conf.py being a function rath…
m-haines Jul 29, 2025
9f632d7
Continuing to try and get fbi_indexer working in Kubernetes through s…
m-haines Jul 29, 2025
6777e91
Missed a comma in setup.py
m-haines Jul 29, 2025
14cb282
Added host_es, es_index and es_annotation definitions if the conf_fil…
m-haines Jul 29, 2025
feb0b04
Very quick adjustment of requirements.txt file to pin elasticsearch t…
m-haines Jul 30, 2025
a250837
Added pyproject.toml and poetry.lock to try and stop elasticsearch 9 …
m-haines Jul 30, 2025
ac22bc6
Corrected versioning for fbi-core in pyproject.toml
m-haines Jul 30, 2025
a0abc9d
Fixed pyproject.toml so that poetry.lock could be populated
m-haines Jul 30, 2025
f7ecf47
Initial fbi_flow_diagram with fbi index interaction points, especiall…
m-haines Aug 14, 2025
49e40b2
Further updates to flow diagram
m-haines Aug 14, 2025
0dbf549
Added catalog-info for Backstage documentation to fbi-core
m-haines Oct 23, 2025
8f0ee07
Added elasticsearch-index-fbi to dependencies in catalog-info
m-haines Oct 23, 2025
9e8ac07
Added fbi-annotations to fbi-core dependencies on backstage in catalo…
m-haines Oct 23, 2025
b5c9b5a
Updated conf.py (added requests and os), updated to Python 3.13 or gr…
m-haines Apr 20, 2026
0e6a06c
Merge remote-tracking branch 'origin/main' into index-to-config
m-haines May 13, 2026
87451c7
Changed to new es basic_auth method
m-haines May 13, 2026
e18047b
Changed to use user and password and not API key
m-haines Jun 10, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,5 @@ build
*.egg-info
dist
dump_output
.DS_Store
.DS_Store
/home/
21 changes: 21 additions & 0 deletions catalog-info.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
---
apiVersion: backstage.io/v1alpha1
kind: Component
metadata:
name: fbi-core
description: "Core file based indexing (FBI) operations used by the NLA and other CEDA archiving systems."
tags:
- "priority:high"
- "visibility:private"
- "manager:sam-pepler"
- "deputy-manager:matthew-paice"
- "review:passed"
spec:
type: library
owner: group:default/developers
lifecycle: production
system: system:default/nla
dependsOn:
- resource:default/elasticsearch-index-fbi
- resource:default/elasticsearch-index-fbi-annotations
- component:default/fbi-annotations
18 changes: 11 additions & 7 deletions fbi_core/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,22 @@
from collections import defaultdict

import elasticsearch
import requests
from elasticsearch import Elasticsearch
import requests

from .conf import APIKEY, ES_HOSTS
from .conf import load_config

if APIKEY:
es = elasticsearch.Elasticsearch(hosts=ES_HOSTS, headers={"x-api-key": APIKEY})
else:
es = elasticsearch.Elasticsearch(hosts=ES_HOSTS)
# load_config() returns the Elasticsearch settings followed by the storage-d
# spotlist. This module only needs the first five values; the starred target
# absorbs anything after them so the unpack cannot fail with
# "too many values to unpack" when load_config() returns extra items.
user, password, host_es, es_index, es_annotation, *_ = load_config()

indexname = "fbi-annotations"
# Build the Elasticsearch client. Basic auth (with a 10 second request
# timeout) is used only when both a user and a password are configured;
# otherwise an anonymous client with default settings is created.
es = (
    Elasticsearch(host_es, basic_auth=(user, password), request_timeout=10)
    if user and password
    else Elasticsearch(host_es)
)

# Use the configured annotations index, falling back to the default name
# when the configured value is empty or missing.
indexname = es_annotation or "fbi-annotations"

def get_moles_records():
"""get moles info from catalogue"""
Expand Down
62 changes: 50 additions & 12 deletions fbi_core/conf.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,53 @@
import os

from requests import get, codes
import yaml

conf_file = os.path.join(os.environ["HOME"], ".fbi.yml")

if os.path.exists(conf_file):
conf = yaml.load(open(conf_file), Loader=yaml.Loader)
USERNAME = conf["ES"]["username"]
PASSWORD = conf["ES"]["password"]
ES_HOSTS = conf["ES"].get("hosts", ["https://elasticsearch.ceda.ac.uk:443"])
else:
USERNAME = None
PASSWORD = None
ES_HOSTS = ["https://elasticsearch.ceda.ac.uk:443"]
def _home_path(filename):
    """Return the path of *filename* under $HOME, or under ./home/ if HOME is unset."""
    # HOME is set in the Linux production environment. The ./home/ fallback is
    # for testing on systems (e.g. Windows) where HOME is not defined.
    return os.path.join(os.environ.get("HOME", "./home/"), filename)


def _read_yaml(path):
    """Parse the YAML file at *path* and close the file handle afterwards."""
    # NOTE(review): yaml.Loader can construct arbitrary Python objects. It is
    # kept here to preserve existing behaviour for operator-controlled files;
    # consider yaml.safe_load if these files could ever be untrusted.
    with open(path) as stream:
        return yaml.load(stream, Loader=yaml.Loader)


def load_config():
    """Load Elasticsearch settings and the storage-d spotlist.

    Elasticsearch settings are read from ``.fbi.yml`` in the home directory
    (``ES.user``, ``ES.password``, ``ES.host`` and the optional
    ``ES.index_fbi`` / ``ES.index_fbi_annotation`` keys). When the file does
    not exist, anonymous access to the default host and index names is used.

    The spotlist is fetched over HTTP. If the request fails or returns a
    non-OK status, a warning is printed and the list is read from the
    ``spotlist_cache`` key of ``spotlist-cache.yml`` in the home directory,
    or set to ``None`` when that cache file does not exist.

    :return: ``(username, password, host_es, es_index, es_annotation, spotlist)``
    :raises KeyError: if ``.fbi.yml`` exists but lacks the ``ES`` section or
        its ``user``, ``password`` or ``host`` keys.
    """
    # Local import: the module only imports ``get`` and ``codes`` from requests.
    from requests import RequestException

    conf_file = _home_path(".fbi.yml")
    if os.path.exists(conf_file):
        es_conf = _read_yaml(conf_file)["ES"]
        username = es_conf["user"]
        password = es_conf["password"]
        host_es = es_conf["host"]
        es_index = es_conf.get("index_fbi", "fbi-2022")
        es_annotation = es_conf.get("index_fbi_annotation", "fbi-annotations")
    else:
        username = None
        password = None
        host_es = "https://elasticsearch.ceda.ac.uk:443"
        es_index = "fbi-2022"
        es_annotation = "fbi-annotations"

    # Get spotlist on storage-d
    spotlist_url = "https://cedaarchiveapp.ceda.ac.uk/storage-d/spotlist"
    try:
        # A timeout stops a hung server from blocking every importer forever.
        spots_page = get(spotlist_url, timeout=10)
        failure = None if spots_page.status_code == codes.ok else spots_page.status_code
    except RequestException as err:
        # Treat transport errors like a bad status so the cache is still used.
        spots_page = None
        failure = err

    if failure is None:
        spotlist = spots_page.text.splitlines()
    else:
        # f-string formatting: the status code is an int, so concatenating it
        # to the message with ``+`` would raise TypeError.
        print(
            f"{failure} Warning, error obtaining spotlist on storage-d from "
            f"{spotlist_url}, using cached values from 05-Dec-2025"
        )
        spotlist_file = _home_path("spotlist-cache.yml")
        if os.path.exists(spotlist_file):
            spotlist = _read_yaml(spotlist_file)["spotlist_cache"]
        else:
            spotlist = None

    return username, password, host_es, es_index, es_annotation, spotlist

22 changes: 14 additions & 8 deletions fbi_core/fbi_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,22 @@
from datetime import datetime

import elasticsearch
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan

from .conf import ES_HOSTS, PASSWORD, USERNAME
from .conf import load_config

if USERNAME:
es = elasticsearch.Elasticsearch(hosts=ES_HOSTS, basic_auth=(USERNAME, PASSWORD))
user, password, host_es, es_index, es_annotation, spotlist = load_config()

if user and password:
es = Elasticsearch(hosts=host_es, basic_auth=(user, password), request_timeout=10)
else:
es = elasticsearch.Elasticsearch(hosts=ES_HOSTS)
es = Elasticsearch(hosts=host_es)

indexname = "fbi-2022"
# Use the configured FBI index, falling back to the default name when the
# configured value is empty or missing.
indexname = es_index or "fbi-2022"


def fbi_records(
Expand Down Expand Up @@ -449,7 +455,7 @@ def splits(path, batch_size=10000000, **kwargs):

def make_dirs(directory):
"""
Make FBI records for a diretory and any missing parent directories.
Make FBI records for a directory and any missing parent directories.

:param str directory: The directory to add.
"""
Expand Down Expand Up @@ -503,7 +509,7 @@ def fbi_listdir(


def insert_item(record):
"""Insert record by replaceing it"""
"""Insert record by replacing it"""
record_id = _create_id(record["path"])
try:
es.delete(index=indexname, id=record_id)
Expand All @@ -513,7 +519,7 @@ def insert_item(record):


def update_item(record):
"""Update a single document - overwrite feilds in record suplied."""
"""Update a single document - overwrite fields in record supplied."""
document = {"doc": record, "doc_as_upsert": True}
es.update(
index=indexname,
Expand Down
102 changes: 102 additions & 0 deletions fbi_loc_flow_diagram.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
```mermaid
---
title: NLA flow diagram with fbi interactions, specifically location amendments
---

graph TB
%% Disks
external_data@{ shape: docs, label: "External data" }
arrivals_service@{ shape: lin-cyl, label: "Arrivals Service \n Disk" }
data_processing@{ shape: lin-cyl, label: "Datacentre Processing \n Disk" }
CEDA_archive@{ shape: lin-cyl, label: "CEDA Archive \n Disk" }
tape_archive@{ shape: lin-cyl, label: "CEDA Backup \n Tape" }
cloud@{ shape: lin-cyl, label: "CEDA Cloud \n Cloud" }
s3@{ shape: lin-cyl, label: "CEDA Object Store \n Amazon s3" }
nla_cache@{ shape: lin-cyl, label: "NLA Cache \n Disk" }


%% Databases
archiveapp_db@{ shape: cyl, label: "cedaarchiveapp \n Database" }
nla_control_db@{ shape: cyl, label: "NLA Control \n Database" }

%% Elasticsearch / RabbitMQ
fbi_index@{ shape: docs, label: "fbi-2022 \n Elasticsearch \n index" }
rabbit_deposit@{ shape: docs, label: "f \n RabbitMQ" }

%% Processes (inc repository where necessary)
temp@{ shape: rect, label: "temp" }
temp2@{ shape: rect, label: "temp" }
temp3@{ shape: rect, label: "temp" }
temp4@{ shape: rect, label: "temp" }
temp5@{ shape: rect, label: "temp" }
temp6@{ shape: rect, label: "temp" }
temp7@{ shape: rect, label: "temp cloud" }
temp8@{ shape: rect, label: "temp" }

%%Flowchart general
%% Archive to tape
CEDA_archive --> temp6
temp6 --> tape_archive

%% Archive to cloud
CEDA_archive --> temp7
temp7 --> cloud

%% Tape to NLA_cache
tape_archive --> temp4
temp4 --> nla_cache

%% Archive to ObjectStore
CEDA_archive --> temp8
temp8 --> s3


%% Flowchart storage
subgraph storage[CEDA Storage]
external_data -- "API" --> arrivals_service
external_data -- "Received data" --> data_processing
arrivals_service -- "scripts" --> CEDA_archive
arrivals_service -- "scripts" --> data_processing
data_processing -- "data scientists" --> CEDA_archive
tape_archive
cloud
s3
nla_cache
end


%% Servers/ k8s (need subgraphs for each of these with processes within them)
subgraph deposit_server[Deposit Server]
temp
end

subgraph ingest_machine[Ingest VM]
temp2
end

subgraph ceda_aa[cedaarchiveapp VM]
temp3
end

subgraph nla_vm[NLA VM]
temp4
end

subgraph fbi_deposit[FBI Deposit k8s]
temp5
end

subgraph backup[Backup VM]
temp6
end

subgraph cloud_processing[Cloud Processing]
temp7
end

subgraph s3_processing["Object Store (S3) Processing"]
temp8
end


```
Loading
Loading