cedadev · m-haines · Jun 10, 2026 · Jul 17, 2025 · Jul 29, 2025 · Jul 29, 2025
diff --git a/.gitignore b/.gitignore
@@ -6,4 +6,5 @@ build
 *.egg-info
 dist
 dump_output
-.DS_Store
+.DS_Store
+/home/
diff --git a/catalog-info.yaml b/catalog-info.yaml
@@ -0,0 +1,21 @@
+---
+apiVersion: backstage.io/v1alpha1
+kind: Component
+metadata:
+  name: fbi-core
+  description: "Core file based indexing (FBI) operations used by the NLA and other CEDA archiving systems."
+  tags:
+    - "priority:high"
+    - "visibility:private"
+    - "manager:sam-pepler"
+    - "deputy-manager:matthew-paice"
+    - "review:passed"
+spec:
+  type: library
+  owner: group:default/developers
+  lifecycle: production
+  system: system:default/nla
+  dependsOn:
+    - resource:default/elasticsearch-index-fbi
+    - resource:default/elasticsearch-index-fbi-annotations
+    - component:default/fbi-annotations
diff --git a/fbi_core/annotate.py b/fbi_core/annotate.py
@@ -5,18 +5,22 @@
 from collections import defaultdict
 
 import elasticsearch
-import requests
 from elasticsearch import Elasticsearch
+import requests
 
-from .conf import APIKEY, ES_HOSTS
+from .conf import load_config
 
-if APIKEY:
-    es = elasticsearch.Elasticsearch(hosts=ES_HOSTS, headers={"x-api-key": APIKEY})
-else:
-    es = elasticsearch.Elasticsearch(hosts=ES_HOSTS)
+user, password, host_es, es_index, es_annotation, _spotlist = load_config()  # load_config() returns six values; the spotlist is not needed here
 
-indexname = "fbi-annotations"
+if user and password:
+    es = Elasticsearch(host_es, basic_auth=(user, password), request_timeout=10)
+else:
+    es = Elasticsearch(host_es)
 
+if es_annotation:
+    indexname = es_annotation
+else:
+    indexname = "fbi-annotations"
 
 def get_moles_records():
     """get moles info from catalogue"""

diff --git a/fbi_core/conf.py b/fbi_core/conf.py
@@ -1,15 +1,53 @@
 import os
-
+from requests import get, codes
 import yaml
 
-conf_file = os.path.join(os.environ["HOME"], ".fbi.yml")
-
-if os.path.exists(conf_file):
-    conf = yaml.load(open(conf_file), Loader=yaml.Loader)
-    USERNAME = conf["ES"]["username"]
-    PASSWORD = conf["ES"]["password"]
-    ES_HOSTS = conf["ES"].get("hosts", ["https://elasticsearch.ceda.ac.uk:443"])
-else:
-    USERNAME = None
-    PASSWORD = None
-    ES_HOSTS = ["https://elasticsearch.ceda.ac.uk:443"]
+def load_config():
+    """Load Elasticsearch settings and the storage-d spotlist.
+
+    Reads ``.fbi.yml`` from ``$HOME`` (``./home/`` when HOME is unset, e.g. for
+    testing) and fetches the spotlist over HTTP, falling back to
+    ``spotlist-cache.yml`` in the same directory if the request fails.
+
+    :return: (username, password, host_es, es_index, es_annotation, spotlist);
+        spotlist is None when neither the service nor the cache is available.
+    """
+    from requests import RequestException  # local import: only needed by this function
+
+    home = os.environ.get("HOME", "./home/")
+    # Defaults used when the config file is absent.
+    username = password = None
+    host_es = "https://elasticsearch.ceda.ac.uk:443"
+    es_index, es_annotation = "fbi-2022", "fbi-annotations"
+
+    conf_file = os.path.join(home, ".fbi.yml")
+    if os.path.exists(conf_file):
+        with open(conf_file) as stream:
+            # NOTE(review): yaml.Loader can construct arbitrary objects; only acceptable for a trusted local file.
+            es_conf = yaml.load(stream, Loader=yaml.Loader)["ES"]
+        username = es_conf["user"]
+        password = es_conf["password"]
+        host_es = es_conf["host"]
+        es_index = es_conf.get("index_fbi", es_index)
+        es_annotation = es_conf.get("index_fbi_annotation", es_annotation)
+
+    # Get spotlist on storage-d
+    spotlist_url = "https://cedaarchiveapp.ceda.ac.uk/storage-d/spotlist"
+    # The timeout and handler stop an unreachable service from hanging or crashing callers.
+    try:
+        spots_page = get(spotlist_url, timeout=10)
+        problem = None if spots_page.status_code == codes.ok else spots_page.status_code
+    except RequestException as err:
+        problem = err
+    if problem is None:
+        return username, password, host_es, es_index, es_annotation, spots_page.text.splitlines()
+
+    # status_code is an int, so it is formatted into the message rather than concatenated.
+    print(f"{problem} Warning, error obtaining spotlist on storage-d  from {spotlist_url}, using cached values from 05-Dec-2025")
+    spotlist = None
+    spotlist_file = os.path.join(home, "spotlist-cache.yml")
+    if os.path.exists(spotlist_file):
+        with open(spotlist_file) as stream:
+            spotlist = yaml.load(stream, Loader=yaml.Loader)["spotlist_cache"]
+    return username, password, host_es, es_index, es_annotation, spotlist
+
diff --git a/fbi_core/fbi_tools.py b/fbi_core/fbi_tools.py
@@ -5,16 +5,22 @@
 from datetime import datetime
 
 import elasticsearch
+from elasticsearch import Elasticsearch
 from elasticsearch.helpers import scan
 
-from .conf import ES_HOSTS, PASSWORD, USERNAME
+from .conf import load_config
 
-if USERNAME:
-    es = elasticsearch.Elasticsearch(hosts=ES_HOSTS, basic_auth=(USERNAME, PASSWORD))
+user, password, host_es, es_index, es_annotation, spotlist = load_config()
+
+if user and password:
+    es = Elasticsearch(hosts=host_es, basic_auth=(user, password), request_timeout=10)
 else:
-    es = elasticsearch.Elasticsearch(hosts=ES_HOSTS)
+    es = Elasticsearch(hosts=host_es)
 
-indexname = "fbi-2022"
+if es_index:
+    indexname = es_index
+else:
+    indexname = "fbi-2022"
 
 
 def fbi_records(
@@ -449,7 +455,7 @@ def splits(path, batch_size=10000000, **kwargs):
 
 def make_dirs(directory):
     """
-    Make FBI records for a diretory and any missing parent directories.
+    Make FBI records for a directory and any missing parent directories.
 
     :param str directory: The directory to add.
     """
@@ -503,7 +509,7 @@ def fbi_listdir(
 
 
 def insert_item(record):
-    """Insert record by replaceing it"""
+    """Insert record by replacing it"""
     record_id = _create_id(record["path"])
     try:
         es.delete(index=indexname, id=record_id)
@@ -513,7 +519,7 @@ def insert_item(record):
 
 
 def update_item(record):
-    """Update a single document - overwrite feilds in record suplied."""
+    """Update a single document - overwrite fields in record supplied."""
     document = {"doc": record, "doc_as_upsert": True}
     es.update(
         index=indexname,

diff --git a/fbi_loc_flow_diagram.md b/fbi_loc_flow_diagram.md
@@ -0,0 +1,102 @@
+```mermaid
+---
+title: NLA flow diagram with fbi interactions, specifically location amendments
+---
+
+graph TB
+%% Disks
+external_data@{ shape: docs, label: "External data" }
+arrivals_service@{ shape: lin-cyl, label: "Arrivals Service \n Disk" }
+data_processing@{ shape: lin-cyl, label: "Datacentre Processing \n Disk" }
+CEDA_archive@{ shape: lin-cyl, label: "CEDA Archive \n Disk" }
+tape_archive@{ shape: lin-cyl, label: "CEDA Backup \n Tape" }
+cloud@{ shape: lin-cyl, label: "CEDA Cloud \n Cloud" }
+s3@{ shape: lin-cyl, label: "CEDA Object Store \n Amazon s3" }
+nla_cache@{ shape: lin-cyl, label: "NLA Cache \n Disk" }
+
+
+%% Databases
+archiveapp_db@{ shape: cyl, label: "cedaarchiveapp \n Database" }
+nla_control_db@{ shape: cyl, label: "NLA Control \n Database" }
+
+%% Elasticsearch / RabbitMQ
+fbi_index@{ shape: docs, label: "fbi-2022 \n Elasticsearch \n index" }
+rabbit_deposit@{ shape: docs, label: "f \n RabbitMQ" }
+
+%% Processes (inc repository where necessary)
+temp@{ shape: rect, label: "temp" }
+temp2@{ shape: rect, label: "temp" }
+temp3@{ shape: rect, label: "temp" }
+temp4@{ shape: rect, label: "temp" }
+temp5@{ shape: rect, label: "temp" }
+temp6@{ shape: rect, label: "temp" }
+temp7@{ shape: rect, label: "temp cloud" }
+temp8@{ shape: rect, label: "temp" }
+
+%%Flowchart general
+%% Archive to tape
+CEDA_archive --> temp6
+temp6 --> tape_archive
+
+%% Archive to cloud
+CEDA_archive --> temp7
+temp7 --> cloud
+
+%% Tape to NLA_cache
+tape_archive --> temp4
+temp4 --> nla_cache
+
+%% Archive to ObjectStore
+CEDA_archive --> temp8
+temp8 --> s3
+
+
+%% Flowchart storage
+subgraph storage[CEDA Storage]
+external_data -- "API" --> arrivals_service
+external_data -- "Received data" --> data_processing
+arrivals_service -- "scripts" --> CEDA_archive
+arrivals_service -- "scripts" --> data_processing
+data_processing -- "data scientists" --> CEDA_archive
+tape_archive
+cloud
+s3
+nla_cache
+end
+
+
+%% Servers/ k8s (need subgraphs for each of these with processes within them)
+subgraph deposit_server[Deposit Server]
+temp
+end
+
+subgraph ingest_machine[Ingest VM]
+temp2
+end
+
+subgraph ceda_aa[cedaarchiveapp VM]
+temp3
+end
+
+subgraph nla_vm[NLA VM]
+temp4
+end
+
+subgraph fbi_deposit[FBI Deposit k8s]
+temp5
+end
+
+subgraph backup[Backup VM]
+temp6
+end
+
+subgraph cloud_processing[Cloud Processing]
+temp7
+end
+
+subgraph s3_processing["Object Store (S3) Processing"]
+temp8
+end
+
+
+```