From b6a752261a916ce1948d87d8e350ede2ff28ea85 Mon Sep 17 00:00:00 2001 From: alexpetit Date: Sun, 1 Mar 2026 20:07:06 +0100 Subject: [PATCH 1/5] feat: ajout des fonctions pour l'ingestion API --- biolit/export_api.py | 65 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 biolit/export_api.py diff --git a/biolit/export_api.py b/biolit/export_api.py new file mode 100644 index 0000000..0697ba0 --- /dev/null +++ b/biolit/export_api.py @@ -0,0 +1,65 @@ +import polars as pl +import requests + +###Test export from API + +def fetch_biolit_api(per_page: int = 1000): + all_data = [] + page = 1 + print("Téléchargement des données depuis l'API Biolit...") + while True: + url = f"https://biolit.fr/wp-json/biolitapi/v1/observations/all?per_page={per_page}&page={page}" + r = requests.get(url) + data = r.json() + if not data: + break + all_data.extend(data) + page += 1 + print(f"Page {page} téléchargée, total observations : {len(all_data)}") + + if not all_data: + return pl.DataFrame([]) + df = pl.DataFrame(all_data) + print(df.head()) + print(df.shape) + return df + + +def adapt_api_to_parquet_schema(df): + return ( + df.rename({ + "id": "id", + "link": "lien", + "author": "auteur", + "date": "date", + "heure-debut": "heure-de-debut", + "heure-fin": "heure-de-fin", + "latitude": "latitude", + "longitude": "longitude", + "photos": "images", + "espece": "nom_scientifique", + "common": "nom_commun", + }) + .with_columns([ + pl.col("lien").str.split("/").list.get(-1).alias("titre"), # dernier segment du lien + pl.lit("").alias("validee"), + pl.lit("TBD").alias("espece_identifiable_?"), + pl.lit("API").alias("protocole"), + ]) + ) +def load_biolit_from_api() -> pl.DataFrame: + df_api = fetch_biolit_api() + if df_api.is_empty(): + return df_api + print(adapt_api_to_parquet_schema(df_api).head()) + print(adapt_api_to_parquet_schema(df_api).columns) + return adapt_api_to_parquet_schema(df_api) + +def format_observations_from_api(): +### A 
faire plus tard + return + + +if __name__ == "__main__": + print("Script lancé") + load_biolit_from_api() \ No newline at end of file From f2be0e8854d3e3f00cc593a60b54fcc46ff49b4b Mon Sep 17 00:00:00 2001 From: alexpetit Date: Sun, 15 Mar 2026 12:49:14 +0100 Subject: [PATCH 2/5] nouvelle API URL, modification des champs avec tous les champs de l'API, ajout de test(limite aux 5 premieres pages pour le moment) --- biolit/export_api.py | 165 ++++++++++++++++++++++++++------------- tests/test_export_api.py | 69 ++++++++++++++++ 2 files changed, 178 insertions(+), 56 deletions(-) create mode 100644 tests/test_export_api.py diff --git a/biolit/export_api.py b/biolit/export_api.py index 0697ba0..93ae428 100644 --- a/biolit/export_api.py +++ b/biolit/export_api.py @@ -1,65 +1,118 @@ -import polars as pl import requests +import polars as pl +import structlog + +LOGGER = structlog.get_logger() -###Test export from API +# ------------------------------ +# Helper pour récupérer une clé dans meta +# ------------------------------ +def get_meta(meta: dict, key: str): + """Retourne la première valeur d'une clé meta, ou None si absente""" + if not meta: + return None + value = meta.get(key) + if isinstance(value, list) and value: + return value[0] + return value -def fetch_biolit_api(per_page: int = 1000): + +# ------------------------------ +# FETCH API +# ------------------------------ +def fetch_biolit_from_api(per_page=100, max_pages=5): + """ + Récupère les observations depuis l'API Biolit. + Limite par défaut à max_pages pour éviter les 150+ pages. 
+ """ + url_base = "https://biolit.fr/wp-json/biolitapi/v1/observations" all_data = [] - page = 1 - print("Téléchargement des données depuis l'API Biolit...") - while True: - url = f"https://biolit.fr/wp-json/biolitapi/v1/observations/all?per_page={per_page}&page={page}" - r = requests.get(url) - data = r.json() + + for page in range(1, max_pages + 1): + url = f"{url_base}?per_page={per_page}&page={page}" + LOGGER.info(f"Fetching page {page} from API") + response = requests.get(url) + response.raise_for_status() + data = response.json() if not data: break all_data.extend(data) - page += 1 - print(f"Page {page} téléchargée, total observations : {len(all_data)}") - - if not all_data: - return pl.DataFrame([]) - df = pl.DataFrame(all_data) - print(df.head()) - print(df.shape) - return df + LOGGER.info(f"Fetched {len(all_data)} observations total") + return all_data + + +# ------------------------------ +# ADAPT API -> PARQUET +# ------------------------------ +def adapt_api_to_parquet_schema(data: list) -> pl.DataFrame: + """ + Transforme la structure API Biolit en DataFrame pour parquet. 
+ """ + rows = [] + + for item in data: + obs = item.get("observation", {}) + meta = obs.get("meta", {}) + parents = item.get("parents", {}) + especes = item.get("especes", []) + + quadra = parents.get("quadra", {}) + abb = parents.get("abb", {}) + quadra_meta = quadra.get("meta", {}) + abb_meta = abb.get("meta", {}) + + # Gestion des espèces + nom_scientifique = None + nom_commun = None + nombre_mollusques = None + + if especes: + nom_scientifique = especes[0].get("nom") + nombre_mollusques = especes[0].get("nombre_presents") + + row = { + # Niveau N1 (Quadra) + "protocole": get_meta(meta, "jet_tax__protocole"), + "ID - N1": quadra.get("ID"), + "titre - N1": quadra.get("title"), + "lien - N1": get_meta(meta, "_url_sortie"), + "auteur - N1": None, + "images - N1": None, + "date - N1": obs.get("date"), + "heure-de-debut - N1": get_meta(meta, "heure-debut"), + "heure-de-fin - N1": get_meta(meta, "heure-fin"), + "latitude - N1": get_meta(meta, "latitude"), + "longitude - N1": get_meta(meta, "longitude"), + "relais-local - N1": get_meta(abb_meta, "relais-local"), + "nom du lieu - N1": get_meta(abb_meta, "nom-du-lieu-abb"), + + # Observation + "ID - observation": obs.get("ID"), + "titre - observation": obs.get("title"), + "lien - observation": obs.get("link"), + "Nom scientifique - observation": nom_scientifique, + "Nom commun - observation": nom_commun, + "programme espèce": get_meta(meta, "jet_tax__categorie-programme"), + "images - observation": obs.get("images"), + "nombre de mollusques - observation": nombre_mollusques, + "validee - observation": get_meta(meta, "validee"), + "espece identifiable ? - observation": get_meta(meta, "espece-identifiee"), + } + + rows.append(row) + + return pl.DataFrame(rows) + + +# ------------------------------ +# LOAD (Fetch + Adapt) +# ------------------------------ +def load_biolit_from_api(per_page=100, max_pages=5) -> pl.DataFrame: + """ + Récupère et transforme les données Biolit depuis l'API. 
+ """ + raw_data = fetch_biolit_from_api(per_page=per_page, max_pages=max_pages) + df = adapt_api_to_parquet_schema(raw_data) + return df -def adapt_api_to_parquet_schema(df): - return ( - df.rename({ - "id": "id", - "link": "lien", - "author": "auteur", - "date": "date", - "heure-debut": "heure-de-debut", - "heure-fin": "heure-de-fin", - "latitude": "latitude", - "longitude": "longitude", - "photos": "images", - "espece": "nom_scientifique", - "common": "nom_commun", - }) - .with_columns([ - pl.col("lien").str.split("/").list.get(-1).alias("titre"), # dernier segment du lien - pl.lit("").alias("validee"), - pl.lit("TBD").alias("espece_identifiable_?"), - pl.lit("API").alias("protocole"), - ]) - ) -def load_biolit_from_api() -> pl.DataFrame: - df_api = fetch_biolit_api() - if df_api.is_empty(): - return df_api - print(adapt_api_to_parquet_schema(df_api).head()) - print(adapt_api_to_parquet_schema(df_api).columns) - return adapt_api_to_parquet_schema(df_api) - -def format_observations_from_api(): -### A faire plus tard - return - - -if __name__ == "__main__": - print("Script lancé") - load_biolit_from_api() \ No newline at end of file diff --git a/tests/test_export_api.py b/tests/test_export_api.py new file mode 100644 index 0000000..0cea68a --- /dev/null +++ b/tests/test_export_api.py @@ -0,0 +1,69 @@ + +from biolit.export_api import fetch_biolit_from_api, load_biolit_from_api, adapt_api_to_parquet_schema + +# ------------------------- +# Fonctions d'inspection +# ------------------------- + +def inspect_api_structure(raw_data): + """Affiche les clés du niveau supérieur dans l'API""" + keys = set() + for item in raw_data: + keys.update(item.keys()) + print("TOP LEVEL KEYS:") + for k in sorted(keys): + print("-", k) + +def inspect_meta_keys(raw_data): + """Affiche toutes les clés présentes dans le champ 'meta' de chaque observation""" + meta_keys = set() + for item in raw_data: + if "observation" not in item: + continue + meta = item["observation"].get("meta", {}) + 
meta_keys.update(meta.keys()) + print("META KEYS:") + for k in sorted(meta_keys): + print("-", k) + +def inspect_meta_values(raw_data, field): + """Affiche les valeurs et leur quantité pour un champ du meta""" + values = {} + for item in raw_data: + if "observation" not in item: + continue + meta = item["observation"].get("meta", {}) + val = meta.get(field) + if isinstance(val, list) and val: + val = val[0] + values[val] = values.get(val, 0) + 1 + print(f"\nVALUES FOR {field}:") + for k, v in values.items(): + print(k, ":", v) + +# ------------------------- +# Tests +# ------------------------- + +def test_fetch_small_sample(): + """Test fetch API avec un petit nombre de pages pour debug""" + raw_data = fetch_biolit_from_api(per_page=1000, max_pages=5) # petit sample + assert isinstance(raw_data, list) + print(f"\nNombre total d'éléments récupérés: {len(raw_data)}") + inspect_api_structure(raw_data) + inspect_meta_keys(raw_data) + inspect_meta_values(raw_data, "validee") + +def test_load_and_adapt(): + """Test l'adaptation de l'API vers le schéma parquet""" + raw_data = fetch_biolit_from_api(per_page=1000, max_pages=5) + df = adapt_api_to_parquet_schema(raw_data) + print(f"\nDataframe adapté avec {len(df)} lignes") + print(df.head(10)) # affiche les 10 premières lignes pour vérification + + +if __name__ == "__main__": + print("=== TEST FETCH SMALL SAMPLE ===") + test_fetch_small_sample() + print("\n=== TEST LOAD AND ADAPT ===") + test_load_and_adapt() \ No newline at end of file From a04b0ed253b024c7d2e9c33510d96381d6964452 Mon Sep 17 00:00:00 2001 From: alexpetit Date: Tue, 17 Mar 2026 21:38:07 +0100 Subject: [PATCH 3/5] Fix pre-commit issues and update API processing --- biolit/export_api.py | 1 - tests/test_export_api.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/biolit/export_api.py b/biolit/export_api.py index 93ae428..f3a99a0 100644 --- a/biolit/export_api.py +++ b/biolit/export_api.py @@ -59,7 +59,6 @@ def 
adapt_api_to_parquet_schema(data: list) -> pl.DataFrame: quadra = parents.get("quadra", {}) abb = parents.get("abb", {}) - quadra_meta = quadra.get("meta", {}) abb_meta = abb.get("meta", {}) # Gestion des espèces diff --git a/tests/test_export_api.py b/tests/test_export_api.py index 0cea68a..8560417 100644 --- a/tests/test_export_api.py +++ b/tests/test_export_api.py @@ -1,5 +1,5 @@ -from biolit.export_api import fetch_biolit_from_api, load_biolit_from_api, adapt_api_to_parquet_schema +from biolit.export_api import fetch_biolit_from_api, adapt_api_to_parquet_schema # ------------------------- # Fonctions d'inspection From b9414c9a09b4f2fd11505fafcd89dbe7ee406341 Mon Sep 17 00:00:00 2001 From: alexpetit Date: Sun, 22 Mar 2026 17:03:53 +0100 Subject: [PATCH 4/5] Changement de l'API + ajout de postgre --- biolit/export_api.py | 152 ++++++++++++----------------- biolit/postgres.py | 124 +++++++++++++++++++++++ ml/yolov8_DINO/README.md | 2 +- pipelines/run.py | 22 +++++ pyproject.toml | 3 + tests/test_export_api.py | 206 ++++++++++++++++++++++++++++----------- uv.lock | 90 +++++++++++++++++ 7 files changed, 452 insertions(+), 147 deletions(-) create mode 100644 biolit/postgres.py create mode 100644 pipelines/run.py diff --git a/biolit/export_api.py b/biolit/export_api.py index f3a99a0..6c4a510 100644 --- a/biolit/export_api.py +++ b/biolit/export_api.py @@ -1,117 +1,89 @@ import requests import polars as pl import structlog +import re +import os LOGGER = structlog.get_logger() # ------------------------------ -# Helper pour récupérer une clé dans meta +# FETCH API # ------------------------------ -def get_meta(meta: dict, key: str): - """Retourne la première valeur d'une clé meta, ou None si absente""" - if not meta: - return None - value = meta.get(key) - if isinstance(value, list) and value: - return value[0] - return value +def fetch_biolit_from_api(): + + url = os.getenv("BIOLIT_API_URL") + + response = requests.get(url) + response.raise_for_status() + data = 
response.json() + + print(f"{len(data)} observations récupérées") + return data # ------------------------------ -# FETCH API +# RENAME OF COLUMNS # ------------------------------ -def fetch_biolit_from_api(per_page=100, max_pages=5): - """ - Récupère les observations depuis l'API Biolit. - Limite par défaut à max_pages pour éviter les 150+ pages. - """ - url_base = "https://biolit.fr/wp-json/biolitapi/v1/observations" - all_data = [] - - for page in range(1, max_pages + 1): - url = f"{url_base}?per_page={per_page}&page={page}" - LOGGER.info(f"Fetching page {page} from API") - response = requests.get(url) - response.raise_for_status() - data = response.json() - if not data: - break - all_data.extend(data) - - LOGGER.info(f"Fetched {len(all_data)} observations total") - return all_data + + +def normalize_column_name(col: str) -> str: + """Convertit les noms API en snake_case propre FR""" + col = col.lower() + col = col.replace("-", "_") + col = col.replace(" ", "_") + col = col.replace("é", "e").replace("è", "e").replace("à", "a") + col = col.replace("ù", "u").replace("ô", "o") + col = re.sub(r"[^a-z0-9_]", "", col) + return col + + +COLUMN_MAPPING = { + "id": "id_observation", + "date": "date_observation", + "link": "lien_observation", + "author": "observateur", + "_url_sortie": "url_sortie", + "espece-identifiee": "espece_identifiee", + "heure-debut": "heure_debut", + "heure-fin": "heure_fin", + "latitude": "latitude", + "longitude": "longitude", + "photos": "photos", + "relais": "relais", + "espece_id": "id_espece", + "espece": "nom_scientifique", + "common": "nom_commun", + "categorie-programme": "categorie_programme", + "programme": "programme", +} # ------------------------------ # ADAPT API -> PARQUET # ------------------------------ -def adapt_api_to_parquet_schema(data: list) -> pl.DataFrame: - """ - Transforme la structure API Biolit en DataFrame pour parquet. 
- """ +def adapt_api_to_dataframe(data: list) -> pl.DataFrame: rows = [] for item in data: - obs = item.get("observation", {}) - meta = obs.get("meta", {}) - parents = item.get("parents", {}) - especes = item.get("especes", []) - - quadra = parents.get("quadra", {}) - abb = parents.get("abb", {}) - abb_meta = abb.get("meta", {}) - - # Gestion des espèces - nom_scientifique = None - nom_commun = None - nombre_mollusques = None - - if especes: - nom_scientifique = especes[0].get("nom") - nombre_mollusques = especes[0].get("nombre_presents") - - row = { - # Niveau N1 (Quadra) - "protocole": get_meta(meta, "jet_tax__protocole"), - "ID - N1": quadra.get("ID"), - "titre - N1": quadra.get("title"), - "lien - N1": get_meta(meta, "_url_sortie"), - "auteur - N1": None, - "images - N1": None, - "date - N1": obs.get("date"), - "heure-de-debut - N1": get_meta(meta, "heure-debut"), - "heure-de-fin - N1": get_meta(meta, "heure-fin"), - "latitude - N1": get_meta(meta, "latitude"), - "longitude - N1": get_meta(meta, "longitude"), - "relais-local - N1": get_meta(abb_meta, "relais-local"), - "nom du lieu - N1": get_meta(abb_meta, "nom-du-lieu-abb"), - - # Observation - "ID - observation": obs.get("ID"), - "titre - observation": obs.get("title"), - "lien - observation": obs.get("link"), - "Nom scientifique - observation": nom_scientifique, - "Nom commun - observation": nom_commun, - "programme espèce": get_meta(meta, "jet_tax__categorie-programme"), - "images - observation": obs.get("images"), - "nombre de mollusques - observation": nombre_mollusques, - "validee - observation": get_meta(meta, "validee"), - "espece identifiable ? 
- observation": get_meta(meta, "espece-identifiee"), - } - - rows.append(row) - - return pl.DataFrame(rows) + new_row = {} + + for key, value in item.items(): + # mapping si connu, sinon normalisation auto + new_key = COLUMN_MAPPING.get(key, normalize_column_name(key)) + new_row[new_key] = value + + rows.append(new_row) + + df = pl.DataFrame(rows) + + return df # ------------------------------ # LOAD (Fetch + Adapt) # ------------------------------ -def load_biolit_from_api(per_page=100, max_pages=5) -> pl.DataFrame: - """ - Récupère et transforme les données Biolit depuis l'API. - """ - raw_data = fetch_biolit_from_api(per_page=per_page, max_pages=max_pages) - df = adapt_api_to_parquet_schema(raw_data) +def load_biolit_from_api() -> pl.DataFrame: + raw_data = fetch_biolit_from_api() + df = adapt_api_to_dataframe(raw_data) return df diff --git a/biolit/postgres.py b/biolit/postgres.py new file mode 100644 index 0000000..247df20 --- /dev/null +++ b/biolit/postgres.py @@ -0,0 +1,124 @@ +import os +import polars as pl +from sqlalchemy import create_engine, text + + +# ------------------------- +# Connexion DB +# ------------------------- + +def get_engine(): + postgres_url = os.getenv("POSTGRES_URL") + + if not postgres_url: + raise ValueError("Missing POSTGRES_URL") + + return create_engine(postgres_url) + + +# ------------------------- +# Préparation des données +# ------------------------- +def prepare_dataframe_for_postgres(df: pl.DataFrame) -> pl.DataFrame: + return df.with_columns([ + + # ------------------------- + # IDs + # ------------------------- + pl.col("id_observation") + .cast(pl.Int64), + + pl.col("id_espece") + .cast(pl.Float64, strict=False) + .fill_nan(None) + .cast(pl.Int64, strict=False), + + pl.col("categorie_programme") + .cast(pl.Float64, strict=False) + .fill_nan(None) + .cast(pl.Int64, strict=False), + + pl.col("relais") + .cast(pl.Utf8) + .replace("", None) + .cast(pl.Float64, strict=False) + .fill_nan(None) + .cast(pl.Int64, strict=False), 
+ + # ------------------------- + # Coordonnées + # ------------------------- + pl.col("latitude") + .cast(pl.Utf8) + .str.strip_chars() + .cast(pl.Float64, strict=False), + + pl.col("longitude") + .cast(pl.Utf8) + .str.strip_chars() + .cast(pl.Float64, strict=False), + + # ------------------------- + # Dates + # ------------------------- + pl.col("date_observation") + .str.strptime(pl.Datetime, strict=False), + + pl.col("heure_debut") + .str.strptime(pl.Time, strict=False), + + pl.col("heure_fin") + .str.strptime(pl.Time, strict=False), + ]) + +# ------------------------- +# Insert avec sécurité (UPSERT) +# ------------------------- + +def insert_dataframe(df: pl.DataFrame): + engine = get_engine() + + rows = df.to_dicts() + + with engine.begin() as conn: + for row in rows: + conn.execute(text(""" + INSERT INTO observations ( + id_observation, + date_observation, + lien_observation, + observateur, + url_sortie, + espece_identifiee, + heure_debut, + heure_fin, + latitude, + longitude, + photos, + relais, + id_espece, + nom_scientifique, + nom_commun, + categorie_programme, + programme + ) VALUES ( + :id_observation, + :date_observation, + :lien_observation, + :observateur, + :url_sortie, + :espece_identifiee, + :heure_debut, + :heure_fin, + :latitude, + :longitude, + :photos, + :relais, + :id_espece, + :nom_scientifique, + :nom_commun, + :categorie_programme, + :programme + ) + ON CONFLICT (id_observation) DO NOTHING + """), row) \ No newline at end of file diff --git a/ml/yolov8_DINO/README.md b/ml/yolov8_DINO/README.md index 64781c8..7005c4b 100644 --- a/ml/yolov8_DINO/README.md +++ b/ml/yolov8_DINO/README.md @@ -16,7 +16,7 @@ Pipeline de constitution du dataset Biolit pour inférence YOLO / Grounding DINO ├── images/ │ ├── identifiable/ │ └── non_identifiable/ # à valider - ├── labels/ + ├── labels/ │ ├── identifiable/ │ └── non_identifiable/ # à valider ├── metadata.csv # GroundingDINO diff --git a/pipelines/run.py b/pipelines/run.py new file mode 100644 index 
0000000..e63d662 --- /dev/null +++ b/pipelines/run.py @@ -0,0 +1,22 @@ +from biolit.export_api import fetch_biolit_from_api, adapt_api_to_dataframe +from biolit.postgres import prepare_dataframe_for_postgres, insert_dataframe + + +def run_pipeline(): + print("Fetching data...") + data = fetch_biolit_from_api() + + print("Transforming...") + df = adapt_api_to_dataframe(data) + + print("Preparing for Postgres...") + df = prepare_dataframe_for_postgres(df) + + print("Loading into Postgres...") + insert_dataframe(df) + + print("DONE ✅") + + +if __name__ == "__main__": + run_pipeline() \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 2536fd8..9df7cb0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,9 +21,12 @@ dependencies = [ "plotly>=6.5.0", "polars>=1.36.1", "pre-commit>=4.5.1", + "psycopg2-binary>=2.9.11", + "pyarrow>=23.0.1", "pytest>=9.0.2", "requests>=2.32.3", "ruff>=0.14.10", + "sqlalchemy>=2.0.44", "structlog>=25.5.0", "tqdm>=4.67.3", "transformers>=5.2.0", diff --git a/tests/test_export_api.py b/tests/test_export_api.py index 8560417..19ca0f0 100644 --- a/tests/test_export_api.py +++ b/tests/test_export_api.py @@ -1,69 +1,163 @@ +from biolit.export_api import fetch_biolit_from_api, adapt_api_to_dataframe + +# ------------------------- +# Tests API +# ------------------------- + +def test_fetch_api_returns_data(): + """Vérifie que l'API retourne bien des données""" + data = fetch_biolit_from_api() + + assert isinstance(data, list) + assert len(data) > 0 + + print(f"\n✅ {len(data)} observations récupérées") + + +def test_fetch_api_structure(): + """Vérifie la structure des données API""" + data = fetch_biolit_from_api() + sample = data[0] + + expected_keys = { + "id", + "date", + "link", + "author", + "_url_sortie", + "espece-identifiee", + "heure-debut", + "heure-fin", + "latitude", + "longitude", + "photos", + "relais", + "espece_id", + "espece", + "common", + } + + missing_keys = expected_keys - set(sample.keys()) + + 
assert len(missing_keys) == 0, f"Champs manquants: {missing_keys}" + + print("\n✅ Structure API valide") + + +# ------------------------- +# Tests transformation +# ------------------------- + +def test_adapt_to_dataframe(): + """Vérifie la transformation en DataFrame""" + data = fetch_biolit_from_api() + df = adapt_api_to_dataframe(data) + + assert df.shape[0] > 0 + assert df.shape[1] > 0 + + print(f"\n✅ DataFrame: {df.shape[0]} lignes, {df.shape[1]} colonnes") + + +def test_expected_columns_present(): + """Vérifie les colonnes critiques""" + data = fetch_biolit_from_api() + df = adapt_api_to_dataframe(data) + + expected_columns = { + "id_observation", + "date_observation", + "nom_scientifique", + "nom_commun", + "latitude", + "longitude", + } + + missing = expected_columns - set(df.columns) + + assert len(missing) == 0, f"Colonnes manquantes: {missing}" + + print("\n✅ Colonnes critiques présentes") -from biolit.export_api import fetch_biolit_from_api, adapt_api_to_parquet_schema # ------------------------- -# Fonctions d'inspection +# Tests qualité des données # ------------------------- -def inspect_api_structure(raw_data): - """Affiche les clés du niveau supérieur dans l'API""" - keys = set() - for item in raw_data: - keys.update(item.keys()) - print("TOP LEVEL KEYS:") - for k in sorted(keys): - print("-", k) - -def inspect_meta_keys(raw_data): - """Affiche toutes les clés présentes dans le champ 'meta' de chaque observation""" - meta_keys = set() - for item in raw_data: - if "observation" not in item: - continue - meta = item["observation"].get("meta", {}) - meta_keys.update(meta.keys()) - print("META KEYS:") - for k in sorted(meta_keys): - print("-", k) - -def inspect_meta_values(raw_data, field): - """Affiche les valeurs et leur quantité pour un champ du meta""" - values = {} - for item in raw_data: - if "observation" not in item: - continue - meta = item["observation"].get("meta", {}) - val = meta.get(field) - if isinstance(val, list) and val: - val = 
val[0] - values[val] = values.get(val, 0) + 1 - print(f"\nVALUES FOR {field}:") - for k, v in values.items(): - print(k, ":", v) +def test_unique_ids(): + """Vérifie qu'il n'y a pas de doublons""" + data = fetch_biolit_from_api() + df = adapt_api_to_dataframe(data) + + total = df.shape[0] + unique = df.select("id_observation").n_unique() + + assert total == unique, "Doublons détectés sur id_observation" + + print("\n✅ Pas de doublons") + + +def test_no_null_coordinates(): + """Vérifie que les coordonnées sont présentes""" + data = fetch_biolit_from_api() + df = adapt_api_to_dataframe(data) + + null_lat = df.filter(df["latitude"].is_null()).shape[0] + null_lon = df.filter(df["longitude"].is_null()).shape[0] + + print(f"\nNull latitude: {null_lat}") + print(f"Null longitude: {null_lon}") + + # tolérance possible, donc pas assert strict + assert null_lat < df.shape[0] + assert null_lon < df.shape[0] + + +def test_id_is_numeric(): + """Vérifie que les IDs sont bien numériques""" + data = fetch_biolit_from_api() + + ids = [item["id"] for item in data] + + # doit pouvoir être cast en int + for i in ids[:100]: # test sur sample + int(i) + + print("\n✅ IDs valides") + # ------------------------- -# Tests +# Test global pipeline # ------------------------- -def test_fetch_small_sample(): - """Test fetch API avec un petit nombre de pages pour debug""" - raw_data = fetch_biolit_from_api(per_page=1000, max_pages=5) # petit sample - assert isinstance(raw_data, list) - print(f"\nNombre total d'éléments récupérés: {len(raw_data)}") - inspect_api_structure(raw_data) - inspect_meta_keys(raw_data) - inspect_meta_values(raw_data, "validee") +def test_full_pipeline(): + """Test end-to-end""" + data = fetch_biolit_from_api() + df = adapt_api_to_dataframe(data) + + assert df.shape[0] > 0 -def test_load_and_adapt(): - """Test l'adaptation de l'API vers le schéma parquet""" - raw_data = fetch_biolit_from_api(per_page=1000, max_pages=5) - df = adapt_api_to_parquet_schema(raw_data) - 
print(f"\nDataframe adapté avec {len(df)} lignes") - print(df.head(10)) # affiche les 10 premières lignes pour vérification + print("\n=== PIPELINE OK ===") + print(df.head(5)) +# ------------------------- +# Execution directe +# ------------------------- + if __name__ == "__main__": - print("=== TEST FETCH SMALL SAMPLE ===") - test_fetch_small_sample() - print("\n=== TEST LOAD AND ADAPT ===") - test_load_and_adapt() \ No newline at end of file + print("=== TEST FETCH API ===") + test_fetch_api_returns_data() + test_fetch_api_structure() + + print("\n=== TEST TRANSFORMATION ===") + test_adapt_to_dataframe() + test_expected_columns_present() + + print("\n=== TEST QUALITE ===") + test_unique_ids() + test_no_null_coordinates() + test_id_is_numeric() + + print("\n=== TEST PIPELINE COMPLET ===") + test_full_pipeline() \ No newline at end of file diff --git a/uv.lock b/uv.lock index 91483cd..a43bb38 100644 --- a/uv.lock +++ b/uv.lock @@ -27,9 +27,12 @@ dependencies = [ { name = "plotly" }, { name = "polars" }, { name = "pre-commit" }, + { name = "psycopg2-binary" }, + { name = "pyarrow" }, { name = "pytest" }, { name = "requests" }, { name = "ruff" }, + { name = "sqlalchemy" }, { name = "structlog" }, { name = "tqdm" }, { name = "transformers" }, @@ -53,9 +56,12 @@ requires-dist = [ { name = "plotly", specifier = ">=6.5.0" }, { name = "polars", specifier = ">=1.36.1" }, { name = "pre-commit", specifier = ">=4.5.1" }, + { name = "psycopg2-binary", specifier = ">=2.9.11" }, + { name = "pyarrow", specifier = ">=23.0.1" }, { name = "pytest", specifier = ">=9.0.2" }, { name = "requests", specifier = ">=2.32.3" }, { name = "ruff", specifier = ">=0.14.10" }, + { name = "sqlalchemy", specifier = ">=2.0.44" }, { name = "structlog", specifier = ">=25.5.0" }, { name = "tqdm", specifier = ">=4.67.3" }, { name = "transformers", specifier = ">=5.2.0" }, @@ -2053,6 +2059,47 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/c9/ad/33b2ccec09bf96c2b2ef3f9a6f66baac8253d7565d8839e024a6b905d45d/psutil-7.1.3-cp37-abi3-win_arm64.whl", hash = "sha256:bd0d69cee829226a761e92f28140bec9a5ee9d5b4fb4b0cc589068dbfff559b1", size = 244608, upload-time = "2025-11-02T12:26:36.136Z" }, ] +[[package]] +name = "psycopg2-binary" +version = "2.9.11" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ac/6c/8767aaa597ba424643dc87348c6f1754dd9f48e80fdc1b9f7ca5c3a7c213/psycopg2-binary-2.9.11.tar.gz", hash = "sha256:b6aed9e096bf63f9e75edf2581aa9a7e7186d97ab5c177aa6c87797cd591236c", size = 379620, upload-time = "2025-10-10T11:14:48.041Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d8/91/f870a02f51be4a65987b45a7de4c2e1897dd0d01051e2b559a38fa634e3e/psycopg2_binary-2.9.11-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:be9b840ac0525a283a96b556616f5b4820e0526addb8dcf6525a0fa162730be4", size = 3756603, upload-time = "2025-10-10T11:11:52.213Z" }, + { url = "https://files.pythonhosted.org/packages/27/fa/cae40e06849b6c9a95eb5c04d419942f00d9eaac8d81626107461e268821/psycopg2_binary-2.9.11-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f090b7ddd13ca842ebfe301cd587a76a4cf0913b1e429eb92c1be5dbeb1a19bc", size = 3864509, upload-time = "2025-10-10T11:11:56.452Z" }, + { url = "https://files.pythonhosted.org/packages/2d/75/364847b879eb630b3ac8293798e380e441a957c53657995053c5ec39a316/psycopg2_binary-2.9.11-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ab8905b5dcb05bf3fb22e0cf90e10f469563486ffb6a96569e51f897c750a76a", size = 4411159, upload-time = "2025-10-10T11:12:00.49Z" }, + { url = "https://files.pythonhosted.org/packages/6f/a0/567f7ea38b6e1c62aafd58375665a547c00c608a471620c0edc364733e13/psycopg2_binary-2.9.11-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:bf940cd7e7fec19181fdbc29d76911741153d51cab52e5c21165f3262125685e", size = 4468234, 
upload-time = "2025-10-10T11:12:04.892Z" }, + { url = "https://files.pythonhosted.org/packages/30/da/4e42788fb811bbbfd7b7f045570c062f49e350e1d1f3df056c3fb5763353/psycopg2_binary-2.9.11-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fa0f693d3c68ae925966f0b14b8edda71696608039f4ed61b1fe9ffa468d16db", size = 4166236, upload-time = "2025-10-10T11:12:11.674Z" }, + { url = "https://files.pythonhosted.org/packages/3c/94/c1777c355bc560992af848d98216148be5f1be001af06e06fc49cbded578/psycopg2_binary-2.9.11-cp312-cp312-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:a1cf393f1cdaf6a9b57c0a719a1068ba1069f022a59b8b1fe44b006745b59757", size = 3983083, upload-time = "2025-10-30T02:55:15.73Z" }, + { url = "https://files.pythonhosted.org/packages/bd/42/c9a21edf0e3daa7825ed04a4a8588686c6c14904344344a039556d78aa58/psycopg2_binary-2.9.11-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ef7a6beb4beaa62f88592ccc65df20328029d721db309cb3250b0aae0fa146c3", size = 3652281, upload-time = "2025-10-10T11:12:17.713Z" }, + { url = "https://files.pythonhosted.org/packages/12/22/dedfbcfa97917982301496b6b5e5e6c5531d1f35dd2b488b08d1ebc52482/psycopg2_binary-2.9.11-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:31b32c457a6025e74d233957cc9736742ac5a6cb196c6b68499f6bb51390bd6a", size = 3298010, upload-time = "2025-10-10T11:12:22.671Z" }, + { url = "https://files.pythonhosted.org/packages/66/ea/d3390e6696276078bd01b2ece417deac954dfdd552d2edc3d03204416c0c/psycopg2_binary-2.9.11-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:edcb3aeb11cb4bf13a2af3c53a15b3d612edeb6409047ea0b5d6a21a9d744b34", size = 3044641, upload-time = "2025-10-30T02:55:19.929Z" }, + { url = "https://files.pythonhosted.org/packages/12/9a/0402ded6cbd321da0c0ba7d34dc12b29b14f5764c2fc10750daa38e825fc/psycopg2_binary-2.9.11-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:62b6d93d7c0b61a1dd6197d208ab613eb7dcfdcca0a49c42ceb082257991de9d", size = 3347940, upload-time = 
"2025-10-10T11:12:26.529Z" }, + { url = "https://files.pythonhosted.org/packages/b1/d2/99b55e85832ccde77b211738ff3925a5d73ad183c0b37bcbbe5a8ff04978/psycopg2_binary-2.9.11-cp312-cp312-win_amd64.whl", hash = "sha256:b33fabeb1fde21180479b2d4667e994de7bbf0eec22832ba5d9b5e4cf65b6c6d", size = 2714147, upload-time = "2025-10-10T11:12:29.535Z" }, + { url = "https://files.pythonhosted.org/packages/ff/a8/a2709681b3ac11b0b1786def10006b8995125ba268c9a54bea6f5ae8bd3e/psycopg2_binary-2.9.11-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:b8fb3db325435d34235b044b199e56cdf9ff41223a4b9752e8576465170bb38c", size = 3756572, upload-time = "2025-10-10T11:12:32.873Z" }, + { url = "https://files.pythonhosted.org/packages/62/e1/c2b38d256d0dafd32713e9f31982a5b028f4a3651f446be70785f484f472/psycopg2_binary-2.9.11-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:366df99e710a2acd90efed3764bb1e28df6c675d33a7fb40df9b7281694432ee", size = 3864529, upload-time = "2025-10-10T11:12:36.791Z" }, + { url = "https://files.pythonhosted.org/packages/11/32/b2ffe8f3853c181e88f0a157c5fb4e383102238d73c52ac6d93a5c8bffe6/psycopg2_binary-2.9.11-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8c55b385daa2f92cb64b12ec4536c66954ac53654c7f15a203578da4e78105c0", size = 4411242, upload-time = "2025-10-10T11:12:42.388Z" }, + { url = "https://files.pythonhosted.org/packages/10/04/6ca7477e6160ae258dc96f67c371157776564679aefd247b66f4661501a2/psycopg2_binary-2.9.11-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:c0377174bf1dd416993d16edc15357f6eb17ac998244cca19bc67cdc0e2e5766", size = 4468258, upload-time = "2025-10-10T11:12:48.654Z" }, + { url = "https://files.pythonhosted.org/packages/3c/7e/6a1a38f86412df101435809f225d57c1a021307dd0689f7a5e7fe83588b1/psycopg2_binary-2.9.11-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5c6ff3335ce08c75afaed19e08699e8aacf95d4a260b495a4a8545244fe2ceb3", size = 4166295, upload-time = 
"2025-10-10T11:12:52.525Z" }, + { url = "https://files.pythonhosted.org/packages/f2/7d/c07374c501b45f3579a9eb761cbf2604ddef3d96ad48679112c2c5aa9c25/psycopg2_binary-2.9.11-cp313-cp313-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:84011ba3109e06ac412f95399b704d3d6950e386b7994475b231cf61eec2fc1f", size = 3983133, upload-time = "2025-10-30T02:55:24.329Z" }, + { url = "https://files.pythonhosted.org/packages/82/56/993b7104cb8345ad7d4516538ccf8f0d0ac640b1ebd8c754a7b024e76878/psycopg2_binary-2.9.11-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ba34475ceb08cccbdd98f6b46916917ae6eeb92b5ae111df10b544c3a4621dc4", size = 3652383, upload-time = "2025-10-10T11:12:56.387Z" }, + { url = "https://files.pythonhosted.org/packages/2d/ac/eaeb6029362fd8d454a27374d84c6866c82c33bfc24587b4face5a8e43ef/psycopg2_binary-2.9.11-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:b31e90fdd0f968c2de3b26ab014314fe814225b6c324f770952f7d38abf17e3c", size = 3298168, upload-time = "2025-10-10T11:13:00.403Z" }, + { url = "https://files.pythonhosted.org/packages/2b/39/50c3facc66bded9ada5cbc0de867499a703dc6bca6be03070b4e3b65da6c/psycopg2_binary-2.9.11-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:d526864e0f67f74937a8fce859bd56c979f5e2ec57ca7c627f5f1071ef7fee60", size = 3044712, upload-time = "2025-10-30T02:55:27.975Z" }, + { url = "https://files.pythonhosted.org/packages/9c/8e/b7de019a1f562f72ada81081a12823d3c1590bedc48d7d2559410a2763fe/psycopg2_binary-2.9.11-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:04195548662fa544626c8ea0f06561eb6203f1984ba5b4562764fbeb4c3d14b1", size = 3347549, upload-time = "2025-10-10T11:13:03.971Z" }, + { url = "https://files.pythonhosted.org/packages/80/2d/1bb683f64737bbb1f86c82b7359db1eb2be4e2c0c13b947f80efefa7d3e5/psycopg2_binary-2.9.11-cp313-cp313-win_amd64.whl", hash = "sha256:efff12b432179443f54e230fdf60de1f6cc726b6c832db8701227d089310e8aa", size = 2714215, upload-time = "2025-10-10T11:13:07.14Z" }, + { url = 
"https://files.pythonhosted.org/packages/64/12/93ef0098590cf51d9732b4f139533732565704f45bdc1ffa741b7c95fb54/psycopg2_binary-2.9.11-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:92e3b669236327083a2e33ccfa0d320dd01b9803b3e14dd986a4fc54aa00f4e1", size = 3756567, upload-time = "2025-10-10T11:13:11.885Z" }, + { url = "https://files.pythonhosted.org/packages/7c/a9/9d55c614a891288f15ca4b5209b09f0f01e3124056924e17b81b9fa054cc/psycopg2_binary-2.9.11-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:e0deeb03da539fa3577fcb0b3f2554a97f7e5477c246098dbb18091a4a01c16f", size = 3864755, upload-time = "2025-10-10T11:13:17.727Z" }, + { url = "https://files.pythonhosted.org/packages/13/1e/98874ce72fd29cbde93209977b196a2edae03f8490d1bd8158e7f1daf3a0/psycopg2_binary-2.9.11-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:9b52a3f9bb540a3e4ec0f6ba6d31339727b2950c9772850d6545b7eae0b9d7c5", size = 4411646, upload-time = "2025-10-10T11:13:24.432Z" }, + { url = "https://files.pythonhosted.org/packages/5a/bd/a335ce6645334fb8d758cc358810defca14a1d19ffbc8a10bd38a2328565/psycopg2_binary-2.9.11-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:db4fd476874ccfdbb630a54426964959e58da4c61c9feba73e6094d51303d7d8", size = 4468701, upload-time = "2025-10-10T11:13:29.266Z" }, + { url = "https://files.pythonhosted.org/packages/44/d6/c8b4f53f34e295e45709b7568bf9b9407a612ea30387d35eb9fa84f269b4/psycopg2_binary-2.9.11-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:47f212c1d3be608a12937cc131bd85502954398aaa1320cb4c14421a0ffccf4c", size = 4166293, upload-time = "2025-10-10T11:13:33.336Z" }, + { url = "https://files.pythonhosted.org/packages/4b/e0/f8cc36eadd1b716ab36bb290618a3292e009867e5c97ce4aba908cb99644/psycopg2_binary-2.9.11-cp314-cp314-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e35b7abae2b0adab776add56111df1735ccc71406e56203515e228a8dc07089f", size = 3983184, upload-time = 
"2025-10-30T02:55:32.483Z" }, + { url = "https://files.pythonhosted.org/packages/53/3e/2a8fe18a4e61cfb3417da67b6318e12691772c0696d79434184a511906dc/psycopg2_binary-2.9.11-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:fcf21be3ce5f5659daefd2b3b3b6e4727b028221ddc94e6c1523425579664747", size = 3652650, upload-time = "2025-10-10T11:13:38.181Z" }, + { url = "https://files.pythonhosted.org/packages/76/36/03801461b31b29fe58d228c24388f999fe814dfc302856e0d17f97d7c54d/psycopg2_binary-2.9.11-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:9bd81e64e8de111237737b29d68039b9c813bdf520156af36d26819c9a979e5f", size = 3298663, upload-time = "2025-10-10T11:13:44.878Z" }, + { url = "https://files.pythonhosted.org/packages/97/77/21b0ea2e1a73aa5fa9222b2a6b8ba325c43c3a8d54272839c991f2345656/psycopg2_binary-2.9.11-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:32770a4d666fbdafab017086655bcddab791d7cb260a16679cc5a7338b64343b", size = 3044737, upload-time = "2025-10-30T02:55:35.69Z" }, + { url = "https://files.pythonhosted.org/packages/67/69/f36abe5f118c1dca6d3726ceae164b9356985805480731ac6712a63f24f0/psycopg2_binary-2.9.11-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:c3cb3a676873d7506825221045bd70e0427c905b9c8ee8d6acd70cfcbd6e576d", size = 3347643, upload-time = "2025-10-10T11:13:53.499Z" }, + { url = "https://files.pythonhosted.org/packages/e1/36/9c0c326fe3a4227953dfb29f5d0c8ae3b8eb8c1cd2967aa569f50cb3c61f/psycopg2_binary-2.9.11-cp314-cp314-win_amd64.whl", hash = "sha256:4012c9c954dfaccd28f94e84ab9f94e12df76b4afb22331b1f0d3154893a6316", size = 2803913, upload-time = "2025-10-10T11:13:57.058Z" }, +] + [[package]] name = "ptyprocess" version = "0.7.0" @@ -2071,6 +2118,49 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl", hash = "sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0", size = 11842, upload-time = 
"2024-07-21T12:58:20.04Z" }, ] +[[package]] +name = "pyarrow" +version = "23.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/88/22/134986a4cc224d593c1afde5494d18ff629393d74cc2eddb176669f234a4/pyarrow-23.0.1.tar.gz", hash = "sha256:b8c5873e33440b2bc2f4a79d2b47017a89c5a24116c055625e6f2ee50523f019", size = 1167336, upload-time = "2026-02-16T10:14:12.39Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9a/4b/4166bb5abbfe6f750fc60ad337c43ecf61340fa52ab386da6e8dbf9e63c4/pyarrow-23.0.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:f4b0dbfa124c0bb161f8b5ebb40f1a680b70279aa0c9901d44a2b5a20806039f", size = 34214575, upload-time = "2026-02-16T10:09:56.225Z" }, + { url = "https://files.pythonhosted.org/packages/e1/da/3f941e3734ac8088ea588b53e860baeddac8323ea40ce22e3d0baa865cc9/pyarrow-23.0.1-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:7707d2b6673f7de054e2e83d59f9e805939038eebe1763fe811ee8fa5c0cd1a7", size = 35832540, upload-time = "2026-02-16T10:10:03.428Z" }, + { url = "https://files.pythonhosted.org/packages/88/7c/3d841c366620e906d54430817531b877ba646310296df42ef697308c2705/pyarrow-23.0.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:86ff03fb9f1a320266e0de855dee4b17da6794c595d207f89bba40d16b5c78b9", size = 44470940, upload-time = "2026-02-16T10:10:10.704Z" }, + { url = "https://files.pythonhosted.org/packages/2c/a5/da83046273d990f256cb79796a190bbf7ec999269705ddc609403f8c6b06/pyarrow-23.0.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:813d99f31275919c383aab17f0f455a04f5a429c261cc411b1e9a8f5e4aaaa05", size = 47586063, upload-time = "2026-02-16T10:10:17.95Z" }, + { url = "https://files.pythonhosted.org/packages/5b/3c/b7d2ebcff47a514f47f9da1e74b7949138c58cfeb108cdd4ee62f43f0cf3/pyarrow-23.0.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:bf5842f960cddd2ef757d486041d57c96483efc295a8c4a0e20e704cbbf39c67", size = 48173045, upload-time = 
"2026-02-16T10:10:25.363Z" }, + { url = "https://files.pythonhosted.org/packages/43/b2/b40961262213beaba6acfc88698eb773dfce32ecdf34d19291db94c2bd73/pyarrow-23.0.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:564baf97c858ecc03ec01a41062e8f4698abc3e6e2acd79c01c2e97880a19730", size = 50621741, upload-time = "2026-02-16T10:10:33.477Z" }, + { url = "https://files.pythonhosted.org/packages/f6/70/1fdda42d65b28b078e93d75d371b2185a61da89dda4def8ba6ba41ebdeb4/pyarrow-23.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:07deae7783782ac7250989a7b2ecde9b3c343a643f82e8a4df03d93b633006f0", size = 27620678, upload-time = "2026-02-16T10:10:39.31Z" }, + { url = "https://files.pythonhosted.org/packages/47/10/2cbe4c6f0fb83d2de37249567373d64327a5e4d8db72f486db42875b08f6/pyarrow-23.0.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:6b8fda694640b00e8af3c824f99f789e836720aa8c9379fb435d4c4953a756b8", size = 34210066, upload-time = "2026-02-16T10:10:45.487Z" }, + { url = "https://files.pythonhosted.org/packages/cb/4f/679fa7e84dadbaca7a65f7cdba8d6c83febbd93ca12fa4adf40ba3b6362b/pyarrow-23.0.1-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:8ff51b1addc469b9444b7c6f3548e19dc931b172ab234e995a60aea9f6e6025f", size = 35825526, upload-time = "2026-02-16T10:10:52.266Z" }, + { url = "https://files.pythonhosted.org/packages/f9/63/d2747d930882c9d661e9398eefc54f15696547b8983aaaf11d4a2e8b5426/pyarrow-23.0.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:71c5be5cbf1e1cb6169d2a0980850bccb558ddc9b747b6206435313c47c37677", size = 44473279, upload-time = "2026-02-16T10:11:01.557Z" }, + { url = "https://files.pythonhosted.org/packages/b3/93/10a48b5e238de6d562a411af6467e71e7aedbc9b87f8d3a35f1560ae30fb/pyarrow-23.0.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:9b6f4f17b43bc39d56fec96e53fe89d94bac3eb134137964371b45352d40d0c2", size = 47585798, upload-time = "2026-02-16T10:11:09.401Z" }, + { url = 
"https://files.pythonhosted.org/packages/5c/20/476943001c54ef078dbf9542280e22741219a184a0632862bca4feccd666/pyarrow-23.0.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:9fc13fc6c403d1337acab46a2c4346ca6c9dec5780c3c697cf8abfd5e19b6b37", size = 48179446, upload-time = "2026-02-16T10:11:17.781Z" }, + { url = "https://files.pythonhosted.org/packages/4b/b6/5dd0c47b335fcd8edba9bfab78ad961bd0fd55ebe53468cc393f45e0be60/pyarrow-23.0.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5c16ed4f53247fa3ffb12a14d236de4213a4415d127fe9cebed33d51671113e2", size = 50623972, upload-time = "2026-02-16T10:11:26.185Z" }, + { url = "https://files.pythonhosted.org/packages/d5/09/a532297c9591a727d67760e2e756b83905dd89adb365a7f6e9c72578bcc1/pyarrow-23.0.1-cp313-cp313-win_amd64.whl", hash = "sha256:cecfb12ef629cf6be0b1887f9f86463b0dd3dc3195ae6224e74006be4736035a", size = 27540749, upload-time = "2026-02-16T10:12:23.297Z" }, + { url = "https://files.pythonhosted.org/packages/a5/8e/38749c4b1303e6ae76b3c80618f84861ae0c55dd3c2273842ea6f8258233/pyarrow-23.0.1-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:29f7f7419a0e30264ea261fdc0e5fe63ce5a6095003db2945d7cd78df391a7e1", size = 34471544, upload-time = "2026-02-16T10:11:32.535Z" }, + { url = "https://files.pythonhosted.org/packages/a3/73/f237b2bc8c669212f842bcfd842b04fc8d936bfc9d471630569132dc920d/pyarrow-23.0.1-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:33d648dc25b51fd8055c19e4261e813dfc4d2427f068bcecc8b53d01b81b0500", size = 35949911, upload-time = "2026-02-16T10:11:39.813Z" }, + { url = "https://files.pythonhosted.org/packages/0c/86/b912195eee0903b5611bf596833def7d146ab2d301afeb4b722c57ffc966/pyarrow-23.0.1-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:cd395abf8f91c673dd3589cadc8cc1ee4e8674fa61b2e923c8dd215d9c7d1f41", size = 44520337, upload-time = "2026-02-16T10:11:47.764Z" }, + { url = 
"https://files.pythonhosted.org/packages/69/c2/f2a717fb824f62d0be952ea724b4f6f9372a17eed6f704b5c9526f12f2f1/pyarrow-23.0.1-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:00be9576d970c31defb5c32eb72ef585bf600ef6d0a82d5eccaae96639cf9d07", size = 47548944, upload-time = "2026-02-16T10:11:56.607Z" }, + { url = "https://files.pythonhosted.org/packages/84/a7/90007d476b9f0dc308e3bc57b832d004f848fd6c0da601375d20d92d1519/pyarrow-23.0.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:c2139549494445609f35a5cda4eb94e2c9e4d704ce60a095b342f82460c73a83", size = 48236269, upload-time = "2026-02-16T10:12:04.47Z" }, + { url = "https://files.pythonhosted.org/packages/b0/3f/b16fab3e77709856eb6ac328ce35f57a6d4a18462c7ca5186ef31b45e0e0/pyarrow-23.0.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:7044b442f184d84e2351e5084600f0d7343d6117aabcbc1ac78eb1ae11eb4125", size = 50604794, upload-time = "2026-02-16T10:12:11.797Z" }, + { url = "https://files.pythonhosted.org/packages/e9/a1/22df0620a9fac31d68397a75465c344e83c3dfe521f7612aea33e27ab6c0/pyarrow-23.0.1-cp313-cp313t-win_amd64.whl", hash = "sha256:a35581e856a2fafa12f3f54fce4331862b1cfb0bef5758347a858a4aa9d6bae8", size = 27660642, upload-time = "2026-02-16T10:12:17.746Z" }, + { url = "https://files.pythonhosted.org/packages/8d/1b/6da9a89583ce7b23ac611f183ae4843cd3a6cf54f079549b0e8c14031e73/pyarrow-23.0.1-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:5df1161da23636a70838099d4aaa65142777185cc0cdba4037a18cee7d8db9ca", size = 34238755, upload-time = "2026-02-16T10:12:32.819Z" }, + { url = "https://files.pythonhosted.org/packages/ae/b5/d58a241fbe324dbaeb8df07be6af8752c846192d78d2272e551098f74e88/pyarrow-23.0.1-cp314-cp314-macosx_12_0_x86_64.whl", hash = "sha256:fa8e51cb04b9f8c9c5ace6bab63af9a1f88d35c0d6cbf53e8c17c098552285e1", size = 35847826, upload-time = "2026-02-16T10:12:38.949Z" }, + { url = 
"https://files.pythonhosted.org/packages/54/a5/8cbc83f04aba433ca7b331b38f39e000efd9f0c7ce47128670e737542996/pyarrow-23.0.1-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:0b95a3994f015be13c63148fef8832e8a23938128c185ee951c98908a696e0eb", size = 44536859, upload-time = "2026-02-16T10:12:45.467Z" }, + { url = "https://files.pythonhosted.org/packages/36/2e/c0f017c405fcdc252dbccafbe05e36b0d0eb1ea9a958f081e01c6972927f/pyarrow-23.0.1-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:4982d71350b1a6e5cfe1af742c53dfb759b11ce14141870d05d9e540d13bc5d1", size = 47614443, upload-time = "2026-02-16T10:12:55.525Z" }, + { url = "https://files.pythonhosted.org/packages/af/6b/2314a78057912f5627afa13ba43809d9d653e6630859618b0fd81a4e0759/pyarrow-23.0.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c250248f1fe266db627921c89b47b7c06fee0489ad95b04d50353537d74d6886", size = 48232991, upload-time = "2026-02-16T10:13:04.729Z" }, + { url = "https://files.pythonhosted.org/packages/40/f2/1bcb1d3be3460832ef3370d621142216e15a2c7c62602a4ea19ec240dd64/pyarrow-23.0.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:5f4763b83c11c16e5f4c15601ba6dfa849e20723b46aa2617cb4bffe8768479f", size = 50645077, upload-time = "2026-02-16T10:13:14.147Z" }, + { url = "https://files.pythonhosted.org/packages/eb/3f/b1da7b61cd66566a4d4c8383d376c606d1c34a906c3f1cb35c479f59d1aa/pyarrow-23.0.1-cp314-cp314-win_amd64.whl", hash = "sha256:3a4c85ef66c134161987c17b147d6bffdca4566f9a4c1d81a0a01cdf08414ea5", size = 28234271, upload-time = "2026-02-16T10:14:09.397Z" }, + { url = "https://files.pythonhosted.org/packages/b5/78/07f67434e910a0f7323269be7bfbf58699bd0c1d080b18a1ab49ba943fe8/pyarrow-23.0.1-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:17cd28e906c18af486a499422740298c52d7c6795344ea5002a7720b4eadf16d", size = 34488692, upload-time = "2026-02-16T10:13:21.541Z" }, + { url = 
"https://files.pythonhosted.org/packages/50/76/34cf7ae93ece1f740a04910d9f7e80ba166b9b4ab9596a953e9e62b90fe1/pyarrow-23.0.1-cp314-cp314t-macosx_12_0_x86_64.whl", hash = "sha256:76e823d0e86b4fb5e1cf4a58d293036e678b5a4b03539be933d3b31f9406859f", size = 35964383, upload-time = "2026-02-16T10:13:28.63Z" }, + { url = "https://files.pythonhosted.org/packages/46/90/459b827238936d4244214be7c684e1b366a63f8c78c380807ae25ed92199/pyarrow-23.0.1-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:a62e1899e3078bf65943078b3ad2a6ddcacf2373bc06379aac61b1e548a75814", size = 44538119, upload-time = "2026-02-16T10:13:35.506Z" }, + { url = "https://files.pythonhosted.org/packages/28/a1/93a71ae5881e99d1f9de1d4554a87be37da11cd6b152239fb5bd924fdc64/pyarrow-23.0.1-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:df088e8f640c9fae3b1f495b3c64755c4e719091caf250f3a74d095ddf3c836d", size = 47571199, upload-time = "2026-02-16T10:13:42.504Z" }, + { url = "https://files.pythonhosted.org/packages/88/a3/d2c462d4ef313521eaf2eff04d204ac60775263f1fb08c374b543f79f610/pyarrow-23.0.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:46718a220d64677c93bc243af1d44b55998255427588e400677d7192671845c7", size = 48259435, upload-time = "2026-02-16T10:13:49.226Z" }, + { url = "https://files.pythonhosted.org/packages/cc/f1/11a544b8c3d38a759eb3fbb022039117fd633e9a7b19e4841cc3da091915/pyarrow-23.0.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:a09f3876e87f48bc2f13583ab551f0379e5dfb83210391e68ace404181a20690", size = 50629149, upload-time = "2026-02-16T10:13:57.238Z" }, + { url = "https://files.pythonhosted.org/packages/50/f2/c0e76a0b451ffdf0cf788932e182758eb7558953f4f27f1aff8e2518b653/pyarrow-23.0.1-cp314-cp314t-win_amd64.whl", hash = "sha256:527e8d899f14bd15b740cd5a54ad56b7f98044955373a17179d5956ddb93d9ce", size = 28365807, upload-time = "2026-02-16T10:14:03.892Z" }, +] + [[package]] name = "pycparser" version = "2.23" From d38fac8713769ce54a4bf2fa6636d9312e4e0a8e Mon Sep 17 00:00:00 2001 
From: alexpetit Date: Sun, 22 Mar 2026 17:26:27 +0100 Subject: [PATCH 5/5] Add README --- pipelines/README.md | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/pipelines/README.md b/pipelines/README.md index a568cb6..e0fbc58 100644 --- a/pipelines/README.md +++ b/pipelines/README.md @@ -5,7 +5,30 @@ Les pipelines utilisent le dossier `data/` (non versionné) comme espace de trav ## Flux quotidien (API → ML → Label Studio) -1. **Récupération quotidienne** depuis l'API (à venir) ou CSV local. +1. **Ingestion & préparation des données** depuis l'API. + 1. **Objectifs** + - Récupérer les données depuis l’API Biolit + - Standardiser et nettoyer les données + - Les stocker dans une base PostgreSQL + - Mettre les données à disposition des autres systèmes (ML, dataviz) + 2. **Étapes du pipeline** + 1. Ingestion + - appel à l’API Biolit + - récupération des observations + + 2. Transformation + - normalisation des noms de colonnes + - typage des champs + - nettoyage des données (dates, coordonnées, identifiants) + + 3. Chargement + - insertion dans PostgreSQL + - gestion des doublons via la clause ON CONFLICT DO NOTHING (les doublons sont ignorés, pas mis à jour) + 3. **Variables d'environnement** + - POSTGRES_URL=postgresql://user:password@host:port/dbname + - BIOLIT_API_URL=https://biolit.fr/wp-json/biolit/v1/observations?token=XXX + 4. **Lancer le pipeline** + - uv run python -m pipelines.run 2. **Qualité** : si l'image est mauvaise → stop. 3. **YOLOv8** : détection + crop. - si aucune détection → **Label Studio (CROP)**