dataforgoodfr · Hpoinseaux · Apr 3, 2026 · Mar 1, 2026 · Mar 15, 2026 · Mar 17, 2026
diff --git a/.gitignore b/.gitignore
@@ -171,4 +171,15 @@ images/
 labels/
 failed_downloads.csv
 metadata.csv
-data.yaml
+data.yaml
+
+# orchestrateur - tests Timothée
+orchestrateur/
+flows/
+images/
+labeled-images/
+_non_images_tmp/
+yolov8n.pt
+images_test/
+labeled_test/
+runs/
diff --git a/biolit/export_api.py b/biolit/export_api.py
@@ -0,0 +1,89 @@
+import requests
+import polars as pl
+import structlog
+import re
+import os
+
+LOGGER = structlog.get_logger()
+
+# ------------------------------
+# FETCH API
+# ------------------------------
+def fetch_biolit_from_api():
+    """Download the raw Biolit observations from the HTTP API.
+
+    The endpoint is read from the ``BIOLIT_API_URL`` environment variable.
+
+    Returns:
+        The decoded JSON payload of the response.
+
+    Raises:
+        requests.exceptions.MissingSchema: If ``BIOLIT_API_URL`` is unset or
+            empty (same exception type ``requests`` raises for such a URL,
+            but with an explicit message).
+        requests.exceptions.Timeout: If the server does not answer in time.
+        requests.exceptions.HTTPError: If the API answers with an error status.
+    """
+    url = os.getenv("BIOLIT_API_URL")
+    if not url:
+        # Fail early with a message naming the variable instead of letting
+        # requests complain about an invalid ``None`` URL.
+        raise requests.exceptions.MissingSchema("Missing BIOLIT_API_URL")
+
+    # Without a timeout the call can block forever on an unresponsive server.
+    response = requests.get(url, timeout=30)
+    response.raise_for_status()
+
+    data = response.json()
+
+    # Use the module logger rather than print, like the other biolit modules.
+    LOGGER.info(f"{len(data)} observations récupérées")
+    return data
+
+# ------------------------------
+# RENAME OF COLUMNS
+# ------------------------------
+
+
+def normalize_column_name(col: str) -> str:
+    """Convert an API field name into an ASCII snake_case column name.
+
+    The name is lower-cased, hyphens and spaces become underscores, accented
+    letters are reduced to their base letter, and every character left
+    outside ``[a-z0-9_]`` is removed.
+
+    Args:
+        col: Field name as returned by the API.
+
+    Returns:
+        The normalized column name.
+    """
+    import unicodedata  # local import: keeps this function self-contained
+
+    col = col.lower().replace("-", "_").replace(" ", "_")
+    # Canonical decomposition splits "ê" into "e" + combining accent; dropping
+    # the combining marks keeps the base letter for *every* accented letter
+    # (ê, ç, î, ...), not only é/è/à/ù/ô, which would otherwise be deleted by
+    # the final filter.
+    decomposed = unicodedata.normalize("NFD", col)
+    col = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
+    return re.sub(r"[^a-z0-9_]", "", col)
+
+
+# Explicit renames: API field name -> output column name.
+# adapt_api_to_dataframe looks each field up here first; a field missing
+# from this table is passed through normalize_column_name instead.
+COLUMN_MAPPING = {
+    "id": "id_observation",
+    "date": "date_observation",
+    "link": "lien_observation",
+    "author": "observateur",
+    "_url_sortie": "url_sortie",
+    "espece-identifiee": "espece_identifiee",
+    "heure-debut": "heure_debut",
+    "heure-fin": "heure_fin",
+    "latitude": "latitude",
+    "longitude": "longitude",
+    "photos": "photos",
+    "relais": "relais",
+    "espece_id": "id_espece",
+    "espece": "nom_scientifique",
+    "common": "nom_commun",
+    "categorie-programme": "categorie_programme",
+    "programme": "programme",
+}
+
+
+# ------------------------------
+# ADAPT API -> PARQUET
+# ------------------------------
+def adapt_api_to_dataframe(data: list) -> pl.DataFrame:
+    """Build a polars DataFrame from API items, renaming every field.
+
+    Each field name is replaced by its ``COLUMN_MAPPING`` entry when there is
+    one, and by ``normalize_column_name(field)`` otherwise. Values are kept
+    as received.
+
+    Args:
+        data: Items returned by the API, each one a mapping of field -> value.
+
+    Returns:
+        A DataFrame with one row per item.
+    """
+    renamed_records = [
+        {
+            COLUMN_MAPPING.get(field, normalize_column_name(field)): content
+            for field, content in record.items()
+        }
+        for record in data
+    ]
+    return pl.DataFrame(renamed_records)
+
+
+# ------------------------------
+# LOAD (Fetch + Adapt)
+# ------------------------------
+def load_biolit_from_api() -> pl.DataFrame:
+    """Fetch the observations from the API and return them as a DataFrame.
+
+    Chains ``fetch_biolit_from_api`` and ``adapt_api_to_dataframe``.
+    """
+    return adapt_api_to_dataframe(fetch_biolit_from_api())
+
diff --git a/biolit/lien_doris.py b/biolit/lien_doris.py
@@ -0,0 +1,58 @@
+import requests
+import time
+from bs4 import BeautifulSoup
+import polars as pl
+
+import structlog
+
+from biolit import DATADIR
+
+LOGGER = structlog.get_logger()
+
+
+def scrapping_site_lien_doris() -> pl.DataFrame:
+    """Scrape the DORIS species listing and collect one link per species.
+
+    Result pages are requested one after the other by increasing the
+    ``offset`` in the listing URL. For every entry on a page, the ``href`` of
+    its first link and the text of the ``<em>`` inside that link are kept.
+    After each page the rows collected so far are written to
+    ``DATADIR / "doris_data.csv"``, and the loop sleeps one second before the
+    next request.
+
+    Scraping stops on the first non-200 response, on the first page without
+    any entry, or on any error raised while fetching or saving a page.
+
+    Returns:
+        A DataFrame with the columns ``nom_scientifique`` and ``lien_doris``
+        holding everything collected before the loop stopped.
+    """
+    offset = 0
+    lien_doris_all_data = []
+
+    while True:
+        url = f"https://doris.ffessm.fr/find/species/(offset)/{offset}/(state)/*/(sortby)/recent/(manualSort)/1/(view)/list"
+        LOGGER.info(f"Scraping offset = {offset}")
+
+        try:
+            response = requests.get(url, timeout=10)
+
+            if response.status_code != 200:
+                LOGGER.warning(f"Erreur à l'offset : {offset}")
+                break
+
+            soup = BeautifulSoup(response.text, "html.parser")
+            species = soup.find_all("div", class_="specieSearchResult resultLine")
+            if not species:
+                LOGGER.info("Fin des pages.")
+                break
+
+            for specie in species:
+                try:
+                    a_tag = specie.find("a", href=True)
+                    lien_doris = a_tag.get("href")
+                    nom_scientifique = a_tag.find("em").get_text(strip=True)
+                except AttributeError as e:
+                    # A missing <a> or <em> makes find() return None; skip
+                    # that entry and keep the rest of the page.
+                    LOGGER.warning(f"Erreur parsing espèce : {e}")
+                    continue
+                lien_doris_all_data.append({
+                    "nom_scientifique": nom_scientifique,
+                    "lien_doris": lien_doris,
+                })
+
+            # Advance by the number of entries *present* on the page, not by
+            # the number successfully parsed: otherwise a skipped entry makes
+            # the next page overlap this one (duplicates), and a page where
+            # every entry fails would be requested forever.
+            offset += len(species)
+
+            # Checkpoint after each page so a later failure keeps the data.
+            pl.DataFrame(lien_doris_all_data).write_csv(DATADIR / "doris_data.csv")
+            time.sleep(1)
+
+        except Exception as e:
+            # Best-effort boundary: stop scraping but still return what was
+            # collected so far.
+            LOGGER.warning(f"Erreur requête : {e}")
+            break
+
+    return pl.DataFrame(lien_doris_all_data)
diff --git a/biolit/postgres.py b/biolit/postgres.py
@@ -0,0 +1,124 @@
+import os
+import polars as pl
+from sqlalchemy import create_engine, text
+
+
+# -------------------------
+# Connexion DB
+# -------------------------
+
+def get_engine():
+    """Create a SQLAlchemy engine from the ``POSTGRES_URL`` environment variable.
+
+    Returns:
+        The engine built by ``create_engine`` for that URL.
+
+    Raises:
+        ValueError: If ``POSTGRES_URL`` is unset or empty.
+    """
+    dsn = os.getenv("POSTGRES_URL")
+    if dsn:
+        return create_engine(dsn)
+    raise ValueError("Missing POSTGRES_URL")
+
+
+# -------------------------
+# Préparation des données
+# -------------------------
+def prepare_dataframe_for_postgres(df: pl.DataFrame) -> pl.DataFrame:
+    """Cast the observation columns to the types expected for the insert.
+
+    Identifier columns become ``Int64``, coordinates ``Float64``,
+    ``date_observation`` a ``Datetime`` and the two hour columns ``Time``.
+    Every conversion except the one on ``id_observation`` is non-strict.
+
+    Args:
+        df: Frame holding at least the nine columns converted below.
+
+    Returns:
+        The result of ``df.with_columns`` with those columns converted.
+    """
+
+    def nullable_int(expr: pl.Expr) -> pl.Expr:
+        # Go through Float64 first, turn NaN into null, then move to Int64.
+        return (
+            expr.cast(pl.Float64, strict=False)
+            .fill_nan(None)
+            .cast(pl.Int64, strict=False)
+        )
+
+    def coordinate(name: str) -> pl.Expr:
+        # Strip surrounding characters from the text form before parsing.
+        as_text = pl.col(name).cast(pl.Utf8).str.strip_chars()
+        return as_text.cast(pl.Float64, strict=False)
+
+    def parsed(name: str, dtype) -> pl.Expr:
+        return pl.col(name).str.strptime(dtype, strict=False)
+
+    converted = [
+        # IDs
+        pl.col("id_observation").cast(pl.Int64),
+        nullable_int(pl.col("id_espece")),
+        nullable_int(pl.col("categorie_programme")),
+        # Empty strings are replaced by null before the numeric conversion.
+        nullable_int(pl.col("relais").cast(pl.Utf8).replace("", None)),
+        # Coordinates
+        coordinate("latitude"),
+        coordinate("longitude"),
+        # Dates
+        parsed("date_observation", pl.Datetime),
+        parsed("heure_debut", pl.Time),
+        parsed("heure_fin", pl.Time),
+    ]
+    return df.with_columns(converted)
+
+# -------------------------
+# Insert avec sécurité (UPSERT)
+# -------------------------
+
+def insert_dataframe(df: pl.DataFrame):
+    """Insert the rows of ``df`` into the ``observations`` table.
+
+    All rows are written in a single transaction. A row whose
+    ``id_observation`` already exists is skipped
+    (``ON CONFLICT (id_observation) DO NOTHING``).
+
+    Args:
+        df: Frame whose rows provide a value for each of the seventeen
+            columns named in the statement below.
+
+    Raises:
+        ValueError: If ``POSTGRES_URL`` is not set (raised by ``get_engine``).
+    """
+    # Built once: the statement does not depend on the row being inserted.
+    statement = text("""
+                INSERT INTO observations (
+                    id_observation,
+                    date_observation,
+                    lien_observation,
+                    observateur,
+                    url_sortie,
+                    espece_identifiee,
+                    heure_debut,
+                    heure_fin,
+                    latitude,
+                    longitude,
+                    photos,
+                    relais,
+                    id_espece,
+                    nom_scientifique,
+                    nom_commun,
+                    categorie_programme,
+                    programme
+                ) VALUES (
+                    :id_observation,
+                    :date_observation,
+                    :lien_observation,
+                    :observateur,
+                    :url_sortie,
+                    :espece_identifiee,
+                    :heure_debut,
+                    :heure_fin,
+                    :latitude,
+                    :longitude,
+                    :photos,
+                    :relais,
+                    :id_espece,
+                    :nom_scientifique,
+                    :nom_commun,
+                    :categorie_programme,
+                    :programme
+                )
+                ON CONFLICT (id_observation) DO NOTHING
+            """)
+
+    engine = get_engine()
+    rows = df.to_dicts()
+
+    try:
+        with engine.begin() as conn:
+            # An empty parameter list must not be sent: the statement would
+            # then run without any bound value.
+            if rows:
+                # Passing the list of dicts runs the statement in
+                # "executemany" mode instead of one execute() call per row.
+                conn.execute(statement, rows)
+    finally:
+        # get_engine() builds a new engine on every call; release its
+        # connection pool once the insert is done.
+        engine.dispose()
diff --git a/ml/yolov8_DINO/README.md b/ml/yolov8_DINO/README.md
@@ -1,36 +1,41 @@
-# ML - YOLOv8 (détection + crop)
+# ML - YOLOv8 / GroundingDINO
 
-Objectif : détecter l'espèce (végétal/animal) et générer un crop centré sur l'objet.
+Approche en deux temps : un premier entraînement sur ~10 000 images annotées automatiquement (quantitatif)
+pour amorcer le modèle, suivi d'un réentraînement sur ~1 400 images recadrées et annotées manuellement (qualitatif)
+pour affiner les performances. L'annotation manuelle ne porte que sur la deuxième partie.
 
-## Entrées
-Structure de données généré à partir du fichier `export_biolit.csv`.
+## Partie 1 — Bootstrap autodistill
 
-### Récupération propre des données
-build_dataset.py
-Pipeline de constitution du dataset Biolit pour inférence YOLO / Grounding DINO.
+Annotation automatique via GroundingDINO + fine-tuning YOLOv8. Pas d'annotation manuelle.
+L'ontologie est à affiner dans `configs/autodistill_boostrap.yaml`.
+
+```bash
+python build_dataset.py      # téléchargement + nettoyage images
+python check_dataset.py      # vérification qualité (résolutions, espèces, corrompues)
+python autodistill_label.py  # pseudo-labels GroundingDINO
+python autodistill_train.py  # fine-tuning YOLOv8
+```
+
+`--limit N` sur `build_dataset.py` pour tester sur un sous-ensemble.
 
 **Structure de sortie :**
 
 ```text
-    dataset_biolit/
-    ├── images/
-    │   ├── identifiable/
-    │   └── non_identifiable/ # à valider
-    ├── labels/
-    │   ├── identifiable/
-    │   └── non_identifiable/ # à valider
-    ├── metadata.csv # GroundingDINO
-    └── data.yaml #YOLO
+dataset_biolit/
+├── images/
+└── labeled-images/
+    ├── train/
+    │   ├── images/
+    │   └── labels/
+    ├── valid/
+    │   ├── images/
+    │   └── labels/
+    └── data.yaml
 ```
 
+Poids entraînés → `runs/biolit_v2_bootstrap/weights/`
 
-## Sorties
-
-- Bboxes + classes : `dataset_biolit/exports/yolov8_detections.csv`
-- Images crops : `dataset_biolit/crops/images/`
-
-## Routage
+## Partie 2 — Fine-tuning (à venir)
 
-- si détection forte → **Classification**
-- si détection faible → **Label Studio (CROP)**
-- si pas de détection animal ou végétal → stop
+Pris en charge par un autre membre de l'équipe.
+Entraînement sur des images recadrées et annotées manuellement pour améliorer les performances du modèle bootstrap.