From b6a752261a916ce1948d87d8e350ede2ff28ea85 Mon Sep 17 00:00:00 2001 From: alexpetit Date: Sun, 1 Mar 2026 20:07:06 +0100 Subject: [PATCH 1/5] feat: ajout des fonctions pour l'ingestion API --- biolit/export_api.py | 65 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 biolit/export_api.py diff --git a/biolit/export_api.py b/biolit/export_api.py new file mode 100644 index 0000000..0697ba0 --- /dev/null +++ b/biolit/export_api.py @@ -0,0 +1,65 @@ +import polars as pl +import requests + +###Test export from API + +def fetch_biolit_api(per_page: int = 1000): + all_data = [] + page = 1 + print("Téléchargement des données depuis l'API Biolit...") + while True: + url = f"https://biolit.fr/wp-json/biolitapi/v1/observations/all?per_page={per_page}&page={page}" + r = requests.get(url) + data = r.json() + if not data: + break + all_data.extend(data) + page += 1 + print(f"Page {page} téléchargée, total observations : {len(all_data)}") + + if not all_data: + return pl.DataFrame([]) + df = pl.DataFrame(all_data) + print(df.head()) + print(df.shape) + return df + + +def adapt_api_to_parquet_schema(df): + return ( + df.rename({ + "id": "id", + "link": "lien", + "author": "auteur", + "date": "date", + "heure-debut": "heure-de-debut", + "heure-fin": "heure-de-fin", + "latitude": "latitude", + "longitude": "longitude", + "photos": "images", + "espece": "nom_scientifique", + "common": "nom_commun", + }) + .with_columns([ + pl.col("lien").str.split("/").list.get(-1).alias("titre"), # dernier segment du lien + pl.lit("").alias("validee"), + pl.lit("TBD").alias("espece_identifiable_?"), + pl.lit("API").alias("protocole"), + ]) + ) +def load_biolit_from_api() -> pl.DataFrame: + df_api = fetch_biolit_api() + if df_api.is_empty(): + return df_api + print(adapt_api_to_parquet_schema(df_api).head()) + print(adapt_api_to_parquet_schema(df_api).columns) + return adapt_api_to_parquet_schema(df_api) + +def format_observations_from_api(): +### A 
faire plus tard + return + + +if __name__ == "__main__": + print("Script lancé") + load_biolit_from_api() \ No newline at end of file From f2be0e8854d3e3f00cc593a60b54fcc46ff49b4b Mon Sep 17 00:00:00 2001 From: alexpetit Date: Sun, 15 Mar 2026 12:49:14 +0100 Subject: [PATCH 2/5] nouvelle API URL, modification des champs avec tous les champs de l'API, ajout de test(limite aux 5 premieres pages pour le moment) --- biolit/export_api.py | 165 ++++++++++++++++++++++++++------------- tests/test_export_api.py | 69 ++++++++++++++++ 2 files changed, 178 insertions(+), 56 deletions(-) create mode 100644 tests/test_export_api.py diff --git a/biolit/export_api.py b/biolit/export_api.py index 0697ba0..93ae428 100644 --- a/biolit/export_api.py +++ b/biolit/export_api.py @@ -1,65 +1,118 @@ -import polars as pl import requests +import polars as pl +import structlog + +LOGGER = structlog.get_logger() -###Test export from API +# ------------------------------ +# Helper pour récupérer une clé dans meta +# ------------------------------ +def get_meta(meta: dict, key: str): + """Retourne la première valeur d'une clé meta, ou None si absente""" + if not meta: + return None + value = meta.get(key) + if isinstance(value, list) and value: + return value[0] + return value -def fetch_biolit_api(per_page: int = 1000): + +# ------------------------------ +# FETCH API +# ------------------------------ +def fetch_biolit_from_api(per_page=100, max_pages=5): + """ + Récupère les observations depuis l'API Biolit. + Limite par défaut à max_pages pour éviter les 150+ pages. 
+ """ + url_base = "https://biolit.fr/wp-json/biolitapi/v1/observations" all_data = [] - page = 1 - print("Téléchargement des données depuis l'API Biolit...") - while True: - url = f"https://biolit.fr/wp-json/biolitapi/v1/observations/all?per_page={per_page}&page={page}" - r = requests.get(url) - data = r.json() + + for page in range(1, max_pages + 1): + url = f"{url_base}?per_page={per_page}&page={page}" + LOGGER.info(f"Fetching page {page} from API") + response = requests.get(url) + response.raise_for_status() + data = response.json() if not data: break all_data.extend(data) - page += 1 - print(f"Page {page} téléchargée, total observations : {len(all_data)}") - - if not all_data: - return pl.DataFrame([]) - df = pl.DataFrame(all_data) - print(df.head()) - print(df.shape) - return df + LOGGER.info(f"Fetched {len(all_data)} observations total") + return all_data + + +# ------------------------------ +# ADAPT API -> PARQUET +# ------------------------------ +def adapt_api_to_parquet_schema(data: list) -> pl.DataFrame: + """ + Transforme la structure API Biolit en DataFrame pour parquet. 
+ """ + rows = [] + + for item in data: + obs = item.get("observation", {}) + meta = obs.get("meta", {}) + parents = item.get("parents", {}) + especes = item.get("especes", []) + + quadra = parents.get("quadra", {}) + abb = parents.get("abb", {}) + quadra_meta = quadra.get("meta", {}) + abb_meta = abb.get("meta", {}) + + # Gestion des espèces + nom_scientifique = None + nom_commun = None + nombre_mollusques = None + + if especes: + nom_scientifique = especes[0].get("nom") + nombre_mollusques = especes[0].get("nombre_presents") + + row = { + # Niveau N1 (Quadra) + "protocole": get_meta(meta, "jet_tax__protocole"), + "ID - N1": quadra.get("ID"), + "titre - N1": quadra.get("title"), + "lien - N1": get_meta(meta, "_url_sortie"), + "auteur - N1": None, + "images - N1": None, + "date - N1": obs.get("date"), + "heure-de-debut - N1": get_meta(meta, "heure-debut"), + "heure-de-fin - N1": get_meta(meta, "heure-fin"), + "latitude - N1": get_meta(meta, "latitude"), + "longitude - N1": get_meta(meta, "longitude"), + "relais-local - N1": get_meta(abb_meta, "relais-local"), + "nom du lieu - N1": get_meta(abb_meta, "nom-du-lieu-abb"), + + # Observation + "ID - observation": obs.get("ID"), + "titre - observation": obs.get("title"), + "lien - observation": obs.get("link"), + "Nom scientifique - observation": nom_scientifique, + "Nom commun - observation": nom_commun, + "programme espèce": get_meta(meta, "jet_tax__categorie-programme"), + "images - observation": obs.get("images"), + "nombre de mollusques - observation": nombre_mollusques, + "validee - observation": get_meta(meta, "validee"), + "espece identifiable ? - observation": get_meta(meta, "espece-identifiee"), + } + + rows.append(row) + + return pl.DataFrame(rows) + + +# ------------------------------ +# LOAD (Fetch + Adapt) +# ------------------------------ +def load_biolit_from_api(per_page=100, max_pages=5) -> pl.DataFrame: + """ + Récupère et transforme les données Biolit depuis l'API. 
+ """ + raw_data = fetch_biolit_from_api(per_page=per_page, max_pages=max_pages) + df = adapt_api_to_parquet_schema(raw_data) + return df -def adapt_api_to_parquet_schema(df): - return ( - df.rename({ - "id": "id", - "link": "lien", - "author": "auteur", - "date": "date", - "heure-debut": "heure-de-debut", - "heure-fin": "heure-de-fin", - "latitude": "latitude", - "longitude": "longitude", - "photos": "images", - "espece": "nom_scientifique", - "common": "nom_commun", - }) - .with_columns([ - pl.col("lien").str.split("/").list.get(-1).alias("titre"), # dernier segment du lien - pl.lit("").alias("validee"), - pl.lit("TBD").alias("espece_identifiable_?"), - pl.lit("API").alias("protocole"), - ]) - ) -def load_biolit_from_api() -> pl.DataFrame: - df_api = fetch_biolit_api() - if df_api.is_empty(): - return df_api - print(adapt_api_to_parquet_schema(df_api).head()) - print(adapt_api_to_parquet_schema(df_api).columns) - return adapt_api_to_parquet_schema(df_api) - -def format_observations_from_api(): -### A faire plus tard - return - - -if __name__ == "__main__": - print("Script lancé") - load_biolit_from_api() \ No newline at end of file diff --git a/tests/test_export_api.py b/tests/test_export_api.py new file mode 100644 index 0000000..0cea68a --- /dev/null +++ b/tests/test_export_api.py @@ -0,0 +1,69 @@ + +from biolit.export_api import fetch_biolit_from_api, load_biolit_from_api, adapt_api_to_parquet_schema + +# ------------------------- +# Fonctions d'inspection +# ------------------------- + +def inspect_api_structure(raw_data): + """Affiche les clés du niveau supérieur dans l'API""" + keys = set() + for item in raw_data: + keys.update(item.keys()) + print("TOP LEVEL KEYS:") + for k in sorted(keys): + print("-", k) + +def inspect_meta_keys(raw_data): + """Affiche toutes les clés présentes dans le champ 'meta' de chaque observation""" + meta_keys = set() + for item in raw_data: + if "observation" not in item: + continue + meta = item["observation"].get("meta", {}) + 
meta_keys.update(meta.keys()) + print("META KEYS:") + for k in sorted(meta_keys): + print("-", k) + +def inspect_meta_values(raw_data, field): + """Affiche les valeurs et leur quantité pour un champ du meta""" + values = {} + for item in raw_data: + if "observation" not in item: + continue + meta = item["observation"].get("meta", {}) + val = meta.get(field) + if isinstance(val, list) and val: + val = val[0] + values[val] = values.get(val, 0) + 1 + print(f"\nVALUES FOR {field}:") + for k, v in values.items(): + print(k, ":", v) + +# ------------------------- +# Tests +# ------------------------- + +def test_fetch_small_sample(): + """Test fetch API avec un petit nombre de pages pour debug""" + raw_data = fetch_biolit_from_api(per_page=1000, max_pages=5) # petit sample + assert isinstance(raw_data, list) + print(f"\nNombre total d'éléments récupérés: {len(raw_data)}") + inspect_api_structure(raw_data) + inspect_meta_keys(raw_data) + inspect_meta_values(raw_data, "validee") + +def test_load_and_adapt(): + """Test l'adaptation de l'API vers le schéma parquet""" + raw_data = fetch_biolit_from_api(per_page=1000, max_pages=5) + df = adapt_api_to_parquet_schema(raw_data) + print(f"\nDataframe adapté avec {len(df)} lignes") + print(df.head(10)) # affiche les 10 premières lignes pour vérification + + +if __name__ == "__main__": + print("=== TEST FETCH SMALL SAMPLE ===") + test_fetch_small_sample() + print("\n=== TEST LOAD AND ADAPT ===") + test_load_and_adapt() \ No newline at end of file From a04b0ed253b024c7d2e9c33510d96381d6964452 Mon Sep 17 00:00:00 2001 From: alexpetit Date: Tue, 17 Mar 2026 21:38:07 +0100 Subject: [PATCH 3/5] Fix pre-commit issues and update API processing --- biolit/export_api.py | 1 - tests/test_export_api.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/biolit/export_api.py b/biolit/export_api.py index 93ae428..f3a99a0 100644 --- a/biolit/export_api.py +++ b/biolit/export_api.py @@ -59,7 +59,6 @@ def 
adapt_api_to_parquet_schema(data: list) -> pl.DataFrame: quadra = parents.get("quadra", {}) abb = parents.get("abb", {}) - quadra_meta = quadra.get("meta", {}) abb_meta = abb.get("meta", {}) # Gestion des espèces diff --git a/tests/test_export_api.py b/tests/test_export_api.py index 0cea68a..8560417 100644 --- a/tests/test_export_api.py +++ b/tests/test_export_api.py @@ -1,5 +1,5 @@ -from biolit.export_api import fetch_biolit_from_api, load_biolit_from_api, adapt_api_to_parquet_schema +from biolit.export_api import fetch_biolit_from_api, adapt_api_to_parquet_schema # ------------------------- # Fonctions d'inspection From b9414c9a09b4f2fd11505fafcd89dbe7ee406341 Mon Sep 17 00:00:00 2001 From: alexpetit Date: Sun, 22 Mar 2026 17:03:53 +0100 Subject: [PATCH 4/5] Changement de l'API + ajout de postgre --- biolit/export_api.py | 152 ++++++++++++----------------- biolit/postgres.py | 124 +++++++++++++++++++++++ ml/yolov8_DINO/README.md | 2 +- pipelines/run.py | 22 +++++ pyproject.toml | 3 + tests/test_export_api.py | 206 ++++++++++++++++++++++++++++----------- uv.lock | 90 +++++++++++++++++ 7 files changed, 452 insertions(+), 147 deletions(-) create mode 100644 biolit/postgres.py create mode 100644 pipelines/run.py diff --git a/biolit/export_api.py b/biolit/export_api.py index f3a99a0..6c4a510 100644 --- a/biolit/export_api.py +++ b/biolit/export_api.py @@ -1,117 +1,89 @@ import requests import polars as pl import structlog +import re +import os LOGGER = structlog.get_logger() # ------------------------------ -# Helper pour récupérer une clé dans meta +# FETCH API # ------------------------------ -def get_meta(meta: dict, key: str): - """Retourne la première valeur d'une clé meta, ou None si absente""" - if not meta: - return None - value = meta.get(key) - if isinstance(value, list) and value: - return value[0] - return value +def fetch_biolit_from_api(): + + url = os.getenv("BIOLIT_API_URL") + + response = requests.get(url) + response.raise_for_status() + data = 
response.json() + + print(f"{len(data)} observations récupérées") + return data # ------------------------------ -# FETCH API +# RENAME OF COLUMNS # ------------------------------ -def fetch_biolit_from_api(per_page=100, max_pages=5): - """ - Récupère les observations depuis l'API Biolit. - Limite par défaut à max_pages pour éviter les 150+ pages. - """ - url_base = "https://biolit.fr/wp-json/biolitapi/v1/observations" - all_data = [] - - for page in range(1, max_pages + 1): - url = f"{url_base}?per_page={per_page}&page={page}" - LOGGER.info(f"Fetching page {page} from API") - response = requests.get(url) - response.raise_for_status() - data = response.json() - if not data: - break - all_data.extend(data) - - LOGGER.info(f"Fetched {len(all_data)} observations total") - return all_data + + +def normalize_column_name(col: str) -> str: + """Convertit les noms API en snake_case propre FR""" + col = col.lower() + col = col.replace("-", "_") + col = col.replace(" ", "_") + col = col.replace("é", "e").replace("è", "e").replace("à", "a") + col = col.replace("ù", "u").replace("ô", "o") + col = re.sub(r"[^a-z0-9_]", "", col) + return col + + +COLUMN_MAPPING = { + "id": "id_observation", + "date": "date_observation", + "link": "lien_observation", + "author": "observateur", + "_url_sortie": "url_sortie", + "espece-identifiee": "espece_identifiee", + "heure-debut": "heure_debut", + "heure-fin": "heure_fin", + "latitude": "latitude", + "longitude": "longitude", + "photos": "photos", + "relais": "relais", + "espece_id": "id_espece", + "espece": "nom_scientifique", + "common": "nom_commun", + "categorie-programme": "categorie_programme", + "programme": "programme", +} # ------------------------------ # ADAPT API -> PARQUET # ------------------------------ -def adapt_api_to_parquet_schema(data: list) -> pl.DataFrame: - """ - Transforme la structure API Biolit en DataFrame pour parquet. 
- """ +def adapt_api_to_dataframe(data: list) -> pl.DataFrame: rows = [] for item in data: - obs = item.get("observation", {}) - meta = obs.get("meta", {}) - parents = item.get("parents", {}) - especes = item.get("especes", []) - - quadra = parents.get("quadra", {}) - abb = parents.get("abb", {}) - abb_meta = abb.get("meta", {}) - - # Gestion des espèces - nom_scientifique = None - nom_commun = None - nombre_mollusques = None - - if especes: - nom_scientifique = especes[0].get("nom") - nombre_mollusques = especes[0].get("nombre_presents") - - row = { - # Niveau N1 (Quadra) - "protocole": get_meta(meta, "jet_tax__protocole"), - "ID - N1": quadra.get("ID"), - "titre - N1": quadra.get("title"), - "lien - N1": get_meta(meta, "_url_sortie"), - "auteur - N1": None, - "images - N1": None, - "date - N1": obs.get("date"), - "heure-de-debut - N1": get_meta(meta, "heure-debut"), - "heure-de-fin - N1": get_meta(meta, "heure-fin"), - "latitude - N1": get_meta(meta, "latitude"), - "longitude - N1": get_meta(meta, "longitude"), - "relais-local - N1": get_meta(abb_meta, "relais-local"), - "nom du lieu - N1": get_meta(abb_meta, "nom-du-lieu-abb"), - - # Observation - "ID - observation": obs.get("ID"), - "titre - observation": obs.get("title"), - "lien - observation": obs.get("link"), - "Nom scientifique - observation": nom_scientifique, - "Nom commun - observation": nom_commun, - "programme espèce": get_meta(meta, "jet_tax__categorie-programme"), - "images - observation": obs.get("images"), - "nombre de mollusques - observation": nombre_mollusques, - "validee - observation": get_meta(meta, "validee"), - "espece identifiable ? 
- observation": get_meta(meta, "espece-identifiee"), - } - - rows.append(row) - - return pl.DataFrame(rows) + new_row = {} + + for key, value in item.items(): + # mapping si connu, sinon normalisation auto + new_key = COLUMN_MAPPING.get(key, normalize_column_name(key)) + new_row[new_key] = value + + rows.append(new_row) + + df = pl.DataFrame(rows) + + return df # ------------------------------ # LOAD (Fetch + Adapt) # ------------------------------ -def load_biolit_from_api(per_page=100, max_pages=5) -> pl.DataFrame: - """ - Récupère et transforme les données Biolit depuis l'API. - """ - raw_data = fetch_biolit_from_api(per_page=per_page, max_pages=max_pages) - df = adapt_api_to_parquet_schema(raw_data) +def load_biolit_from_api() -> pl.DataFrame: + raw_data = fetch_biolit_from_api() + df = adapt_api_to_dataframe(raw_data) return df diff --git a/biolit/postgres.py b/biolit/postgres.py new file mode 100644 index 0000000..247df20 --- /dev/null +++ b/biolit/postgres.py @@ -0,0 +1,124 @@ +import os +import polars as pl +from sqlalchemy import create_engine, text + + +# ------------------------- +# Connexion DB +# ------------------------- + +def get_engine(): + postgres_url = os.getenv("POSTGRES_URL") + + if not postgres_url: + raise ValueError("Missing POSTGRES_URL") + + return create_engine(postgres_url) + + +# ------------------------- +# Préparation des données +# ------------------------- +def prepare_dataframe_for_postgres(df: pl.DataFrame) -> pl.DataFrame: + return df.with_columns([ + + # ------------------------- + # IDs + # ------------------------- + pl.col("id_observation") + .cast(pl.Int64), + + pl.col("id_espece") + .cast(pl.Float64, strict=False) + .fill_nan(None) + .cast(pl.Int64, strict=False), + + pl.col("categorie_programme") + .cast(pl.Float64, strict=False) + .fill_nan(None) + .cast(pl.Int64, strict=False), + + pl.col("relais") + .cast(pl.Utf8) + .replace("", None) + .cast(pl.Float64, strict=False) + .fill_nan(None) + .cast(pl.Int64, strict=False), 
+ + # ------------------------- + # Coordonnées + # ------------------------- + pl.col("latitude") + .cast(pl.Utf8) + .str.strip_chars() + .cast(pl.Float64, strict=False), + + pl.col("longitude") + .cast(pl.Utf8) + .str.strip_chars() + .cast(pl.Float64, strict=False), + + # ------------------------- + # Dates + # ------------------------- + pl.col("date_observation") + .str.strptime(pl.Datetime, strict=False), + + pl.col("heure_debut") + .str.strptime(pl.Time, strict=False), + + pl.col("heure_fin") + .str.strptime(pl.Time, strict=False), + ]) + +# ------------------------- +# Insert avec sécurité (UPSERT) +# ------------------------- + +def insert_dataframe(df: pl.DataFrame): + engine = get_engine() + + rows = df.to_dicts() + + with engine.begin() as conn: + for row in rows: + conn.execute(text(""" + INSERT INTO observations ( + id_observation, + date_observation, + lien_observation, + observateur, + url_sortie, + espece_identifiee, + heure_debut, + heure_fin, + latitude, + longitude, + photos, + relais, + id_espece, + nom_scientifique, + nom_commun, + categorie_programme, + programme + ) VALUES ( + :id_observation, + :date_observation, + :lien_observation, + :observateur, + :url_sortie, + :espece_identifiee, + :heure_debut, + :heure_fin, + :latitude, + :longitude, + :photos, + :relais, + :id_espece, + :nom_scientifique, + :nom_commun, + :categorie_programme, + :programme + ) + ON CONFLICT (id_observation) DO NOTHING + """), row) \ No newline at end of file diff --git a/ml/yolov8_DINO/README.md b/ml/yolov8_DINO/README.md index 64781c8..7005c4b 100644 --- a/ml/yolov8_DINO/README.md +++ b/ml/yolov8_DINO/README.md @@ -16,7 +16,7 @@ Pipeline de constitution du dataset Biolit pour inférence YOLO / Grounding DINO ├── images/ │ ├── identifiable/ │ └── non_identifiable/ # à valider - ├── labels/ + ├── labels/ │ ├── identifiable/ │ └── non_identifiable/ # à valider ├── metadata.csv # GroundingDINO diff --git a/pipelines/run.py b/pipelines/run.py new file mode 100644 index 
0000000..e63d662 --- /dev/null +++ b/pipelines/run.py @@ -0,0 +1,22 @@ +from biolit.export_api import fetch_biolit_from_api, adapt_api_to_dataframe +from biolit.postgres import prepare_dataframe_for_postgres, insert_dataframe + + +def run_pipeline(): + print("Fetching data...") + data = fetch_biolit_from_api() + + print("Transforming...") + df = adapt_api_to_dataframe(data) + + print("Preparing for Postgres...") + df = prepare_dataframe_for_postgres(df) + + print("Loading into Postgres...") + insert_dataframe(df) + + print("DONE ✅") + + +if __name__ == "__main__": + run_pipeline() \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 2536fd8..9df7cb0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,9 +21,12 @@ dependencies = [ "plotly>=6.5.0", "polars>=1.36.1", "pre-commit>=4.5.1", + "psycopg2-binary>=2.9.11", + "pyarrow>=23.0.1", "pytest>=9.0.2", "requests>=2.32.3", "ruff>=0.14.10", + "sqlalchemy>=2.0.44", "structlog>=25.5.0", "tqdm>=4.67.3", "transformers>=5.2.0", diff --git a/tests/test_export_api.py b/tests/test_export_api.py index 8560417..19ca0f0 100644 --- a/tests/test_export_api.py +++ b/tests/test_export_api.py @@ -1,69 +1,163 @@ +from biolit.export_api import fetch_biolit_from_api, adapt_api_to_dataframe + +# ------------------------- +# Tests API +# ------------------------- + +def test_fetch_api_returns_data(): + """Vérifie que l'API retourne bien des données""" + data = fetch_biolit_from_api() + + assert isinstance(data, list) + assert len(data) > 0 + + print(f"\n✅ {len(data)} observations récupérées") + + +def test_fetch_api_structure(): + """Vérifie la structure des données API""" + data = fetch_biolit_from_api() + sample = data[0] + + expected_keys = { + "id", + "date", + "link", + "author", + "_url_sortie", + "espece-identifiee", + "heure-debut", + "heure-fin", + "latitude", + "longitude", + "photos", + "relais", + "espece_id", + "espece", + "common", + } + + missing_keys = expected_keys - set(sample.keys()) + + 
assert len(missing_keys) == 0, f"Champs manquants: {missing_keys}" + + print("\n✅ Structure API valide") + + +# ------------------------- +# Tests transformation +# ------------------------- + +def test_adapt_to_dataframe(): + """Vérifie la transformation en DataFrame""" + data = fetch_biolit_from_api() + df = adapt_api_to_dataframe(data) + + assert df.shape[0] > 0 + assert df.shape[1] > 0 + + print(f"\n✅ DataFrame: {df.shape[0]} lignes, {df.shape[1]} colonnes") + + +def test_expected_columns_present(): + """Vérifie les colonnes critiques""" + data = fetch_biolit_from_api() + df = adapt_api_to_dataframe(data) + + expected_columns = { + "id_observation", + "date_observation", + "nom_scientifique", + "nom_commun", + "latitude", + "longitude", + } + + missing = expected_columns - set(df.columns) + + assert len(missing) == 0, f"Colonnes manquantes: {missing}" + + print("\n✅ Colonnes critiques présentes") -from biolit.export_api import fetch_biolit_from_api, adapt_api_to_parquet_schema # ------------------------- -# Fonctions d'inspection +# Tests qualité des données # ------------------------- -def inspect_api_structure(raw_data): - """Affiche les clés du niveau supérieur dans l'API""" - keys = set() - for item in raw_data: - keys.update(item.keys()) - print("TOP LEVEL KEYS:") - for k in sorted(keys): - print("-", k) - -def inspect_meta_keys(raw_data): - """Affiche toutes les clés présentes dans le champ 'meta' de chaque observation""" - meta_keys = set() - for item in raw_data: - if "observation" not in item: - continue - meta = item["observation"].get("meta", {}) - meta_keys.update(meta.keys()) - print("META KEYS:") - for k in sorted(meta_keys): - print("-", k) - -def inspect_meta_values(raw_data, field): - """Affiche les valeurs et leur quantité pour un champ du meta""" - values = {} - for item in raw_data: - if "observation" not in item: - continue - meta = item["observation"].get("meta", {}) - val = meta.get(field) - if isinstance(val, list) and val: - val = 
val[0] - values[val] = values.get(val, 0) + 1 - print(f"\nVALUES FOR {field}:") - for k, v in values.items(): - print(k, ":", v) +def test_unique_ids(): + """Vérifie qu'il n'y a pas de doublons""" + data = fetch_biolit_from_api() + df = adapt_api_to_dataframe(data) + + total = df.shape[0] + unique = df.select("id_observation").n_unique() + + assert total == unique, "Doublons détectés sur id_observation" + + print("\n✅ Pas de doublons") + + +def test_no_null_coordinates(): + """Vérifie que les coordonnées sont présentes""" + data = fetch_biolit_from_api() + df = adapt_api_to_dataframe(data) + + null_lat = df.filter(df["latitude"].is_null()).shape[0] + null_lon = df.filter(df["longitude"].is_null()).shape[0] + + print(f"\nNull latitude: {null_lat}") + print(f"Null longitude: {null_lon}") + + # tolérance possible, donc pas assert strict + assert null_lat < df.shape[0] + assert null_lon < df.shape[0] + + +def test_id_is_numeric(): + """Vérifie que les IDs sont bien numériques""" + data = fetch_biolit_from_api() + + ids = [item["id"] for item in data] + + # doit pouvoir être cast en int + for i in ids[:100]: # test sur sample + int(i) + + print("\n✅ IDs valides") + # ------------------------- -# Tests +# Test global pipeline # ------------------------- -def test_fetch_small_sample(): - """Test fetch API avec un petit nombre de pages pour debug""" - raw_data = fetch_biolit_from_api(per_page=1000, max_pages=5) # petit sample - assert isinstance(raw_data, list) - print(f"\nNombre total d'éléments récupérés: {len(raw_data)}") - inspect_api_structure(raw_data) - inspect_meta_keys(raw_data) - inspect_meta_values(raw_data, "validee") +def test_full_pipeline(): + """Test end-to-end""" + data = fetch_biolit_from_api() + df = adapt_api_to_dataframe(data) + + assert df.shape[0] > 0 -def test_load_and_adapt(): - """Test l'adaptation de l'API vers le schéma parquet""" - raw_data = fetch_biolit_from_api(per_page=1000, max_pages=5) - df = adapt_api_to_parquet_schema(raw_data) - 
print(f"\nDataframe adapté avec {len(df)} lignes") - print(df.head(10)) # affiche les 10 premières lignes pour vérification + print("\n=== PIPELINE OK ===") + print(df.head(5)) +# ------------------------- +# Execution directe +# ------------------------- + if __name__ == "__main__": - print("=== TEST FETCH SMALL SAMPLE ===") - test_fetch_small_sample() - print("\n=== TEST LOAD AND ADAPT ===") - test_load_and_adapt() \ No newline at end of file + print("=== TEST FETCH API ===") + test_fetch_api_returns_data() + test_fetch_api_structure() + + print("\n=== TEST TRANSFORMATION ===") + test_adapt_to_dataframe() + test_expected_columns_present() + + print("\n=== TEST QUALITE ===") + test_unique_ids() + test_no_null_coordinates() + test_id_is_numeric() + + print("\n=== TEST PIPELINE COMPLET ===") + test_full_pipeline() \ No newline at end of file diff --git a/uv.lock b/uv.lock index 91483cd..a43bb38 100644 --- a/uv.lock +++ b/uv.lock @@ -27,9 +27,12 @@ dependencies = [ { name = "plotly" }, { name = "polars" }, { name = "pre-commit" }, + { name = "psycopg2-binary" }, + { name = "pyarrow" }, { name = "pytest" }, { name = "requests" }, { name = "ruff" }, + { name = "sqlalchemy" }, { name = "structlog" }, { name = "tqdm" }, { name = "transformers" }, @@ -53,9 +56,12 @@ requires-dist = [ { name = "plotly", specifier = ">=6.5.0" }, { name = "polars", specifier = ">=1.36.1" }, { name = "pre-commit", specifier = ">=4.5.1" }, + { name = "psycopg2-binary", specifier = ">=2.9.11" }, + { name = "pyarrow", specifier = ">=23.0.1" }, { name = "pytest", specifier = ">=9.0.2" }, { name = "requests", specifier = ">=2.32.3" }, { name = "ruff", specifier = ">=0.14.10" }, + { name = "sqlalchemy", specifier = ">=2.0.44" }, { name = "structlog", specifier = ">=25.5.0" }, { name = "tqdm", specifier = ">=4.67.3" }, { name = "transformers", specifier = ">=5.2.0" }, @@ -2053,6 +2059,47 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/c9/ad/33b2ccec09bf96c2b2ef3f9a6f66baac8253d7565d8839e024a6b905d45d/psutil-7.1.3-cp37-abi3-win_arm64.whl", hash = "sha256:bd0d69cee829226a761e92f28140bec9a5ee9d5b4fb4b0cc589068dbfff559b1", size = 244608, upload-time = "2025-11-02T12:26:36.136Z" }, ] +[[package]] +name = "psycopg2-binary" +version = "2.9.11" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ac/6c/8767aaa597ba424643dc87348c6f1754dd9f48e80fdc1b9f7ca5c3a7c213/psycopg2-binary-2.9.11.tar.gz", hash = "sha256:b6aed9e096bf63f9e75edf2581aa9a7e7186d97ab5c177aa6c87797cd591236c", size = 379620, upload-time = "2025-10-10T11:14:48.041Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d8/91/f870a02f51be4a65987b45a7de4c2e1897dd0d01051e2b559a38fa634e3e/psycopg2_binary-2.9.11-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:be9b840ac0525a283a96b556616f5b4820e0526addb8dcf6525a0fa162730be4", size = 3756603, upload-time = "2025-10-10T11:11:52.213Z" }, + { url = "https://files.pythonhosted.org/packages/27/fa/cae40e06849b6c9a95eb5c04d419942f00d9eaac8d81626107461e268821/psycopg2_binary-2.9.11-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f090b7ddd13ca842ebfe301cd587a76a4cf0913b1e429eb92c1be5dbeb1a19bc", size = 3864509, upload-time = "2025-10-10T11:11:56.452Z" }, + { url = "https://files.pythonhosted.org/packages/2d/75/364847b879eb630b3ac8293798e380e441a957c53657995053c5ec39a316/psycopg2_binary-2.9.11-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ab8905b5dcb05bf3fb22e0cf90e10f469563486ffb6a96569e51f897c750a76a", size = 4411159, upload-time = "2025-10-10T11:12:00.49Z" }, + { url = "https://files.pythonhosted.org/packages/6f/a0/567f7ea38b6e1c62aafd58375665a547c00c608a471620c0edc364733e13/psycopg2_binary-2.9.11-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:bf940cd7e7fec19181fdbc29d76911741153d51cab52e5c21165f3262125685e", size = 4468234, 
upload-time = "2025-10-10T11:12:04.892Z" }, + { url = "https://files.pythonhosted.org/packages/30/da/4e42788fb811bbbfd7b7f045570c062f49e350e1d1f3df056c3fb5763353/psycopg2_binary-2.9.11-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fa0f693d3c68ae925966f0b14b8edda71696608039f4ed61b1fe9ffa468d16db", size = 4166236, upload-time = "2025-10-10T11:12:11.674Z" }, + { url = "https://files.pythonhosted.org/packages/3c/94/c1777c355bc560992af848d98216148be5f1be001af06e06fc49cbded578/psycopg2_binary-2.9.11-cp312-cp312-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:a1cf393f1cdaf6a9b57c0a719a1068ba1069f022a59b8b1fe44b006745b59757", size = 3983083, upload-time = "2025-10-30T02:55:15.73Z" }, + { url = "https://files.pythonhosted.org/packages/bd/42/c9a21edf0e3daa7825ed04a4a8588686c6c14904344344a039556d78aa58/psycopg2_binary-2.9.11-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ef7a6beb4beaa62f88592ccc65df20328029d721db309cb3250b0aae0fa146c3", size = 3652281, upload-time = "2025-10-10T11:12:17.713Z" }, + { url = "https://files.pythonhosted.org/packages/12/22/dedfbcfa97917982301496b6b5e5e6c5531d1f35dd2b488b08d1ebc52482/psycopg2_binary-2.9.11-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:31b32c457a6025e74d233957cc9736742ac5a6cb196c6b68499f6bb51390bd6a", size = 3298010, upload-time = "2025-10-10T11:12:22.671Z" }, + { url = "https://files.pythonhosted.org/packages/66/ea/d3390e6696276078bd01b2ece417deac954dfdd552d2edc3d03204416c0c/psycopg2_binary-2.9.11-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:edcb3aeb11cb4bf13a2af3c53a15b3d612edeb6409047ea0b5d6a21a9d744b34", size = 3044641, upload-time = "2025-10-30T02:55:19.929Z" }, + { url = "https://files.pythonhosted.org/packages/12/9a/0402ded6cbd321da0c0ba7d34dc12b29b14f5764c2fc10750daa38e825fc/psycopg2_binary-2.9.11-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:62b6d93d7c0b61a1dd6197d208ab613eb7dcfdcca0a49c42ceb082257991de9d", size = 3347940, upload-time = 
"2025-10-10T11:12:26.529Z" }, + { url = "https://files.pythonhosted.org/packages/b1/d2/99b55e85832ccde77b211738ff3925a5d73ad183c0b37bcbbe5a8ff04978/psycopg2_binary-2.9.11-cp312-cp312-win_amd64.whl", hash = "sha256:b33fabeb1fde21180479b2d4667e994de7bbf0eec22832ba5d9b5e4cf65b6c6d", size = 2714147, upload-time = "2025-10-10T11:12:29.535Z" }, + { url = "https://files.pythonhosted.org/packages/ff/a8/a2709681b3ac11b0b1786def10006b8995125ba268c9a54bea6f5ae8bd3e/psycopg2_binary-2.9.11-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:b8fb3db325435d34235b044b199e56cdf9ff41223a4b9752e8576465170bb38c", size = 3756572, upload-time = "2025-10-10T11:12:32.873Z" }, + { url = "https://files.pythonhosted.org/packages/62/e1/c2b38d256d0dafd32713e9f31982a5b028f4a3651f446be70785f484f472/psycopg2_binary-2.9.11-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:366df99e710a2acd90efed3764bb1e28df6c675d33a7fb40df9b7281694432ee", size = 3864529, upload-time = "2025-10-10T11:12:36.791Z" }, + { url = "https://files.pythonhosted.org/packages/11/32/b2ffe8f3853c181e88f0a157c5fb4e383102238d73c52ac6d93a5c8bffe6/psycopg2_binary-2.9.11-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8c55b385daa2f92cb64b12ec4536c66954ac53654c7f15a203578da4e78105c0", size = 4411242, upload-time = "2025-10-10T11:12:42.388Z" }, + { url = "https://files.pythonhosted.org/packages/10/04/6ca7477e6160ae258dc96f67c371157776564679aefd247b66f4661501a2/psycopg2_binary-2.9.11-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:c0377174bf1dd416993d16edc15357f6eb17ac998244cca19bc67cdc0e2e5766", size = 4468258, upload-time = "2025-10-10T11:12:48.654Z" }, + { url = "https://files.pythonhosted.org/packages/3c/7e/6a1a38f86412df101435809f225d57c1a021307dd0689f7a5e7fe83588b1/psycopg2_binary-2.9.11-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5c6ff3335ce08c75afaed19e08699e8aacf95d4a260b495a4a8545244fe2ceb3", size = 4166295, upload-time = 
"2025-10-10T11:12:52.525Z" }, + { url = "https://files.pythonhosted.org/packages/f2/7d/c07374c501b45f3579a9eb761cbf2604ddef3d96ad48679112c2c5aa9c25/psycopg2_binary-2.9.11-cp313-cp313-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:84011ba3109e06ac412f95399b704d3d6950e386b7994475b231cf61eec2fc1f", size = 3983133, upload-time = "2025-10-30T02:55:24.329Z" }, + { url = "https://files.pythonhosted.org/packages/82/56/993b7104cb8345ad7d4516538ccf8f0d0ac640b1ebd8c754a7b024e76878/psycopg2_binary-2.9.11-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ba34475ceb08cccbdd98f6b46916917ae6eeb92b5ae111df10b544c3a4621dc4", size = 3652383, upload-time = "2025-10-10T11:12:56.387Z" }, + { url = "https://files.pythonhosted.org/packages/2d/ac/eaeb6029362fd8d454a27374d84c6866c82c33bfc24587b4face5a8e43ef/psycopg2_binary-2.9.11-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:b31e90fdd0f968c2de3b26ab014314fe814225b6c324f770952f7d38abf17e3c", size = 3298168, upload-time = "2025-10-10T11:13:00.403Z" }, + { url = "https://files.pythonhosted.org/packages/2b/39/50c3facc66bded9ada5cbc0de867499a703dc6bca6be03070b4e3b65da6c/psycopg2_binary-2.9.11-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:d526864e0f67f74937a8fce859bd56c979f5e2ec57ca7c627f5f1071ef7fee60", size = 3044712, upload-time = "2025-10-30T02:55:27.975Z" }, + { url = "https://files.pythonhosted.org/packages/9c/8e/b7de019a1f562f72ada81081a12823d3c1590bedc48d7d2559410a2763fe/psycopg2_binary-2.9.11-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:04195548662fa544626c8ea0f06561eb6203f1984ba5b4562764fbeb4c3d14b1", size = 3347549, upload-time = "2025-10-10T11:13:03.971Z" }, + { url = "https://files.pythonhosted.org/packages/80/2d/1bb683f64737bbb1f86c82b7359db1eb2be4e2c0c13b947f80efefa7d3e5/psycopg2_binary-2.9.11-cp313-cp313-win_amd64.whl", hash = "sha256:efff12b432179443f54e230fdf60de1f6cc726b6c832db8701227d089310e8aa", size = 2714215, upload-time = "2025-10-10T11:13:07.14Z" }, + { url = 
"https://files.pythonhosted.org/packages/64/12/93ef0098590cf51d9732b4f139533732565704f45bdc1ffa741b7c95fb54/psycopg2_binary-2.9.11-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:92e3b669236327083a2e33ccfa0d320dd01b9803b3e14dd986a4fc54aa00f4e1", size = 3756567, upload-time = "2025-10-10T11:13:11.885Z" }, + { url = "https://files.pythonhosted.org/packages/7c/a9/9d55c614a891288f15ca4b5209b09f0f01e3124056924e17b81b9fa054cc/psycopg2_binary-2.9.11-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:e0deeb03da539fa3577fcb0b3f2554a97f7e5477c246098dbb18091a4a01c16f", size = 3864755, upload-time = "2025-10-10T11:13:17.727Z" }, + { url = "https://files.pythonhosted.org/packages/13/1e/98874ce72fd29cbde93209977b196a2edae03f8490d1bd8158e7f1daf3a0/psycopg2_binary-2.9.11-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:9b52a3f9bb540a3e4ec0f6ba6d31339727b2950c9772850d6545b7eae0b9d7c5", size = 4411646, upload-time = "2025-10-10T11:13:24.432Z" }, + { url = "https://files.pythonhosted.org/packages/5a/bd/a335ce6645334fb8d758cc358810defca14a1d19ffbc8a10bd38a2328565/psycopg2_binary-2.9.11-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:db4fd476874ccfdbb630a54426964959e58da4c61c9feba73e6094d51303d7d8", size = 4468701, upload-time = "2025-10-10T11:13:29.266Z" }, + { url = "https://files.pythonhosted.org/packages/44/d6/c8b4f53f34e295e45709b7568bf9b9407a612ea30387d35eb9fa84f269b4/psycopg2_binary-2.9.11-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:47f212c1d3be608a12937cc131bd85502954398aaa1320cb4c14421a0ffccf4c", size = 4166293, upload-time = "2025-10-10T11:13:33.336Z" }, + { url = "https://files.pythonhosted.org/packages/4b/e0/f8cc36eadd1b716ab36bb290618a3292e009867e5c97ce4aba908cb99644/psycopg2_binary-2.9.11-cp314-cp314-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e35b7abae2b0adab776add56111df1735ccc71406e56203515e228a8dc07089f", size = 3983184, upload-time = 
"2025-10-30T02:55:32.483Z" }, + { url = "https://files.pythonhosted.org/packages/53/3e/2a8fe18a4e61cfb3417da67b6318e12691772c0696d79434184a511906dc/psycopg2_binary-2.9.11-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:fcf21be3ce5f5659daefd2b3b3b6e4727b028221ddc94e6c1523425579664747", size = 3652650, upload-time = "2025-10-10T11:13:38.181Z" }, + { url = "https://files.pythonhosted.org/packages/76/36/03801461b31b29fe58d228c24388f999fe814dfc302856e0d17f97d7c54d/psycopg2_binary-2.9.11-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:9bd81e64e8de111237737b29d68039b9c813bdf520156af36d26819c9a979e5f", size = 3298663, upload-time = "2025-10-10T11:13:44.878Z" }, + { url = "https://files.pythonhosted.org/packages/97/77/21b0ea2e1a73aa5fa9222b2a6b8ba325c43c3a8d54272839c991f2345656/psycopg2_binary-2.9.11-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:32770a4d666fbdafab017086655bcddab791d7cb260a16679cc5a7338b64343b", size = 3044737, upload-time = "2025-10-30T02:55:35.69Z" }, + { url = "https://files.pythonhosted.org/packages/67/69/f36abe5f118c1dca6d3726ceae164b9356985805480731ac6712a63f24f0/psycopg2_binary-2.9.11-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:c3cb3a676873d7506825221045bd70e0427c905b9c8ee8d6acd70cfcbd6e576d", size = 3347643, upload-time = "2025-10-10T11:13:53.499Z" }, + { url = "https://files.pythonhosted.org/packages/e1/36/9c0c326fe3a4227953dfb29f5d0c8ae3b8eb8c1cd2967aa569f50cb3c61f/psycopg2_binary-2.9.11-cp314-cp314-win_amd64.whl", hash = "sha256:4012c9c954dfaccd28f94e84ab9f94e12df76b4afb22331b1f0d3154893a6316", size = 2803913, upload-time = "2025-10-10T11:13:57.058Z" }, +] + [[package]] name = "ptyprocess" version = "0.7.0" @@ -2071,6 +2118,49 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl", hash = "sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0", size = 11842, upload-time = 
"2024-07-21T12:58:20.04Z" }, ] +[[package]] +name = "pyarrow" +version = "23.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/88/22/134986a4cc224d593c1afde5494d18ff629393d74cc2eddb176669f234a4/pyarrow-23.0.1.tar.gz", hash = "sha256:b8c5873e33440b2bc2f4a79d2b47017a89c5a24116c055625e6f2ee50523f019", size = 1167336, upload-time = "2026-02-16T10:14:12.39Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9a/4b/4166bb5abbfe6f750fc60ad337c43ecf61340fa52ab386da6e8dbf9e63c4/pyarrow-23.0.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:f4b0dbfa124c0bb161f8b5ebb40f1a680b70279aa0c9901d44a2b5a20806039f", size = 34214575, upload-time = "2026-02-16T10:09:56.225Z" }, + { url = "https://files.pythonhosted.org/packages/e1/da/3f941e3734ac8088ea588b53e860baeddac8323ea40ce22e3d0baa865cc9/pyarrow-23.0.1-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:7707d2b6673f7de054e2e83d59f9e805939038eebe1763fe811ee8fa5c0cd1a7", size = 35832540, upload-time = "2026-02-16T10:10:03.428Z" }, + { url = "https://files.pythonhosted.org/packages/88/7c/3d841c366620e906d54430817531b877ba646310296df42ef697308c2705/pyarrow-23.0.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:86ff03fb9f1a320266e0de855dee4b17da6794c595d207f89bba40d16b5c78b9", size = 44470940, upload-time = "2026-02-16T10:10:10.704Z" }, + { url = "https://files.pythonhosted.org/packages/2c/a5/da83046273d990f256cb79796a190bbf7ec999269705ddc609403f8c6b06/pyarrow-23.0.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:813d99f31275919c383aab17f0f455a04f5a429c261cc411b1e9a8f5e4aaaa05", size = 47586063, upload-time = "2026-02-16T10:10:17.95Z" }, + { url = "https://files.pythonhosted.org/packages/5b/3c/b7d2ebcff47a514f47f9da1e74b7949138c58cfeb108cdd4ee62f43f0cf3/pyarrow-23.0.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:bf5842f960cddd2ef757d486041d57c96483efc295a8c4a0e20e704cbbf39c67", size = 48173045, upload-time = 
"2026-02-16T10:10:25.363Z" }, + { url = "https://files.pythonhosted.org/packages/43/b2/b40961262213beaba6acfc88698eb773dfce32ecdf34d19291db94c2bd73/pyarrow-23.0.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:564baf97c858ecc03ec01a41062e8f4698abc3e6e2acd79c01c2e97880a19730", size = 50621741, upload-time = "2026-02-16T10:10:33.477Z" }, + { url = "https://files.pythonhosted.org/packages/f6/70/1fdda42d65b28b078e93d75d371b2185a61da89dda4def8ba6ba41ebdeb4/pyarrow-23.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:07deae7783782ac7250989a7b2ecde9b3c343a643f82e8a4df03d93b633006f0", size = 27620678, upload-time = "2026-02-16T10:10:39.31Z" }, + { url = "https://files.pythonhosted.org/packages/47/10/2cbe4c6f0fb83d2de37249567373d64327a5e4d8db72f486db42875b08f6/pyarrow-23.0.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:6b8fda694640b00e8af3c824f99f789e836720aa8c9379fb435d4c4953a756b8", size = 34210066, upload-time = "2026-02-16T10:10:45.487Z" }, + { url = "https://files.pythonhosted.org/packages/cb/4f/679fa7e84dadbaca7a65f7cdba8d6c83febbd93ca12fa4adf40ba3b6362b/pyarrow-23.0.1-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:8ff51b1addc469b9444b7c6f3548e19dc931b172ab234e995a60aea9f6e6025f", size = 35825526, upload-time = "2026-02-16T10:10:52.266Z" }, + { url = "https://files.pythonhosted.org/packages/f9/63/d2747d930882c9d661e9398eefc54f15696547b8983aaaf11d4a2e8b5426/pyarrow-23.0.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:71c5be5cbf1e1cb6169d2a0980850bccb558ddc9b747b6206435313c47c37677", size = 44473279, upload-time = "2026-02-16T10:11:01.557Z" }, + { url = "https://files.pythonhosted.org/packages/b3/93/10a48b5e238de6d562a411af6467e71e7aedbc9b87f8d3a35f1560ae30fb/pyarrow-23.0.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:9b6f4f17b43bc39d56fec96e53fe89d94bac3eb134137964371b45352d40d0c2", size = 47585798, upload-time = "2026-02-16T10:11:09.401Z" }, + { url = 
"https://files.pythonhosted.org/packages/5c/20/476943001c54ef078dbf9542280e22741219a184a0632862bca4feccd666/pyarrow-23.0.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:9fc13fc6c403d1337acab46a2c4346ca6c9dec5780c3c697cf8abfd5e19b6b37", size = 48179446, upload-time = "2026-02-16T10:11:17.781Z" }, + { url = "https://files.pythonhosted.org/packages/4b/b6/5dd0c47b335fcd8edba9bfab78ad961bd0fd55ebe53468cc393f45e0be60/pyarrow-23.0.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5c16ed4f53247fa3ffb12a14d236de4213a4415d127fe9cebed33d51671113e2", size = 50623972, upload-time = "2026-02-16T10:11:26.185Z" }, + { url = "https://files.pythonhosted.org/packages/d5/09/a532297c9591a727d67760e2e756b83905dd89adb365a7f6e9c72578bcc1/pyarrow-23.0.1-cp313-cp313-win_amd64.whl", hash = "sha256:cecfb12ef629cf6be0b1887f9f86463b0dd3dc3195ae6224e74006be4736035a", size = 27540749, upload-time = "2026-02-16T10:12:23.297Z" }, + { url = "https://files.pythonhosted.org/packages/a5/8e/38749c4b1303e6ae76b3c80618f84861ae0c55dd3c2273842ea6f8258233/pyarrow-23.0.1-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:29f7f7419a0e30264ea261fdc0e5fe63ce5a6095003db2945d7cd78df391a7e1", size = 34471544, upload-time = "2026-02-16T10:11:32.535Z" }, + { url = "https://files.pythonhosted.org/packages/a3/73/f237b2bc8c669212f842bcfd842b04fc8d936bfc9d471630569132dc920d/pyarrow-23.0.1-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:33d648dc25b51fd8055c19e4261e813dfc4d2427f068bcecc8b53d01b81b0500", size = 35949911, upload-time = "2026-02-16T10:11:39.813Z" }, + { url = "https://files.pythonhosted.org/packages/0c/86/b912195eee0903b5611bf596833def7d146ab2d301afeb4b722c57ffc966/pyarrow-23.0.1-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:cd395abf8f91c673dd3589cadc8cc1ee4e8674fa61b2e923c8dd215d9c7d1f41", size = 44520337, upload-time = "2026-02-16T10:11:47.764Z" }, + { url = 
"https://files.pythonhosted.org/packages/69/c2/f2a717fb824f62d0be952ea724b4f6f9372a17eed6f704b5c9526f12f2f1/pyarrow-23.0.1-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:00be9576d970c31defb5c32eb72ef585bf600ef6d0a82d5eccaae96639cf9d07", size = 47548944, upload-time = "2026-02-16T10:11:56.607Z" }, + { url = "https://files.pythonhosted.org/packages/84/a7/90007d476b9f0dc308e3bc57b832d004f848fd6c0da601375d20d92d1519/pyarrow-23.0.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:c2139549494445609f35a5cda4eb94e2c9e4d704ce60a095b342f82460c73a83", size = 48236269, upload-time = "2026-02-16T10:12:04.47Z" }, + { url = "https://files.pythonhosted.org/packages/b0/3f/b16fab3e77709856eb6ac328ce35f57a6d4a18462c7ca5186ef31b45e0e0/pyarrow-23.0.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:7044b442f184d84e2351e5084600f0d7343d6117aabcbc1ac78eb1ae11eb4125", size = 50604794, upload-time = "2026-02-16T10:12:11.797Z" }, + { url = "https://files.pythonhosted.org/packages/e9/a1/22df0620a9fac31d68397a75465c344e83c3dfe521f7612aea33e27ab6c0/pyarrow-23.0.1-cp313-cp313t-win_amd64.whl", hash = "sha256:a35581e856a2fafa12f3f54fce4331862b1cfb0bef5758347a858a4aa9d6bae8", size = 27660642, upload-time = "2026-02-16T10:12:17.746Z" }, + { url = "https://files.pythonhosted.org/packages/8d/1b/6da9a89583ce7b23ac611f183ae4843cd3a6cf54f079549b0e8c14031e73/pyarrow-23.0.1-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:5df1161da23636a70838099d4aaa65142777185cc0cdba4037a18cee7d8db9ca", size = 34238755, upload-time = "2026-02-16T10:12:32.819Z" }, + { url = "https://files.pythonhosted.org/packages/ae/b5/d58a241fbe324dbaeb8df07be6af8752c846192d78d2272e551098f74e88/pyarrow-23.0.1-cp314-cp314-macosx_12_0_x86_64.whl", hash = "sha256:fa8e51cb04b9f8c9c5ace6bab63af9a1f88d35c0d6cbf53e8c17c098552285e1", size = 35847826, upload-time = "2026-02-16T10:12:38.949Z" }, + { url = 
"https://files.pythonhosted.org/packages/54/a5/8cbc83f04aba433ca7b331b38f39e000efd9f0c7ce47128670e737542996/pyarrow-23.0.1-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:0b95a3994f015be13c63148fef8832e8a23938128c185ee951c98908a696e0eb", size = 44536859, upload-time = "2026-02-16T10:12:45.467Z" }, + { url = "https://files.pythonhosted.org/packages/36/2e/c0f017c405fcdc252dbccafbe05e36b0d0eb1ea9a958f081e01c6972927f/pyarrow-23.0.1-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:4982d71350b1a6e5cfe1af742c53dfb759b11ce14141870d05d9e540d13bc5d1", size = 47614443, upload-time = "2026-02-16T10:12:55.525Z" }, + { url = "https://files.pythonhosted.org/packages/af/6b/2314a78057912f5627afa13ba43809d9d653e6630859618b0fd81a4e0759/pyarrow-23.0.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c250248f1fe266db627921c89b47b7c06fee0489ad95b04d50353537d74d6886", size = 48232991, upload-time = "2026-02-16T10:13:04.729Z" }, + { url = "https://files.pythonhosted.org/packages/40/f2/1bcb1d3be3460832ef3370d621142216e15a2c7c62602a4ea19ec240dd64/pyarrow-23.0.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:5f4763b83c11c16e5f4c15601ba6dfa849e20723b46aa2617cb4bffe8768479f", size = 50645077, upload-time = "2026-02-16T10:13:14.147Z" }, + { url = "https://files.pythonhosted.org/packages/eb/3f/b1da7b61cd66566a4d4c8383d376c606d1c34a906c3f1cb35c479f59d1aa/pyarrow-23.0.1-cp314-cp314-win_amd64.whl", hash = "sha256:3a4c85ef66c134161987c17b147d6bffdca4566f9a4c1d81a0a01cdf08414ea5", size = 28234271, upload-time = "2026-02-16T10:14:09.397Z" }, + { url = "https://files.pythonhosted.org/packages/b5/78/07f67434e910a0f7323269be7bfbf58699bd0c1d080b18a1ab49ba943fe8/pyarrow-23.0.1-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:17cd28e906c18af486a499422740298c52d7c6795344ea5002a7720b4eadf16d", size = 34488692, upload-time = "2026-02-16T10:13:21.541Z" }, + { url = 
"https://files.pythonhosted.org/packages/50/76/34cf7ae93ece1f740a04910d9f7e80ba166b9b4ab9596a953e9e62b90fe1/pyarrow-23.0.1-cp314-cp314t-macosx_12_0_x86_64.whl", hash = "sha256:76e823d0e86b4fb5e1cf4a58d293036e678b5a4b03539be933d3b31f9406859f", size = 35964383, upload-time = "2026-02-16T10:13:28.63Z" }, + { url = "https://files.pythonhosted.org/packages/46/90/459b827238936d4244214be7c684e1b366a63f8c78c380807ae25ed92199/pyarrow-23.0.1-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:a62e1899e3078bf65943078b3ad2a6ddcacf2373bc06379aac61b1e548a75814", size = 44538119, upload-time = "2026-02-16T10:13:35.506Z" }, + { url = "https://files.pythonhosted.org/packages/28/a1/93a71ae5881e99d1f9de1d4554a87be37da11cd6b152239fb5bd924fdc64/pyarrow-23.0.1-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:df088e8f640c9fae3b1f495b3c64755c4e719091caf250f3a74d095ddf3c836d", size = 47571199, upload-time = "2026-02-16T10:13:42.504Z" }, + { url = "https://files.pythonhosted.org/packages/88/a3/d2c462d4ef313521eaf2eff04d204ac60775263f1fb08c374b543f79f610/pyarrow-23.0.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:46718a220d64677c93bc243af1d44b55998255427588e400677d7192671845c7", size = 48259435, upload-time = "2026-02-16T10:13:49.226Z" }, + { url = "https://files.pythonhosted.org/packages/cc/f1/11a544b8c3d38a759eb3fbb022039117fd633e9a7b19e4841cc3da091915/pyarrow-23.0.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:a09f3876e87f48bc2f13583ab551f0379e5dfb83210391e68ace404181a20690", size = 50629149, upload-time = "2026-02-16T10:13:57.238Z" }, + { url = "https://files.pythonhosted.org/packages/50/f2/c0e76a0b451ffdf0cf788932e182758eb7558953f4f27f1aff8e2518b653/pyarrow-23.0.1-cp314-cp314t-win_amd64.whl", hash = "sha256:527e8d899f14bd15b740cd5a54ad56b7f98044955373a17179d5956ddb93d9ce", size = 28365807, upload-time = "2026-02-16T10:14:03.892Z" }, +] + [[package]] name = "pycparser" version = "2.23" From d38fac8713769ce54a4bf2fa6636d9312e4e0a8e Mon Sep 17 00:00:00 2001 
From: alexpetit Date: Sun, 22 Mar 2026 17:26:27 +0100 Subject: [PATCH 5/5] Add README --- pipelines/README.md | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/pipelines/README.md b/pipelines/README.md index a568cb6..e0fbc58 100644 --- a/pipelines/README.md +++ b/pipelines/README.md @@ -5,7 +5,30 @@ Les pipelines utilisent le dossier `data/` (non versionné) comme espace de trav ## Flux quotidien (API → ML → Label Studio) -1. **Récupération quotidienne** depuis l'API (à venir) ou CSV local. +1. **Ingestion & préparation des données** depuis l'API. + 1. **Objectifs** + - Récupérer les données depuis l’API Biolit + - Standardiser et nettoyer les données + - Les stocker dans une base PostgreSQL + - Mettre les données à disposition des autres systèmes (ML, dataviz) + 2. **Étapes du pipeline** + 1. Ingestion + - appel à l’API Biolit + - récupération des observations + + 2. Transformation + - normalisation des noms de colonnes + - typage des champs + - nettoyage des données (dates, coordonnées, identifiants) + + 3. Chargement + - insertion dans PostgreSQL + - gestion des doublons via la clause ON CONFLICT DO NOTHING (les doublons sont ignorés, pas mis à jour) + 3. **Variables d'environnement** + - POSTGRES_URL=postgresql://user:password@host:port/dbname + - BIOLIT_API_URL=https://biolit.fr/wp-json/biolit/v1/observations?token=XXX + 4. **Lancer le pipeline** + - uv run python -m pipelines.run 2. **Qualité** : si l'image est mauvaise → stop. 3. **YOLOv8** : détection + crop. - si aucune détection → **Label Studio (CROP)**