Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions CITATION.cff
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
cff-version: 1.2.0
title: "CIRED.digital: conversational assistant for CIRED publications"
message: "If you use this software or dataset, please cite it as below."
type: software
authors:
- family-names: Ha-Duong
given-names: Minh
email: minh.ha-duong@cnrs.fr
orcid: "https://orcid.org/0000-0003-0940-1357"
affiliation: "CIRED, CNRS"
repository-code: "https://github.com/CIRED/cired.digital"
url: "https://cired.digital"
license: CECILL-2.1
date-released: "2025-10-09"
keywords:
- RAG
- scientific-publications
- chatbot
- open-science
- HAL
23 changes: 16 additions & 7 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,10 @@ REFERENCE_FIGURES := \
session_event_type_transitions.png \
session_event_type_transitions_simplified.png

.PHONY: figures csv stats analysis clean-figures clean-csv clean-stats test
ANON_DIR := $(PROJECT_ROOT)/reports/monitor-logs-anon
ANON_ZIP := $(PROJECT_ROOT)/reports/monitor-logs-anon_$(shell date -u +%Y%m%d).zip

.PHONY: figures csv stats analysis dataset clean-figures clean-csv clean-stats test

figures: $(FIGURES)

Expand All @@ -62,22 +65,22 @@ $(REPORT_DIR):
mkdir -p $@

$(REPORT_DIR)/viz1_session_activity_timeline.png: $(SRC_ANALYSIS)/fig_activity.py $(SRC_ANALYSIS)/logloader.py | $(REPORT_DIR)
MPLBACKEND=$(MPLBACKEND) $(RUNPY) $(SRC_ANALYSIS)/fig_activity.py
LOGS_ROOT=$(LOGS_ROOT) MPLBACKEND=$(MPLBACKEND) $(RUNPY) $(SRC_ANALYSIS)/fig_activity.py
@test -f $@

$(REPORT_DIR)/visitors_origin.png: $(SRC_ANALYSIS)/fig_provenance.py $(SRC_ANALYSIS)/logloader.py | $(REPORT_DIR)
MPLBACKEND=$(MPLBACKEND) $(RUNPY) $(SRC_ANALYSIS)/fig_provenance.py
LOGS_ROOT=$(LOGS_ROOT) MPLBACKEND=$(MPLBACKEND) $(RUNPY) $(SRC_ANALYSIS)/fig_provenance.py
@test -f $@

$(REPORT_DIR)/session_event_type_transitions.png: $(SRC_ANALYSIS)/fig_sessions.py $(SRC_ANALYSIS)/logloader.py | $(REPORT_DIR)
MPLBACKEND=$(MPLBACKEND) $(RUNPY) $(SRC_ANALYSIS)/fig_sessions.py
LOGS_ROOT=$(LOGS_ROOT) MPLBACKEND=$(MPLBACKEND) $(RUNPY) $(SRC_ANALYSIS)/fig_sessions.py
@test -f $@

$(REPORT_DIR)/session_event_type_transitions_simplified.png: $(REPORT_DIR)/session_event_type_transitions.png
@test -f $@

$(REPORT_DIR)/Queries.csv: $(SRC_ANALYSIS)/tabulate_queries.py $(SRC_ANALYSIS)/logloader.py | $(REPORT_DIR)
$(RUNPY) $(SRC_ANALYSIS)/tabulate_queries.py
LOGS_ROOT=$(LOGS_ROOT) $(RUNPY) $(SRC_ANALYSIS)/tabulate_queries.py
@test -f $@

$(REPORT_DIR)/UniqueQueries.csv: $(REPORT_DIR)/Queries.csv
Expand All @@ -98,10 +101,16 @@ $(REPORT_DIR)/article_analysis_monthly.csv: $(REPORT_DIR)/article_analysis.csv
@test -f $@

$(REPORT_DIR)/describe_events_summary.txt: $(SRC_ANALYSIS)/describe_events.py $(SRC_ANALYSIS)/logloader.py | $(REPORT_DIR)
$(RUNPY) $(SRC_ANALYSIS)/describe_events.py > $@
LOGS_ROOT=$(LOGS_ROOT) $(RUNPY) $(SRC_ANALYSIS)/describe_events.py > $@

$(REPORT_DIR)/describe_sessions_summary.txt: $(SRC_ANALYSIS)/describe_sessions.py $(SRC_ANALYSIS)/logloader.py | $(REPORT_DIR)
$(RUNPY) $(SRC_ANALYSIS)/describe_sessions.py > $@
LOGS_ROOT=$(LOGS_ROOT) $(RUNPY) $(SRC_ANALYSIS)/describe_sessions.py > $@

# Build the anonymized monitor-logs dataset for publication.
# Removes any previous anonymized tree, re-runs the anonymizer over
# LOGS_ROOT, and produces a dated zip archive (--zip makes the script
# create it).
# NOTE(review): ANON_ZIP is defined at the top of this Makefile but the
# archive name is actually produced by the script; the @echo below globs
# for it instead of using ANON_ZIP — confirm the two stay in sync.
dataset:
	rm -rf $(ANON_DIR)
	$(RUNPY) $(SRC_ANALYSIS)/anonymize_monitor_logs.py \
		--input $(LOGS_ROOT) --output $(ANON_DIR) --zip
	@echo "Dataset archive: $$(ls $(PROJECT_ROOT)/reports/monitor-logs-anon_*.zip)"

test: analysis
@fail=0; \
Expand Down
3 changes: 0 additions & 3 deletions reports/monitor-logs-anon_20251218.zip

This file was deleted.

3 changes: 3 additions & 0 deletions reports/monitor-logs-anon_20260210.zip
Git LFS file not shown
152 changes: 140 additions & 12 deletions src/analysis/anonymize_monitor_logs.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,11 @@

What it does:
- Recursively walks an input monitor-logs tree (default: reports/monitor-logs/)
- Drops PII and identifiers: removes server_context, sessionId, userAgent, headers, profile
- Replaces PII with anonymized derived fields:
- sessionId → deterministic anonymous IDs (anon_0001, anon_0002, …)
- server_context → {origin: <geographic classification>} (IP removed)
- payload.userAgent → device class label on sessionStart events
- Drops payload.headers, payload.profile
- Skips userProfile events entirely
- Preserves analysis value: keeps eventType, timestamp, payload.query/response and other non-PII fields
- Writes sanitized JSON files to an output tree mirroring YYYY/MM/DD
Expand All @@ -19,7 +23,7 @@
--zip

Notes:
- This tool is intentionally dependency-light (stdlib only).
- Requires classifier.py (sibling module) for IP/UA classification.
- Timestamps are preserved as-is when present; otherwise, fallback to filename or file mtime.
- Filenames in the anonymized dataset do not carry session identifiers.

Expand All @@ -39,6 +43,10 @@
from pathlib import Path
from typing import Any

# Ensure sibling modules are importable
sys.path.insert(0, str(Path(__file__).resolve().parent))
from classifier import classify_ip, classify_ua

# Matches original filenames like:
# session_<key>-20250708T131401123Z-request.json
FILENAME_RE = re.compile(
Expand Down Expand Up @@ -108,17 +116,37 @@ def is_user_profile_event(event_type: str) -> bool:
return event_type == "userProfile"


REDACT_PAYLOAD_KEYS = {"userAgent", "headers", "profile"}
REDACT_PAYLOAD_KEYS = {"headers", "profile"}


def redact_document(doc: dict[str, Any], anon_session_id: str) -> dict[str, Any]:
"""
Replace PII with anonymized derived fields.

def redact_document(doc: dict[str, Any]) -> dict[str, Any]:
"""Remove PII from document: server_context, sessionId, and sensitive payload keys."""
# Copy shallowly to avoid mutating input
- sessionId → deterministic anonymous ID
- server_context → {"origin": <classified>} (IP removed)
- payload.userAgent → classified label on sessionStart events
- payload.headers, payload.profile → dropped
"""
red: dict[str, Any] = {
k: v for k, v in doc.items() if k not in {"server_context", "sessionId"}
}
# Anonymized session identifier
red["sessionId"] = anon_session_id

# Replace server_context with origin classification only
ctx = doc.get("server_context", {})
client_ip = ctx.get("client_ip", "") if isinstance(ctx, dict) else ""
red["server_context"] = {"origin": classify_ip(client_ip)}

# Process payload
payload = red.get("payload")
if isinstance(payload, dict):
# Replace userAgent with classification on sessionStart events
ua = payload.pop("userAgent", None)
if ua and doc.get("eventType") == "sessionStart":
payload["userAgent"] = classify_ua(ua)
# Drop remaining PII keys
for k in list(payload.keys()):
if k in REDACT_PAYLOAD_KEYS:
payload.pop(k, None)
Expand Down Expand Up @@ -177,7 +205,7 @@ def write_metadata(
"created_at": finished_at,
"started_at": started_at,
"version": 1,
"notes": "Anonymized dataset with PII removed: server_context, sessionId, userAgent, headers, profile. userProfile events omitted.",
"notes": "Anonymized dataset: sessionId replaced with anon IDs, server_context replaced with origin classification, userAgent replaced with device class. headers/profile dropped. userProfile events omitted.",
"stats": {
"total_files_seen": stats.total_files,
"processed_files": stats.processed,
Expand All @@ -186,8 +214,18 @@ def write_metadata(
"wrote_files": stats.wrote_files,
},
"schema": {
"root_keys": ["timestamp", "eventType", "payload"],
"dropped_root_keys": ["server_context", "sessionId"],
"root_keys": [
"timestamp",
"eventType",
"sessionId",
"server_context",
"payload",
],
"anonymized_fields": {
"sessionId": "replaced with anon_NNNN",
"server_context": "replaced with {origin: <classification>}",
"payload.userAgent": "replaced with device class on sessionStart",
},
"dropped_payload_keys": sorted(REDACT_PAYLOAD_KEYS),
},
"license": {
Expand Down Expand Up @@ -219,16 +257,97 @@ def write_schema_doc(out_root: Path) -> None:
"Each JSON file contains:\n",
"- `timestamp` (string, ISO 8601, Z)\n",
"- `eventType` (string)\n",
"- `payload` (object; userAgent/headers/profile omitted if originally present)\n",
"- `sessionId` (string, anonymized: `anon_NNNN`)\n",
"- `server_context` (object: `{origin: <classification>}`, IP removed)\n",
"- `payload` (object; headers/profile dropped,"
" userAgent replaced with device class on sessionStart)\n",
"\n",
"Removed fields: `server_context`, `sessionId`.\n",
"Omitted events: `userProfile`.\n",
"\n",
"Filenames: `<timestamp>-<eventType>-<seq>.json` grouped under `YYYY/MM/DD/`.\n",
]
(out_root / "SCHEMA.md").write_text("".join(lines), encoding="utf-8")


def write_readme(out_root: Path, stats: RedactionStats) -> None:
    """Write README.md describing the dataset for standalone distribution (e.g. Zenodo).

    Embeds live counts from ``stats`` (files written, files seen,
    userProfile events omitted, parse errors) into an otherwise static
    Markdown template, then writes it to ``out_root / "README.md"``.
    """
    # NOTE(review): the coverage period ("June 23 – October 9, 2025") and the
    # citation DOI ("FIXME") are hard-coded in the template below — confirm
    # both are updated whenever the dataset is regenerated or published.
    text = f"""\
# CIRED.digital user interaction logs (anonymized)

Anonymized event logs from [CIRED.digital](https://cired.digital), a conversational
assistant for exploring the scientific publications of
[CIRED](https://www.centre-cired.fr) hosted on [HAL](https://hal.science).

## Coverage

- **Period**: June 23 – October 9, 2025
- **Events**: {stats.wrote_files} anonymized JSON files
- **Source files processed**: {stats.total_files} (of which {stats.skipped_user_profile} userProfile events omitted,
  {stats.parse_errors} parse errors)

## Structure

```
README.md            This file
METADATA.json        Dataset version, stats, schema, license
SCHEMA.md            JSON field documentation
LICENSE-DATASET.txt  CC BY 4.0 terms
CHECKSUMS.sha256     SHA-256 integrity checksums
YYYY/MM/DD/          Event files grouped by date
  <timestamp>-<eventType>-<seq>.json
```

### Event types

| eventType          | Description                               |
|-------------------|------------------------------------------|
| sessionStart       | User opens the application               |
| request            | User submits a query                     |
| response           | System returns RAG results               |
| article            | System generates a synthesized article   |
| feedback           | User gives thumbs up/down                |
| btnClick           | UI button interaction                    |
| visibilityChange   | Browser tab shown/hidden                 |

### JSON schema

Each file contains:
- `timestamp` (string, ISO 8601 UTC)
- `eventType` (string)
- `sessionId` (string, anonymized: `anon_NNNN`)
- `server_context` (object: `{{"origin": "<classification>"}}`, IP address removed)
- `payload` (object, content varies by event type)

## Anonymization

The following transformations protect user privacy while preserving analytical value:
- `sessionId` — replaced with deterministic anonymous IDs (`anon_0001`, …)
- `server_context` — IP address replaced with geographic/org classification only
- `payload.userAgent` — replaced with device class (e.g. "Desktop") on sessionStart events
- `payload.headers`, `payload.profile` — dropped entirely
- All `userProfile` events — dropped entirely

Query text and response content are preserved for research purposes.

## License

This dataset is released under the
[Creative Commons Attribution 4.0 International (CC BY 4.0)](https://creativecommons.org/licenses/by/4.0/)
license.

## Citation

> Ha-Duong, M. (2025). CIRED.digital user interaction logs (anonymized) [Dataset].
> https://doi.org/FIXME

## Contact

Minh Ha-Duong <minh.ha-duong@cnrs.fr>
CIRED — Centre international de recherche sur l'environnement et le développement
"""
    (out_root / "README.md").write_text(text, encoding="utf-8")


def generate_checksums(out_root: Path) -> None:
"""Generate CHECKSUMS.sha256 file for all JSON outputs."""
entries: list[tuple[str, str]] = []
Expand Down Expand Up @@ -256,6 +375,8 @@ def anonymize_tree(
"""Walk input tree, redact and output anonymized events; return stats."""
stats = RedactionStats()
seq_by_dir: dict[Path, int] = {}
session_id_map: dict[str, str] = {}
session_counter = 0

for src in iter_json_files(in_root):
stats.total_files += 1
Expand Down Expand Up @@ -286,7 +407,13 @@ def anonymize_tree(
.replace("+00:00", "Z")
)

red = redact_document(doc)
# Assign a stable anonymous session ID
orig_sid = doc.get("sessionId", key_f or "unknown")
if orig_sid not in session_id_map:
session_counter += 1
session_id_map[orig_sid] = f"anon_{session_counter:04d}"

red = redact_document(doc, session_id_map[orig_sid])
red["eventType"] = event_type # ensure consistent typing
red["timestamp"] = ts # ensure normalized

Expand Down Expand Up @@ -358,6 +485,7 @@ def main() -> None:

write_schema_doc(out_root)
write_license(out_root)
write_readme(out_root, stats)
write_metadata(
out_root,
stats,
Expand Down
19 changes: 14 additions & 5 deletions src/analysis/logloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,12 @@
import pandas as pd
from classifier import classify_ip, classify_ua

DEFAULT_BASE_PATH = Path(__file__).resolve().parents[2] / "reports" / "monitor-logs"
DEFAULT_BASE_PATH = Path(
os.environ.get(
"LOGS_ROOT",
str(Path(__file__).resolve().parents[2] / "reports" / "monitor-logs"),
)
)
DEFAULT_MIN_DATE = "20250705"


Expand Down Expand Up @@ -159,18 +164,22 @@ def augment_dataframe(events_df: pd.DataFrame) -> None:
event_types: list[str] = events_df["eventType"].tolist()
payloads: list[dict[str, Any]] = events_df["payload"].tolist()

# The client IP from the server_context
# The client IP from the server_context (empty for anonymized data)
ip_values: list[str] = [ctx.get("client_ip", "") for ctx in server_contexts]
events_df["ip"] = ip_values
# The origin label classified from the IP
events_df["origin"] = [classify_ip(ip) for ip in ip_values]
# The origin label: use pre-computed if available, otherwise classify from IP
events_df["origin"] = [
ctx.get("origin") or classify_ip(ip)
for ctx, ip in zip(server_contexts, ip_values)
]
# The user agent from payload for "sessionStart" event types
ua_values: list[str | None] = [
payload.get("userAgent") if etype == "sessionStart" else None
for etype, payload in zip(event_types, payloads)
]
events_df["ua"] = ua_values
# The classified user agent label
# The classified user agent label: if ua is already a class label (anonymized),
# classify_ua will return it as-is or map it; for raw ua strings it classifies.
events_df["ua_class"] = [
classify_ua(ua) if ua is not None else "??" for ua in ua_values
]
Expand Down
2 changes: 1 addition & 1 deletion src/analysis/tabulate_articles.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ def analyze_articles(logs_root: Path) -> pd.DataFrame:
records = []

# Find all article files
article_files = list(logs_root.rglob("*-article.json"))
article_files = list(logs_root.rglob("*-article*.json"))
print(f"Found {len(article_files)} article files")

for article_file in article_files:
Expand Down
4 changes: 2 additions & 2 deletions src/analysis/tabulate_tokens.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ def find_corresponding_request(response_file: Path, query_id: str) -> Path | Non
parent_dir = response_file.parent

# Try to find request file with the same query ID in the name or content
for request_file in parent_dir.glob("*-request.json"):
for request_file in parent_dir.glob("*-request*.json"):
try:
data = json.loads(request_file.read_text(encoding="utf-8"))
if data.get("payload", {}).get("queryId") == query_id:
Expand All @@ -147,7 +147,7 @@ def analyze_tokens(logs_root: Path) -> pd.DataFrame:
records = []

# Find all response files
response_files = list(logs_root.rglob("*-response.json"))
response_files = list(logs_root.rglob("*-response*.json"))
print(f"Found {len(response_files)} response files")

for response_file in response_files:
Expand Down