diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 0000000..fb1dbd7 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,20 @@ +cff-version: 1.2.0 +title: "CIRED.digital: conversational assistant for CIRED publications" +message: "If you use this software or dataset, please cite it as below." +type: software +authors: + - family-names: Ha-Duong + given-names: Minh + email: minh.ha-duong@cnrs.fr + orcid: "https://orcid.org/0000-0003-0940-1357" + affiliation: "CIRED, CNRS" +repository-code: "https://github.com/CIRED/cired.digital" +url: "https://cired.digital" +license: CECILL-2.1 +date-released: "2025-10-09" +keywords: + - RAG + - scientific-publications + - chatbot + - open-science + - HAL diff --git a/Makefile b/Makefile index 3119fb8..9fc70e7 100644 --- a/Makefile +++ b/Makefile @@ -48,7 +48,10 @@ REFERENCE_FIGURES := \ session_event_type_transitions.png \ session_event_type_transitions_simplified.png -.PHONY: figures csv stats analysis clean-figures clean-csv clean-stats test +ANON_DIR := $(PROJECT_ROOT)/reports/monitor-logs-anon +ANON_ZIP := $(PROJECT_ROOT)/reports/monitor-logs-anon_$(shell date -u +%Y%m%d).zip + +.PHONY: figures csv stats analysis dataset clean-figures clean-csv clean-stats test figures: $(FIGURES) @@ -62,22 +65,22 @@ $(REPORT_DIR): mkdir -p $@ $(REPORT_DIR)/viz1_session_activity_timeline.png: $(SRC_ANALYSIS)/fig_activity.py $(SRC_ANALYSIS)/logloader.py | $(REPORT_DIR) - MPLBACKEND=$(MPLBACKEND) $(RUNPY) $(SRC_ANALYSIS)/fig_activity.py + LOGS_ROOT=$(LOGS_ROOT) MPLBACKEND=$(MPLBACKEND) $(RUNPY) $(SRC_ANALYSIS)/fig_activity.py @test -f $@ $(REPORT_DIR)/visitors_origin.png: $(SRC_ANALYSIS)/fig_provenance.py $(SRC_ANALYSIS)/logloader.py | $(REPORT_DIR) - MPLBACKEND=$(MPLBACKEND) $(RUNPY) $(SRC_ANALYSIS)/fig_provenance.py + LOGS_ROOT=$(LOGS_ROOT) MPLBACKEND=$(MPLBACKEND) $(RUNPY) $(SRC_ANALYSIS)/fig_provenance.py @test -f $@ $(REPORT_DIR)/session_event_type_transitions.png: $(SRC_ANALYSIS)/fig_sessions.py $(SRC_ANALYSIS)/logloader.py | $(REPORT_DIR) - MPLBACKEND=$(MPLBACKEND) $(RUNPY) $(SRC_ANALYSIS)/fig_sessions.py + LOGS_ROOT=$(LOGS_ROOT) MPLBACKEND=$(MPLBACKEND) $(RUNPY) $(SRC_ANALYSIS)/fig_sessions.py @test -f $@ $(REPORT_DIR)/session_event_type_transitions_simplified.png: $(REPORT_DIR)/session_event_type_transitions.png @test -f $@ $(REPORT_DIR)/Queries.csv: $(SRC_ANALYSIS)/tabulate_queries.py $(SRC_ANALYSIS)/logloader.py | $(REPORT_DIR) - $(RUNPY) $(SRC_ANALYSIS)/tabulate_queries.py + LOGS_ROOT=$(LOGS_ROOT) $(RUNPY) $(SRC_ANALYSIS)/tabulate_queries.py @test -f $@ $(REPORT_DIR)/UniqueQueries.csv: $(REPORT_DIR)/Queries.csv @@ -98,10 +101,16 @@ $(REPORT_DIR)/article_analysis_monthly.csv: $(REPORT_DIR)/article_analysis.csv @test -f $@ $(REPORT_DIR)/describe_events_summary.txt: $(SRC_ANALYSIS)/describe_events.py $(SRC_ANALYSIS)/logloader.py | $(REPORT_DIR) - $(RUNPY) $(SRC_ANALYSIS)/describe_events.py > $@ + LOGS_ROOT=$(LOGS_ROOT) $(RUNPY) $(SRC_ANALYSIS)/describe_events.py > $@ $(REPORT_DIR)/describe_sessions_summary.txt: $(SRC_ANALYSIS)/describe_sessions.py $(SRC_ANALYSIS)/logloader.py | $(REPORT_DIR) - $(RUNPY) $(SRC_ANALYSIS)/describe_sessions.py > $@ + LOGS_ROOT=$(LOGS_ROOT) $(RUNPY) $(SRC_ANALYSIS)/describe_sessions.py > $@ + +dataset: + rm -rf $(ANON_DIR) + $(RUNPY) $(SRC_ANALYSIS)/anonymize_monitor_logs.py \ + --input $(LOGS_ROOT) --output $(ANON_DIR) --zip + @echo "Dataset archive: $$(ls $(PROJECT_ROOT)/reports/monitor-logs-anon_*.zip)" test: analysis @fail=0; \ diff --git a/reports/monitor-logs-anon_20251218.zip b/reports/monitor-logs-anon_20251218.zip deleted file mode 100644 index afdacc4..0000000 --- a/reports/monitor-logs-anon_20251218.zip +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:60c993710372292f8419fb5f04262ec21f8582d0b65433a6b1cec1b1c1d8c8dd -size 6957835 diff --git a/reports/monitor-logs-anon_20260210.zip b/reports/monitor-logs-anon_20260210.zip new file mode 100644 index 0000000..481530b --- /dev/null +++ b/reports/monitor-logs-anon_20260210.zip @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e0ffd9e81386481fabfb6af177aee526266188ae51104cbcd28c321173bcb97 +size 7123508 diff --git a/src/analysis/anonymize_monitor_logs.py b/src/analysis/anonymize_monitor_logs.py index 7d6fba0..880f15d 100644 --- a/src/analysis/anonymize_monitor_logs.py +++ b/src/analysis/anonymize_monitor_logs.py @@ -4,7 +4,11 @@ What it does: - Recursively walks an input monitor-logs tree (default: reports/monitor-logs/) -- Drops PII and identifiers: removes server_context, sessionId, userAgent, headers, profile +- Replaces PII with anonymized derived fields: + - sessionId → deterministic anonymous IDs (anon_0001, anon_0002, …) + - server_context → {origin: } (IP removed) + - payload.userAgent → device class label on sessionStart events +- Drops payload.headers, payload.profile - Skips userProfile events entirely - Preserves analysis value: keeps eventType, timestamp, payload.query/response and other non-PII fields - Writes sanitized JSON files to an output tree mirroring YYYY/MM/DD @@ -19,7 +23,7 @@ --zip Notes: -- This tool is intentionally dependency-light (stdlib only). +- Requires classifier.py (sibling module) for IP/UA classification. - Timestamps are preserved as-is when present; otherwise, fallback to filename or file mtime. - Filenames in the anonymized dataset do not carry session identifiers. @@ -39,6 +43,10 @@ from pathlib import Path from typing import Any +# Ensure sibling modules are importable +sys.path.insert(0, str(Path(__file__).resolve().parent)) +from classifier import classify_ip, classify_ua + # Matches original filenames like: # session_-20250708T131401123Z-request.json FILENAME_RE = re.compile( @@ -108,17 +116,37 @@ def is_user_profile_event(event_type: str) -> bool: return event_type == "userProfile" -REDACT_PAYLOAD_KEYS = {"userAgent", "headers", "profile"} +REDACT_PAYLOAD_KEYS = {"headers", "profile"} + +def redact_document(doc: dict[str, Any], anon_session_id: str) -> dict[str, Any]: + """ + Replace PII with anonymized derived fields. -def redact_document(doc: dict[str, Any]) -> dict[str, Any]: - """Remove PII from document: server_context, sessionId, and sensitive payload keys.""" - # Copy shallowly to avoid mutating input + - sessionId → deterministic anonymous ID + - server_context → {"origin": } (IP removed) + - payload.userAgent → classified label on sessionStart events + - payload.headers, payload.profile → dropped + """ red: dict[str, Any] = { k: v for k, v in doc.items() if k not in {"server_context", "sessionId"} } + # Anonymized session identifier + red["sessionId"] = anon_session_id + + # Replace server_context with origin classification only + ctx = doc.get("server_context", {}) + client_ip = ctx.get("client_ip", "") if isinstance(ctx, dict) else "" + red["server_context"] = {"origin": classify_ip(client_ip)} + + # Process payload payload = red.get("payload") if isinstance(payload, dict): + # Replace userAgent with classification on sessionStart events + ua = payload.pop("userAgent", None) + if ua and doc.get("eventType") == "sessionStart": + payload["userAgent"] = classify_ua(ua) + # Drop remaining PII keys for k in list(payload.keys()): if k in REDACT_PAYLOAD_KEYS: payload.pop(k, None) @@ -177,7 +205,7 @@ def write_metadata( "created_at": finished_at, "started_at": started_at, "version": 1, - "notes": "Anonymized dataset with PII removed: server_context, sessionId, userAgent, headers, profile. userProfile events omitted.", + "notes": "Anonymized dataset: sessionId replaced with anon IDs, server_context replaced with origin classification, userAgent replaced with device class. headers/profile dropped. userProfile events omitted.", "stats": { "total_files_seen": stats.total_files, "processed_files": stats.processed, @@ -186,8 +214,18 @@ def write_metadata( "wrote_files": stats.wrote_files, }, "schema": { - "root_keys": ["timestamp", "eventType", "payload"], - "dropped_root_keys": ["server_context", "sessionId"], + "root_keys": [ + "timestamp", + "eventType", + "sessionId", + "server_context", + "payload", + ], + "anonymized_fields": { + "sessionId": "replaced with anon_NNNN", + "server_context": "replaced with {origin: }", + "payload.userAgent": "replaced with device class on sessionStart", + }, "dropped_payload_keys": sorted(REDACT_PAYLOAD_KEYS), }, "license": { @@ -219,9 +257,11 @@ def write_schema_doc(out_root: Path) -> None: "Each JSON file contains:\n", "- `timestamp` (string, ISO 8601, Z)\n", "- `eventType` (string)\n", - "- `payload` (object; userAgent/headers/profile omitted if originally present)\n", + "- `sessionId` (string, anonymized: `anon_NNNN`)\n", + "- `server_context` (object: `{origin: }`, IP removed)\n", + "- `payload` (object; headers/profile dropped," + " userAgent replaced with device class on sessionStart)\n", "\n", - "Removed fields: `server_context`, `sessionId`.\n", "Omitted events: `userProfile`.\n", "\n", "Filenames: `--.json` grouped under `YYYY/MM/DD/`.\n", @@ -229,6 +269,85 @@ def write_schema_doc(out_root: Path) -> None: (out_root / "SCHEMA.md").write_text("".join(lines), encoding="utf-8") +def write_readme(out_root: Path, stats: RedactionStats) -> None: + """Write README.md describing the dataset for standalone distribution (e.g. Zenodo).""" + text = f"""\ +# CIRED.digital user interaction logs (anonymized) + +Anonymized event logs from [CIRED.digital](https://cired.digital), a conversational +assistant for exploring the scientific publications of +[CIRED](https://www.centre-cired.fr) hosted on [HAL](https://hal.science). + +## Coverage + +- **Period**: June 23 – October 9, 2025 +- **Events**: {stats.wrote_files} anonymized JSON files +- **Source files processed**: {stats.total_files} (of which {stats.skipped_user_profile} userProfile events omitted, + {stats.parse_errors} parse errors) + +## Structure + +``` +README.md This file +METADATA.json Dataset version, stats, schema, license +SCHEMA.md JSON field documentation +LICENSE-DATASET.txt CC BY 4.0 terms +CHECKSUMS.sha256 SHA-256 integrity checksums +YYYY/MM/DD/ Event files grouped by date + --.json +``` + +### Event types + +| eventType | Description | +|-------------------|------------------------------------------| +| sessionStart | User opens the application | +| request | User submits a query | +| response | System returns RAG results | +| article | System generates a synthesized article | +| feedback | User gives thumbs up/down | +| btnClick | UI button interaction | +| visibilityChange | Browser tab shown/hidden | + +### JSON schema + +Each file contains: +- `timestamp` (string, ISO 8601 UTC) +- `eventType` (string) +- `sessionId` (string, anonymized: `anon_NNNN`) +- `server_context` (object: `{{"origin": ""}}`, IP address removed) +- `payload` (object, content varies by event type) + +## Anonymization + +The following transformations protect user privacy while preserving analytical value: +- `sessionId` — replaced with deterministic anonymous IDs (`anon_0001`, …) +- `server_context` — IP address replaced with geographic/org classification only +- `payload.userAgent` — replaced with device class (e.g. "Desktop") on sessionStart events +- `payload.headers`, `payload.profile` — dropped entirely +- All `userProfile` events — dropped entirely + +Query text and response content are preserved for research purposes. + +## License + +This dataset is released under the +[Creative Commons Attribution 4.0 International (CC BY 4.0)](https://creativecommons.org/licenses/by/4.0/) +license. + +## Citation + +> Ha-Duong, M. (2025). CIRED.digital user interaction logs (anonymized) [Dataset]. +> https://doi.org/FIXME + +## Contact + +Minh Ha-Duong +CIRED — Centre international de recherche sur l'environnement et le développement +""" + (out_root / "README.md").write_text(text, encoding="utf-8") + + def generate_checksums(out_root: Path) -> None: """Generate CHECKSUMS.sha256 file for all JSON outputs.""" entries: list[tuple[str, str]] = [] @@ -256,6 +375,8 @@ def anonymize_tree( """Walk input tree, redact and output anonymized events; return stats.""" stats = RedactionStats() seq_by_dir: dict[Path, int] = {} + session_id_map: dict[str, str] = {} + session_counter = 0 for src in iter_json_files(in_root): stats.total_files += 1 @@ -286,7 +407,13 @@ def anonymize_tree( .replace("+00:00", "Z") ) - red = redact_document(doc) + # Assign a stable anonymous session ID + orig_sid = doc.get("sessionId", key_f or "unknown") + if orig_sid not in session_id_map: + session_counter += 1 + session_id_map[orig_sid] = f"anon_{session_counter:04d}" + + red = redact_document(doc, session_id_map[orig_sid]) red["eventType"] = event_type # ensure consistent typing red["timestamp"] = ts # ensure normalized @@ -358,6 +485,7 @@ def main() -> None: write_schema_doc(out_root) write_license(out_root) + write_readme(out_root, stats) write_metadata( out_root, stats, diff --git a/src/analysis/logloader.py b/src/analysis/logloader.py index 4cbde75..bf3a8d0 100644 --- a/src/analysis/logloader.py +++ b/src/analysis/logloader.py @@ -18,7 +18,12 @@ import pandas as pd from classifier import classify_ip, classify_ua -DEFAULT_BASE_PATH = Path(__file__).resolve().parents[2] / "reports" / "monitor-logs" +DEFAULT_BASE_PATH = Path( + os.environ.get( + "LOGS_ROOT", + str(Path(__file__).resolve().parents[2] / "reports" / "monitor-logs"), + ) +) DEFAULT_MIN_DATE = "20250705" @@ -159,18 +164,22 @@ def augment_dataframe(events_df: pd.DataFrame) -> None: event_types: list[str] = events_df["eventType"].tolist() payloads: list[dict[str, Any]] = events_df["payload"].tolist() - # The client IP from the server_context + # The client IP from the server_context (empty for anonymized data) ip_values: list[str] = [ctx.get("client_ip", "") for ctx in server_contexts] events_df["ip"] = ip_values - # The origin label classified from the IP - events_df["origin"] = [classify_ip(ip) for ip in ip_values] + # The origin label: use pre-computed if available, otherwise classify from IP + events_df["origin"] = [ + ctx.get("origin") or classify_ip(ip) + for ctx, ip in zip(server_contexts, ip_values) + ] # The user agent from payload for "sessionStart" event types ua_values: list[str | None] = [ payload.get("userAgent") if etype == "sessionStart" else None for etype, payload in zip(event_types, payloads) ] events_df["ua"] = ua_values - # The classified user agent label + # The classified user agent label: if ua is already a class label (anonymized), + # classify_ua will return it as-is or map it; for raw ua strings it classifies. events_df["ua_class"] = [ classify_ua(ua) if ua is not None else "??" for ua in ua_values ] diff --git a/src/analysis/tabulate_articles.py b/src/analysis/tabulate_articles.py index 82dddbb..66d6161 100644 --- a/src/analysis/tabulate_articles.py +++ b/src/analysis/tabulate_articles.py @@ -209,7 +209,7 @@ def analyze_articles(logs_root: Path) -> pd.DataFrame: records = [] # Find all article files - article_files = list(logs_root.rglob("*-article.json")) + article_files = list(logs_root.rglob("*-article*.json")) print(f"Found {len(article_files)} article files") for article_file in article_files: diff --git a/src/analysis/tabulate_tokens.py b/src/analysis/tabulate_tokens.py index 2e1dd09..ea3cccb 100644 --- a/src/analysis/tabulate_tokens.py +++ b/src/analysis/tabulate_tokens.py @@ -122,7 +122,7 @@ def find_corresponding_request(response_file: Path, query_id: str) -> Path | Non parent_dir = response_file.parent # Try to find request file with the same query ID in the name or content - for request_file in parent_dir.glob("*-request.json"): + for request_file in parent_dir.glob("*-request*.json"): try: data = json.loads(request_file.read_text(encoding="utf-8")) if data.get("payload", {}).get("queryId") == query_id: @@ -147,7 +147,7 @@ def analyze_tokens(logs_root: Path) -> pd.DataFrame: records = [] # Find all response files - response_files = list(logs_root.rglob("*-response.json")) + response_files = list(logs_root.rglob("*-response*.json")) print(f"Found {len(response_files)} response files") for response_file in response_files: