Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions CITATION.cff
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
cff-version: 1.2.0
title: "CIRED.digital: conversational assistant for CIRED publications"
message: "If you use this software or dataset, please cite it as below."
type: software
authors:
- family-names: Ha-Duong
given-names: Minh
email: minh.ha-duong@cnrs.fr
orcid: "https://orcid.org/0000-0003-0940-1357"
affiliation: "CIRED, CNRS"
repository-code: "https://github.com/CIRED/cired.digital"
url: "https://cired.digital"
license: CECILL-2.1
date-released: "2025-10-09"
keywords:
- RAG
- scientific-publications
- chatbot
- open-science
- HAL
23 changes: 16 additions & 7 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,10 @@ REFERENCE_FIGURES := \
session_event_type_transitions.png \
session_event_type_transitions_simplified.png

.PHONY: figures csv stats analysis clean-figures clean-csv clean-stats test
ANON_DIR := $(PROJECT_ROOT)/reports/monitor-logs-anon
ANON_ZIP := $(PROJECT_ROOT)/reports/monitor-logs-anon_$(shell date -u +%Y%m%d).zip

.PHONY: figures csv stats analysis dataset clean-figures clean-csv clean-stats test

figures: $(FIGURES)

Expand All @@ -62,22 +65,22 @@ $(REPORT_DIR):
mkdir -p $@

$(REPORT_DIR)/viz1_session_activity_timeline.png: $(SRC_ANALYSIS)/fig_activity.py $(SRC_ANALYSIS)/logloader.py | $(REPORT_DIR)
MPLBACKEND=$(MPLBACKEND) $(RUNPY) $(SRC_ANALYSIS)/fig_activity.py
LOGS_ROOT=$(LOGS_ROOT) MPLBACKEND=$(MPLBACKEND) $(RUNPY) $(SRC_ANALYSIS)/fig_activity.py
@test -f $@

$(REPORT_DIR)/visitors_origin.png: $(SRC_ANALYSIS)/fig_provenance.py $(SRC_ANALYSIS)/logloader.py | $(REPORT_DIR)
MPLBACKEND=$(MPLBACKEND) $(RUNPY) $(SRC_ANALYSIS)/fig_provenance.py
LOGS_ROOT=$(LOGS_ROOT) MPLBACKEND=$(MPLBACKEND) $(RUNPY) $(SRC_ANALYSIS)/fig_provenance.py
@test -f $@

$(REPORT_DIR)/session_event_type_transitions.png: $(SRC_ANALYSIS)/fig_sessions.py $(SRC_ANALYSIS)/logloader.py | $(REPORT_DIR)
MPLBACKEND=$(MPLBACKEND) $(RUNPY) $(SRC_ANALYSIS)/fig_sessions.py
LOGS_ROOT=$(LOGS_ROOT) MPLBACKEND=$(MPLBACKEND) $(RUNPY) $(SRC_ANALYSIS)/fig_sessions.py
@test -f $@

$(REPORT_DIR)/session_event_type_transitions_simplified.png: $(REPORT_DIR)/session_event_type_transitions.png
@test -f $@

$(REPORT_DIR)/Queries.csv: $(SRC_ANALYSIS)/tabulate_queries.py $(SRC_ANALYSIS)/logloader.py | $(REPORT_DIR)
$(RUNPY) $(SRC_ANALYSIS)/tabulate_queries.py
LOGS_ROOT=$(LOGS_ROOT) $(RUNPY) $(SRC_ANALYSIS)/tabulate_queries.py
@test -f $@

$(REPORT_DIR)/UniqueQueries.csv: $(REPORT_DIR)/Queries.csv
Expand All @@ -98,10 +101,16 @@ $(REPORT_DIR)/article_analysis_monthly.csv: $(REPORT_DIR)/article_analysis.csv
@test -f $@

$(REPORT_DIR)/describe_events_summary.txt: $(SRC_ANALYSIS)/describe_events.py $(SRC_ANALYSIS)/logloader.py | $(REPORT_DIR)
$(RUNPY) $(SRC_ANALYSIS)/describe_events.py > $@
LOGS_ROOT=$(LOGS_ROOT) $(RUNPY) $(SRC_ANALYSIS)/describe_events.py > $@

$(REPORT_DIR)/describe_sessions_summary.txt: $(SRC_ANALYSIS)/describe_sessions.py $(SRC_ANALYSIS)/logloader.py | $(REPORT_DIR)
$(RUNPY) $(SRC_ANALYSIS)/describe_sessions.py > $@
LOGS_ROOT=$(LOGS_ROOT) $(RUNPY) $(SRC_ANALYSIS)/describe_sessions.py > $@

# Build the anonymized monitor-logs dataset for publication.
# Removes any previous anonymized tree, re-runs the anonymizer over
# LOGS_ROOT, and produces a dated zip archive (--zip makes the script
# create it).
# NOTE(review): ANON_ZIP is defined at the top of this Makefile but the
# archive name is actually produced by the script; the @echo below globs
# for it instead of using ANON_ZIP — confirm the two stay in sync.
dataset:
	rm -rf $(ANON_DIR)
	$(RUNPY) $(SRC_ANALYSIS)/anonymize_monitor_logs.py \
		--input $(LOGS_ROOT) --output $(ANON_DIR) --zip
	@echo "Dataset archive: $$(ls $(PROJECT_ROOT)/reports/monitor-logs-anon_*.zip)"

test: analysis
@fail=0; \
Expand Down
3 changes: 0 additions & 3 deletions reports/monitor-logs-anon_20251218.zip

This file was deleted.

3 changes: 3 additions & 0 deletions reports/monitor-logs-anon_20260210.zip
Git LFS file not shown
152 changes: 140 additions & 12 deletions src/analysis/anonymize_monitor_logs.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,11 @@

What it does:
- Recursively walks an input monitor-logs tree (default: reports/monitor-logs/)
- Drops PII and identifiers: removes server_context, sessionId, userAgent, headers, profile
- Replaces PII with anonymized derived fields:
- sessionId → deterministic anonymous IDs (anon_0001, anon_0002, …)
- server_context → {origin: <geographic classification>} (IP removed)
- payload.userAgent → device class label on sessionStart events
- Drops payload.headers, payload.profile
- Skips userProfile events entirely
- Preserves analysis value: keeps eventType, timestamp, payload.query/response and other non-PII fields
- Writes sanitized JSON files to an output tree mirroring YYYY/MM/DD
Expand All @@ -19,7 +23,7 @@
--zip

Notes:
- This tool is intentionally dependency-light (stdlib only).
- Requires classifier.py (sibling module) for IP/UA classification.
- Timestamps are preserved as-is when present; otherwise, fallback to filename or file mtime.
- Filenames in the anonymized dataset do not carry session identifiers.

Expand All @@ -39,6 +43,10 @@
from pathlib import Path
from typing import Any

# Ensure sibling modules are importable
sys.path.insert(0, str(Path(__file__).resolve().parent))
from classifier import classify_ip, classify_ua

# Matches original filenames like:
# session_<key>-20250708T131401123Z-request.json
FILENAME_RE = re.compile(
Expand Down Expand Up @@ -108,17 +116,37 @@ def is_user_profile_event(event_type: str) -> bool:
return event_type == "userProfile"


REDACT_PAYLOAD_KEYS = {"userAgent", "headers", "profile"}
REDACT_PAYLOAD_KEYS = {"headers", "profile"}


def redact_document(doc: dict[str, Any], anon_session_id: str) -> dict[str, Any]:
"""
Replace PII with anonymized derived fields.

def redact_document(doc: dict[str, Any]) -> dict[str, Any]:
"""Remove PII from document: server_context, sessionId, and sensitive payload keys."""
# Copy shallowly to avoid mutating input
- sessionId → deterministic anonymous ID
- server_context → {"origin": <classified>} (IP removed)
- payload.userAgent → classified label on sessionStart events
- payload.headers, payload.profile → dropped
"""
red: dict[str, Any] = {
k: v for k, v in doc.items() if k not in {"server_context", "sessionId"}
}
# Anonymized session identifier
red["sessionId"] = anon_session_id

# Replace server_context with origin classification only
ctx = doc.get("server_context", {})
client_ip = ctx.get("client_ip", "") if isinstance(ctx, dict) else ""
red["server_context"] = {"origin": classify_ip(client_ip)}

# Process payload
payload = red.get("payload")
if isinstance(payload, dict):
# Replace userAgent with classification on sessionStart events
ua = payload.pop("userAgent", None)
if ua and doc.get("eventType") == "sessionStart":
payload["userAgent"] = classify_ua(ua)
# Drop remaining PII keys
for k in list(payload.keys()):
if k in REDACT_PAYLOAD_KEYS:
payload.pop(k, None)
Expand Down Expand Up @@ -177,7 +205,7 @@ def write_metadata(
"created_at": finished_at,
"started_at": started_at,
"version": 1,
"notes": "Anonymized dataset with PII removed: server_context, sessionId, userAgent, headers, profile. userProfile events omitted.",
"notes": "Anonymized dataset: sessionId replaced with anon IDs, server_context replaced with origin classification, userAgent replaced with device class. headers/profile dropped. userProfile events omitted.",
"stats": {
"total_files_seen": stats.total_files,
"processed_files": stats.processed,
Expand All @@ -186,8 +214,18 @@ def write_metadata(
"wrote_files": stats.wrote_files,
},
"schema": {
"root_keys": ["timestamp", "eventType", "payload"],
"dropped_root_keys": ["server_context", "sessionId"],
"root_keys": [
"timestamp",
"eventType",
"sessionId",
"server_context",
"payload",
],
"anonymized_fields": {
"sessionId": "replaced with anon_NNNN",
"server_context": "replaced with {origin: <classification>}",
"payload.userAgent": "replaced with device class on sessionStart",
},
"dropped_payload_keys": sorted(REDACT_PAYLOAD_KEYS),
},
"license": {
Expand Down Expand Up @@ -219,16 +257,97 @@ def write_schema_doc(out_root: Path) -> None:
"Each JSON file contains:\n",
"- `timestamp` (string, ISO 8601, Z)\n",
"- `eventType` (string)\n",
"- `payload` (object; userAgent/headers/profile omitted if originally present)\n",
"- `sessionId` (string, anonymized: `anon_NNNN`)\n",
"- `server_context` (object: `{origin: <classification>}`, IP removed)\n",
"- `payload` (object; headers/profile dropped,"
" userAgent replaced with device class on sessionStart)\n",
"\n",
"Removed fields: `server_context`, `sessionId`.\n",
"Omitted events: `userProfile`.\n",
"\n",
"Filenames: `<timestamp>-<eventType>-<seq>.json` grouped under `YYYY/MM/DD/`.\n",
]
(out_root / "SCHEMA.md").write_text("".join(lines), encoding="utf-8")


def write_readme(out_root: Path, stats: RedactionStats) -> None:
    """Write README.md describing the dataset for standalone distribution (e.g. Zenodo).

    Embeds live counts from ``stats`` (files written, files seen,
    userProfile events omitted, parse errors) into an otherwise static
    Markdown template, then writes it to ``out_root / "README.md"``.
    """
    # NOTE(review): the coverage period ("June 23 – October 9, 2025") and the
    # citation DOI ("FIXME") are hard-coded in the template below — confirm
    # both are updated whenever the dataset is regenerated or published.
    text = f"""\
# CIRED.digital user interaction logs (anonymized)

Anonymized event logs from [CIRED.digital](https://cired.digital), a conversational
assistant for exploring the scientific publications of
[CIRED](https://www.centre-cired.fr) hosted on [HAL](https://hal.science).

## Coverage

- **Period**: June 23 – October 9, 2025
- **Events**: {stats.wrote_files} anonymized JSON files
- **Source files processed**: {stats.total_files} (of which {stats.skipped_user_profile} userProfile events omitted,
  {stats.parse_errors} parse errors)

## Structure

```
README.md            This file
METADATA.json        Dataset version, stats, schema, license
SCHEMA.md            JSON field documentation
LICENSE-DATASET.txt  CC BY 4.0 terms
CHECKSUMS.sha256     SHA-256 integrity checksums
YYYY/MM/DD/          Event files grouped by date
  <timestamp>-<eventType>-<seq>.json
```

### Event types

| eventType          | Description                               |
|-------------------|------------------------------------------|
| sessionStart       | User opens the application               |
| request            | User submits a query                     |
| response           | System returns RAG results               |
| article            | System generates a synthesized article   |
| feedback           | User gives thumbs up/down                |
| btnClick           | UI button interaction                    |
| visibilityChange   | Browser tab shown/hidden                 |

### JSON schema

Each file contains:
- `timestamp` (string, ISO 8601 UTC)
- `eventType` (string)
- `sessionId` (string, anonymized: `anon_NNNN`)
- `server_context` (object: `{{"origin": "<classification>"}}`, IP address removed)
- `payload` (object, content varies by event type)

## Anonymization

The following transformations protect user privacy while preserving analytical value:
- `sessionId` — replaced with deterministic anonymous IDs (`anon_0001`, …)
- `server_context` — IP address replaced with geographic/org classification only
- `payload.userAgent` — replaced with device class (e.g. "Desktop") on sessionStart events
- `payload.headers`, `payload.profile` — dropped entirely
- All `userProfile` events — dropped entirely

Query text and response content are preserved for research purposes.

## License

This dataset is released under the
[Creative Commons Attribution 4.0 International (CC BY 4.0)](https://creativecommons.org/licenses/by/4.0/)
license.

## Citation

> Ha-Duong, M. (2025). CIRED.digital user interaction logs (anonymized) [Dataset].
> https://doi.org/FIXME

## Contact

Minh Ha-Duong <minh.ha-duong@cnrs.fr>
CIRED — Centre international de recherche sur l'environnement et le développement
"""
    (out_root / "README.md").write_text(text, encoding="utf-8")


def generate_checksums(out_root: Path) -> None:
"""Generate CHECKSUMS.sha256 file for all JSON outputs."""
entries: list[tuple[str, str]] = []
Expand Down Expand Up @@ -256,6 +375,8 @@ def anonymize_tree(
"""Walk input tree, redact and output anonymized events; return stats."""
stats = RedactionStats()
seq_by_dir: dict[Path, int] = {}
session_id_map: dict[str, str] = {}
session_counter = 0

for src in iter_json_files(in_root):
stats.total_files += 1
Expand Down Expand Up @@ -286,7 +407,13 @@ def anonymize_tree(
.replace("+00:00", "Z")
)

red = redact_document(doc)
# Assign a stable anonymous session ID
orig_sid = doc.get("sessionId", key_f or "unknown")
if orig_sid not in session_id_map:
session_counter += 1
session_id_map[orig_sid] = f"anon_{session_counter:04d}"

red = redact_document(doc, session_id_map[orig_sid])
red["eventType"] = event_type # ensure consistent typing
red["timestamp"] = ts # ensure normalized

Expand Down Expand Up @@ -358,6 +485,7 @@ def main() -> None:

write_schema_doc(out_root)
write_license(out_root)
write_readme(out_root, stats)
write_metadata(
out_root,
stats,
Expand Down
19 changes: 14 additions & 5 deletions src/analysis/logloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,12 @@
import pandas as pd
from classifier import classify_ip, classify_ua

DEFAULT_BASE_PATH = Path(__file__).resolve().parents[2] / "reports" / "monitor-logs"
DEFAULT_BASE_PATH = Path(
os.environ.get(
"LOGS_ROOT",
str(Path(__file__).resolve().parents[2] / "reports" / "monitor-logs"),
)
)
DEFAULT_MIN_DATE = "20250705"


Expand Down Expand Up @@ -159,18 +164,22 @@ def augment_dataframe(events_df: pd.DataFrame) -> None:
event_types: list[str] = events_df["eventType"].tolist()
payloads: list[dict[str, Any]] = events_df["payload"].tolist()

# The client IP from the server_context
# The client IP from the server_context (empty for anonymized data)
ip_values: list[str] = [ctx.get("client_ip", "") for ctx in server_contexts]
events_df["ip"] = ip_values
# The origin label classified from the IP
events_df["origin"] = [classify_ip(ip) for ip in ip_values]
# The origin label: use pre-computed if available, otherwise classify from IP
events_df["origin"] = [
ctx.get("origin") or classify_ip(ip)
for ctx, ip in zip(server_contexts, ip_values)
]
# The user agent from payload for "sessionStart" event types
ua_values: list[str | None] = [
payload.get("userAgent") if etype == "sessionStart" else None
for etype, payload in zip(event_types, payloads)
]
events_df["ua"] = ua_values
# The classified user agent label
# The classified user agent label: if ua is already a class label (anonymized),
# classify_ua will return it as-is or map it; for raw ua strings it classifies.
events_df["ua_class"] = [
classify_ua(ua) if ua is not None else "??" for ua in ua_values
]
Expand Down
2 changes: 1 addition & 1 deletion src/analysis/tabulate_articles.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ def analyze_articles(logs_root: Path) -> pd.DataFrame:
records = []

# Find all article files
article_files = list(logs_root.rglob("*-article.json"))
article_files = list(logs_root.rglob("*-article*.json"))
print(f"Found {len(article_files)} article files")

for article_file in article_files:
Expand Down
4 changes: 2 additions & 2 deletions src/analysis/tabulate_tokens.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ def find_corresponding_request(response_file: Path, query_id: str) -> Path | Non
parent_dir = response_file.parent

# Try to find request file with the same query ID in the name or content
for request_file in parent_dir.glob("*-request.json"):
for request_file in parent_dir.glob("*-request*.json"):
try:
data = json.loads(request_file.read_text(encoding="utf-8"))
if data.get("payload", {}).get("queryId") == query_id:
Expand All @@ -147,7 +147,7 @@ def analyze_tokens(logs_root: Path) -> pd.DataFrame:
records = []

# Find all response files
response_files = list(logs_root.rglob("*-response.json"))
response_files = list(logs_root.rglob("*-response*.json"))
print(f"Found {len(response_files)} response files")

for response_file in response_files:
Expand Down