Common_Chronicle/config.env.example at main · Intelligent-Internet/Common_Chronicle · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
# Common Chronicle Configuration Example
# Copy this file to .env and fill in your actual values

# ===== OpenAI Configuration =====
# OpenAI API key for accessing OpenAI services
OPENAI_API_KEY=your_openai_api_key_here

# OpenAI API base URL (optional, defaults to https://api.openai.com/v1)
OPENAI_BASE_URL=https://api.openai.com/v1

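# Default model for the OpenAI-compatible endpoint configured in OPENAI_BASE_URL.
# The model must be one that endpoint actually serves (e.g. deepseek-chat needs a DeepSeek-compatible base URL).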
DEFAULT_OPENAI_MODEL=deepseek-chat

# ===== Google Gemini Configuration =====
# Google Gemini API key
GEMINI_API_KEY=your_gemini_api_key_here

# Default Gemini model to use
DEFAULT_GEMINI_MODEL=gemini-2.0-flash-lite

# ===== Ollama Configuration =====
# Ollama API base URL (for local models)
OLLAMA_BASE_URL=http://localhost:11434

# ===== LLM Provider Configuration =====
# Default LLM provider to use (openai, gemini, ollama)
DEFAULT_LLM_PROVIDER=openai

# ===== Database Configuration =====
# Database schema name
COMMON_CHRONICLE_SCHEMA=common_chronicle_test

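# Application database URL (asyncpg driver), followed by the same database as a
# plain postgresql:// URL for Alembic migrations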
COMMON_CHRONICLE_DATABASE_URL=postgresql+asyncpg://user:password@localhost:5432/common_chronicle
COMMON_CHRONICLE_DATABASE_URL_FOR_ALEMBIC=postgresql://user:password@localhost:5432/common_chronicle

# Dataset database URL (optional)
COMMON_CHRONICLE_DATASET_URL=postgresql+asyncpg://user:password@localhost:5432/common_chronicle_dataset

# ===== Timeout Configuration =====
# LLM extraction timeout in seconds
LLM_TIMEOUT_EXTRACT=120

# ===== LLM Token Configuration =====
# Maximum tokens for event extraction tasks
LLM_EVENT_EXTRACTION_MAX_TOKENS=32000

# Maximum tokens for event extraction retry attempts
LLM_EVENT_EXTRACTION_RETRY_MAX_TOKENS=65536

# Default maximum tokens for general LLM calls
LLM_DEFAULT_MAX_TOKENS=12800

# ===== Text Chunking Configuration =====
# Text length threshold to trigger chunking strategy
TEXT_CHUNK_SIZE_THRESHOLD=16000

# Size of each text chunk in characters
TEXT_CHUNK_SIZE=10000

# Overlap between chunks in characters
TEXT_CHUNK_OVERLAP=1000

# ===== Processing Timeout and Threshold Configuration =====
# Timeline generation timeout in seconds (default 10 minutes)
TIMELINE_GENERATION_TIMEOUT_SECONDS=600

# Single article processing timeout in seconds (default 2 minutes)
SINGLE_ARTICLE_TIMEOUT_SECONDS=120

# Minimum successful articles required to continue processing
MIN_SUCCESSFUL_ARTICLES_THRESHOLD=1

# ===== Wikipedia API Configuration =====
# Maximum number of retries for Wikipedia API calls
MAX_WIKI_RETRIES=5

# Initial delay between Wikipedia API retries in seconds
INITIAL_WIKI_RETRY_DELAY=0.5

# Maximum concurrent Wikipedia API requests
WIKI_API_SEMAPHORE_LIMIT=5

# Wikipedia API timeout (connection, read) in seconds
# Format: comma-separated values for tuple parsing
WIKI_API_TIMEOUT="5.0,60.0"

# User-Agent string for Wikipedia API compliance.
# Replace the "contact: unavailable" placeholder with a real contact URL or email address.
WIKI_API_USER_AGENT="CommonChronicleProject/0.1 (Generic Bot; contact: unavailable)"

# ===== Feature Flags =====
# Whether to enable the event merging step in timeline generation
ENABLE_EVENT_MERGER=true

# Whether to enable reuse of existing composite (synthetic) viewpoints to avoid redundant processing
REUSE_COMPOSITE_VIEWPOINT=true

# Whether to enable reuse of existing base (canonical) viewpoints to avoid redundant processing
REUSE_BASE_VIEWPOINT=true

# ===== Article Processing Configuration =====
# Default number of articles to process
DEFAULT_ARTICLE_LIMIT=10

# ===== Task Management Configuration =====
# Hours after which a processing task is considered stuck
STUCK_TASK_TIMEOUT_HOURS=1

# ===== Timeline Orchestrator Configuration =====
# Relevance threshold for timeline orchestrator initialization
TIMELINE_RELEVANCE_THRESHOLD=0.6

# Batch size for timeline processing
TIMELINE_BATCH_SIZE=50

# Relevance threshold for event merger service
EVENT_MERGER_RELEVANCE_THRESHOLD=0.45

# ===== Event Merger Algorithm Configuration =====
# Minimum number of common entities required for LLM matching
EVENT_MERGER_MIN_COMMON_ENTITIES=1

# Minimum match score threshold for LLM consideration
EVENT_MERGER_LLM_SCORE_THRESHOLD=15.0

# Entity overlap ratio threshold for rule-based matching
EVENT_MERGER_RULE_OVERLAP_RATIO=0.75

# Number of candidate groups to process concurrently in LLM matching
EVENT_MERGER_CONCURRENT_WINDOW_SIZE=3

# Maximum number of concurrent LLM requests across all event processing
EVENT_MERGER_MAX_CONCURRENT_REQUESTS=10

# ===== Event Merger Embedding Configuration =====
# Enable embedding-based event merging for better performance
EVENT_MERGER_USE_EMBEDDING=true

# SentenceTransformer model name for event embedding (768 dimensions)
EVENT_MERGER_EMBEDDING_MODEL=Snowflake/snowflake-arctic-embed-m-v2.0

# Cosine similarity threshold for embedding-based merging
EVENT_MERGER_EMBEDDING_SIMILARITY_THRESHOLD=0.80

# Maximum number of embeddings to cache in memory
EVENT_MERGER_EMBEDDING_CACHE_SIZE=10000

# Use hybrid mode: embedding for bulk + LLM for uncertain cases
EVENT_MERGER_HYBRID_MODE=true

# High similarity threshold for skipping LLM in hybrid mode
EVENT_MERGER_HYBRID_LLM_THRESHOLD=0.90

# ===== Embedding Model Trust Configuration =====
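# Whether to trust remote code when loading embedding models (required for Snowflake model).
# NOTE: this presumably allows code shipped with the model repository to run locally;
# enable it only for models you trust.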
TRUST_REMOTE_CODE_FOR_EMBEDDINGS=true

# ===== Article Filtering Configuration =====
# Relevance threshold for filtering articles by relevance
ARTICLE_FILTER_RELEVANCE_THRESHOLD=0.35

# ===== Server Configuration =====
# Server host address
SERVER_HOST=0.0.0.0

# Server port number
SERVER_PORT=8080

# MCP server port number (no spaces around "=", so the file also works when sourced by a shell)
MCP_SERVER_PORT=8081

# Number of uvicorn workers
SERVER_WORKERS=1

# ===== CORS Configuration =====
# CORS allowed origins (JSON array of origin URLs; replace https://your-domain.com with your own)
CORS_ALLOW_ORIGINS='["http://localhost:5173","http://localhost:3000","http://127.0.0.1:5173","https://your-domain.com"]'

# Whether to allow credentials in CORS requests
CORS_ALLOW_CREDENTIALS=true

# CORS allowed methods (comma-separated, or * to allow all)
CORS_ALLOW_METHODS=*

# CORS allowed headers (comma-separated, or * to allow all)
CORS_ALLOW_HEADERS=*