diff --git a/app/README.md b/app/README.md
index bd1d03b8..945e85de 100644
--- a/app/README.md
+++ b/app/README.md
@@ -8,6 +8,8 @@ Python 3.X
 pip install -r app/requirements.txt
 ```
 
+## On Linux
+
 ### Running
 
 To run anything that runs a Flask app, for example
@@ -16,6 +18,29 @@
 cd app
 gunicorn --workers 1 --threads 4 sw:app
 ```
 
+## On Windows
+
+```bash
+cd app
+waitress-serve --host=127.0.0.1 --port=8000 sw:app
+```
+
+### ⚠️ Note on file encoding on Windows
+
+Python on Windows does not default to UTF-8, so open text files with an explicit encoding. Change this:
+
+```python
+with open("myfile.txt", "r") as f:
+    data = f.read()
+```
+
+to this:
+
+```python
+with open("myfile.txt", "r", encoding="utf-8") as f:
+    data = f.read()
+```
+
 This will run the Small Web frontend app locally.
 Open http://127.0.0.1:8000 in browser to access.
diff --git a/app/requirements.txt b/app/requirements.txt
index aa36165f..f925a4ae 100644
--- a/app/requirements.txt
+++ b/app/requirements.txt
@@ -20,4 +20,5 @@ urllib3==2.6.3
 Werkzeug==3.0.6
 feedwerk==1.1.0
 python-dateutil
-pyopml==1.0.0
\ No newline at end of file
+pyopml==1.0.0
+waitress==3.0.2
diff --git a/app/sw.py b/app/sw.py
index 415af434..f794db50 100644
--- a/app/sw.py
+++ b/app/sw.py
@@ -41,7 +41,7 @@
 
 def compute_appreciated_version(urls_list):
     """Compute sha1 hash of sorted URLs for version tracking.
-    
+
     The version changes whenever the appreciated feed contents change
     (add/remove URLs, or if the canonical ordering changes).
     """
@@ -52,7 +52,7 @@
 
 def generate_appreciated_json():
     """Generate and cache JSON representation of appreciated feed.
-    
+
     Response format:
     {
         "version": "abc123...",  # sha1 hash of sorted URLs
@@ -63,10 +63,10 @@
     }
     """
     global appreciated_version, appreciated_json_cache, appreciated_json_gzip
-    
+
     # Compute version from current appreciated list
     appreciated_version = compute_appreciated_version(urls_app_cache)
-    
+
     # Build the urls array with minimal data per item
     urls_array = []
     for idx, entry in enumerate(urls_app_cache):
@@ -79,17 +79,17 @@
             "title": title or "",
             "author": author or "",
         })
-    
+
     # Build response object
     response_data = {
         "version": appreciated_version,
         "urls": urls_array,
     }
-    
+
     # Cache JSON and gzipped version
     appreciated_json_cache = json.dumps(response_data, separators=(",", ":"))
     appreciated_json_gzip = gzip.compress(appreciated_json_cache.encode("utf-8"))
-    
+
     return appreciated_json_cache
 
 
@@ -110,7 +110,7 @@
         updated=updated,
         author=author,
     )
-    
+
     # Also regenerate JSON cache when feed changes
     generate_appreciated_json()
 
@@ -193,6 +193,44 @@ def time_ago(timestamp):
 master_feed = False
 
+# Simple bot detection state: per-IP request counters
+_bot_detection = {}
+
+RATE_LIMIT = 30  # max requests per window
+WINDOW_SECONDS = 60  # rate-limit window length
+BLOCK_SECONDS = 300  # how long an offending IP stays blocked
+
+def is_bot_request():
+    """Return True if the request looks like a bot (User-Agent check plus per-IP rate limit)."""
+    ip = request.remote_addr or request.headers.get('X-Forwarded-For', '')
+    ua = request.headers.get('User-Agent', '') or ''
+    ua_lc = ua.lower()
+    bot_indicators = ['bot', 'crawl', 'spider', 'slurp', 'libcurl', 'wget', 'curl', 'python-urllib', 'httpclient', 'semrush', 'ahrefs']
+    if not ua or any(b in ua_lc for b in bot_indicators):
+        return True  # missing or known-crawler User-Agent
+    # Fixed-window rate limit per IP
+    now = datetime.utcnow()
+    rec = _bot_detection.get(ip)
+    if rec is None:
+        rec = {'start': now, 'count': 0, 'blocked_until': None}
+        _bot_detection[ip] = rec
+    if rec.get('blocked_until') and now < rec['blocked_until']:
+        return True  # still inside an active block
+    if (now - rec['start']).total_seconds() > WINDOW_SECONDS:
+        rec['start'] = now  # window elapsed: start a fresh one
+        rec['count'] = 0
+    rec['count'] += 1
+    if rec['count'] > RATE_LIMIT:
+        rec['blocked_until'] = now + timedelta(seconds=BLOCK_SECONDS)
+        return True
+    return False
+
+@app.before_request
+def _bot_before_request():
+    if is_bot_request():
+        return make_response('Bot detected', 429)
+
+
 def update_all():
     global urls_cache, urls_app_cache, urls_yt_cache, urls_gh_cache, urls_comic_cache, urls_flagged_cache, master_feed, favorites_dict, appreciated_feed
 
 
@@ -300,7 +338,7 @@ def update_entries(url):
 
 def load_public_suffix_list(file_path):
     public_suffix_list = set()
-    with open(file_path, "r") as f:
+    with open(file_path, "r", encoding="utf-8") as f:
         for line in f:
             line = line.strip()
             if line and not line.startswith("//"):
@@ -452,7 +490,7 @@ def index():
 
         # get flagged content
         flag_content_count = flagged_content_dict.get(url, 0)
-        
+
         if url.startswith("http://"):
             url = url.replace(
@@ -525,7 +563,7 @@ def favorite():
 
     # Update urls_app_cache with the new favorite from both regular and YouTube feeds
     urls_app_cache = [e for e in (urls_cache + urls_yt_cache) if e[0] in favorites_dict]
-    
+
     # Regenerate the appreciated feed
     generate_appreciated_feed()
 
@@ -637,10 +675,10 @@ def appreciated():
 @app.route("/smallweb/appreciated.json")
 def appreciated_json():
     """Full appreciated feed as JSON for client-side random selection.
-    
+
     Returns the complete list of appreciated URLs with version info.
     Supports ETag for conditional requests (304 Not Modified).
-    
+
     Response:
     {
         "version": "abc123...",
@@ -651,11 +689,11 @@
     }
     """
     global appreciated_version, appreciated_json_cache, appreciated_json_gzip
-    
+
     # Ensure cache exists
     if appreciated_json_cache is None:
         generate_appreciated_json()
-    
+
     # Check for conditional request (ETag)
     etag = f'"{appreciated_version}"'
     if_none_match = request.headers.get("If-None-Match")
@@ -664,7 +702,7 @@
         response.headers["ETag"] = etag
         response.headers["Access-Control-Allow-Origin"] = "*"
         return response
-    
+
     # Check if client accepts gzip
     accept_encoding = request.headers.get("Accept-Encoding", "")
     if "gzip" in accept_encoding and appreciated_json_gzip:
@@ -672,7 +710,7 @@
         response.headers["Content-Encoding"] = "gzip"
     else:
         response = make_response(appreciated_json_cache)
-    
+
     response.headers["Content-Type"] = "application/json"
     response.headers["ETag"] = etag
     response.headers["Cache-Control"] = "public, max-age=300"  # cache for 5 min
@@ -849,7 +887,7 @@ def save_all_data():
         print(f"[DEBUG] Saved {len(flagged_content_dict)} flagged items")
     except Exception as e:
         print(f"Error saving flagged content: {e}")
-    
+
 
 atexit.register(save_all_data)
 atexit.register(lambda: scheduler.shutdown())
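A note for reviewers: the new `is_bot_request` check can be exercised with Flask's built-in test client. The sketch below is hypothetical and not part of the diff; it assumes `sw` imports cleanly outside gunicorn/waitress (importing it runs the module-level feed setup).

```python
from sw import app  # assumption: module-level setup in sw.py succeeds locally

client = app.test_client()

# A crawler-style User-Agent should be rejected outright with 429.
resp = client.get("/", headers={"User-Agent": "ExampleBot/1.0"})
assert resp.status_code == 429

# A browser-style User-Agent passes until it exceeds RATE_LIMIT (30) requests
# within WINDOW_SECONDS (60s); after that the IP is blocked for BLOCK_SECONDS.
ua = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64)"}
for _ in range(31):
    resp = client.get("/", headers=ua)
assert resp.status_code == 429
```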
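The body of `compute_appreciated_version` is not shown here (only its docstring changes), but per that docstring the version is a sha1 over the sorted URL list. A minimal illustration of the idea, with a hypothetical function name, for anyone mirroring the version check client-side:

```python
import hashlib

def version_of(urls):
    # Illustration only: sha1 over the canonically sorted URL list, as the
    # docstring describes; insertion order therefore does not matter.
    return hashlib.sha1("\n".join(sorted(urls)).encode("utf-8")).hexdigest()

# The same set of URLs always yields the same version string.
assert version_of(["b.example", "a.example"]) == version_of(["a.example", "b.example"])
```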
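Finally, `/smallweb/appreciated.json` supports conditional requests, so clients can poll cheaply. A sketch of that flow using the third-party `requests` package (an assumption for illustration, not a dependency added by this PR), against a local instance started as in the README:

```python
import requests

BASE = "http://127.0.0.1:8000"

# First fetch: full JSON body plus an ETag derived from the version hash.
# requests transparently decompresses the gzipped payload.
r = requests.get(f"{BASE}/smallweb/appreciated.json")
etag = r.headers["ETag"]
data = r.json()
print(data["version"], len(data["urls"]))

# Revalidate with If-None-Match: an unchanged feed returns 304 with no body.
r = requests.get(f"{BASE}/smallweb/appreciated.json",
                 headers={"If-None-Match": etag})
print(r.status_code)  # 304 until the appreciated feed changes
```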