From 79abb3c6fd268fee477c8251de7227fde6325a56 Mon Sep 17 00:00:00 2001
From: Niloth P <20315308+Niloth-p@users.noreply.github.com>
Date: Thu, 21 Nov 2024 20:26:19 +0530
Subject: [PATCH 1/7] rss-bot: Rename feed_file to feed_hashes_file.

rss-bot had two different feed_file variables:
1. The user-provided file with the list of feed URLs.
2. The per-feed file that stores the hashes of that feed's entries.

To clearly differentiate between them, the latter has been renamed to
feed_hashes_file.
---
 zulip/integrations/rss/rss-bot | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/zulip/integrations/rss/rss-bot b/zulip/integrations/rss/rss-bot
index 49c82fb62..faca40b51 100755
--- a/zulip/integrations/rss/rss-bot
+++ b/zulip/integrations/rss/rss-bot
@@ -209,10 +209,12 @@ client: zulip.Client = zulip.Client(
 first_message = True
 
 for feed_url in feed_urls:
-    feed_file = os.path.join(opts.data_dir, urllib.parse.urlparse(feed_url).netloc)  # Type: str
+    feed_hashes_file = os.path.join(
+        opts.data_dir, urllib.parse.urlparse(feed_url).netloc
+    )  # Type: str
 
     try:
-        with open(feed_file) as f:
+        with open(feed_hashes_file) as f:
             old_feed_hashes = {line.strip(): True for line in f.readlines()}
     except OSError:
         old_feed_hashes = {}
@@ -256,7 +258,7 @@ for feed_url in feed_urls:
             new_hashes.append(entry_hash)
             first_message = False
 
-    with open(feed_file, "a") as f:
+    with open(feed_hashes_file, "a") as f:
         for hash in new_hashes:
             f.write(hash + "\n")
 

From e793b3f8bb6803d6a1da29719f101490686f3f2b Mon Sep 17 00:00:00 2001
From: Niloth P <20315308+Niloth-p@users.noreply.github.com>
Date: Thu, 21 Nov 2024 20:30:41 +0530
Subject: [PATCH 2/7] rss-bot: Assign feed_name only once per feed URL.

Previously, it was being set for every entry.
---
 zulip/integrations/rss/rss-bot | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/zulip/integrations/rss/rss-bot b/zulip/integrations/rss/rss-bot
index faca40b51..c58b57595 100755
--- a/zulip/integrations/rss/rss-bot
+++ b/zulip/integrations/rss/rss-bot
@@ -221,6 +221,7 @@ for feed_url in feed_urls:
 
     new_hashes: List[str] = []
     data = feedparser.parse(feed_url)
+    feed_name: str = data.feed.title or feed_url
 
     for entry in data.entries:
         entry_hash = compute_entry_hash(entry)
@@ -243,8 +244,6 @@ for feed_url in feed_urls:
             # entries in reverse chronological order.
             break
 
-        feed_name: str = data.feed.title or feed_url
-
         response: Dict[str, Any] = send_zulip(entry, feed_name)
         if response["result"] != "success":
             logger.error("Error processing %s", feed_url)

From 8077d37e9479f3352003840f85442fecf5ad3acf Mon Sep 17 00:00:00 2001
From: Niloth P <20315308+Niloth-p@users.noreply.github.com>
Date: Thu, 21 Nov 2024 20:52:57 +0530
Subject: [PATCH 3/7] rss-bot: Introduce a max_batch_size option to prevent
 spamming messages.

---
 zulip/integrations/rss/rss-bot | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/zulip/integrations/rss/rss-bot b/zulip/integrations/rss/rss-bot
index c58b57595..61322dbda 100755
--- a/zulip/integrations/rss/rss-bot
+++ b/zulip/integrations/rss/rss-bot
@@ -92,6 +92,14 @@ parser.add_argument(
     help="Convert $ to $$ (for KaTeX processing)",
     default=False,
 )
+parser.add_argument(
+    "--max-batch-size",
+    dest="max_batch_size",
+    type=int,
+    help="The maximum number of messages to send at once",
+    default=3,
+    action="store",
+)
 
 opts = parser.parse_args()
 
@@ -239,9 +247,9 @@ for feed_url in feed_urls:
         if entry_hash in old_feed_hashes:
             # We've already seen this. No need to process any older entries.
             break
-        if not old_feed_hashes and len(new_hashes) >= 3:
-            # On a first run, pick up the 3 most recent entries. An RSS feed has
-            # entries in reverse chronological order.
+        if not old_feed_hashes and len(new_hashes) >= opts.max_batch_size:
+            # On a first run, pick up the opts.max_batch_size most recent entries.
+            # An RSS feed has entries in reverse chronological order.
             break
 
         response: Dict[str, Any] = send_zulip(entry, feed_name)

From ea360c3d952e27ca4acaa0a44915fbd52e6c62de Mon Sep 17 00:00:00 2001
From: Niloth P <20315308+Niloth-p@users.noreply.github.com>
Date: Fri, 22 Nov 2024 15:03:37 +0530
Subject: [PATCH 4/7] rss-bot: Introduce an earliest_entry_age option to
 establish a cutoff.

This replaces the OLDNESS_THRESHOLD constant that was previously used
for the same purpose.
---
 zulip/integrations/rss/rss-bot | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/zulip/integrations/rss/rss-bot b/zulip/integrations/rss/rss-bot
index 61322dbda..c4904a942 100755
--- a/zulip/integrations/rss/rss-bot
+++ b/zulip/integrations/rss/rss-bot
@@ -23,7 +23,6 @@ import zulip
 
 VERSION = "0.9"
 RSS_DATA_DIR = os.path.expanduser(os.path.join("~", ".cache", "zulip-rss"))
-OLDNESS_THRESHOLD = 30
 
 usage = """Usage: Send summaries of RSS entries for your favorite feeds to Zulip.
 
@@ -100,6 +99,14 @@ parser.add_argument(
     default=3,
     action="store",
 )
+parser.add_argument(
+    "--earliest-entry-age",
+    dest="earliest_entry_age",
+    type=int,
+    help="The maximum age (in days) of entries to process; older entries are skipped",
+    default=30,
+    action="store",
+)
 
 opts = parser.parse_args()
 
@@ -239,7 +246,7 @@ for feed_url in feed_urls:
         )
         if (
             entry_time is not None
-            and time.time() - calendar.timegm(entry_time) > OLDNESS_THRESHOLD * 60 * 60 * 24
+            and time.time() - calendar.timegm(entry_time) > opts.earliest_entry_age * 60 * 60 * 24
         ):
             # As a safeguard against misbehaving feeds, don't try to process
             # entries older than some threshold.

From e6cd92a6a6fa68eaedddcc987d1fac79bf058f96 Mon Sep 17 00:00:00 2001
From: Niloth P <20315308+Niloth-p@users.noreply.github.com>
Date: Thu, 18 Dec 2025 10:43:11 +0530
Subject: [PATCH 5/7] rss-bot: Split out get_entry_time() and entry_threshold.

We will use entry times to sort entries in the following commits.
---
 zulip/integrations/rss/rss-bot | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/zulip/integrations/rss/rss-bot b/zulip/integrations/rss/rss-bot
index c4904a942..414183b52 100755
--- a/zulip/integrations/rss/rss-bot
+++ b/zulip/integrations/rss/rss-bot
@@ -14,7 +14,7 @@ import sys
 import time
 import urllib.parse
 from html.parser import HTMLParser
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, List
 
 import feedparser
 from typing_extensions import override
@@ -187,6 +187,11 @@ def elide_subject(subject: str) -> str:
     return subject
 
 
+def get_entry_time(entry: Any) -> tuple[float, bool]:
+    entry_time = entry.get("published_parsed", entry.get("updated_parsed"))
+    return (calendar.timegm(entry_time), True) if entry_time else (float("-inf"), False)
+
+
 def send_zulip(entry: Any, feed_name: str) -> Dict[str, Any]:
     body: str = entry.summary
     if opts.unwrap:
@@ -237,17 +242,13 @@ for feed_url in feed_urls:
     new_hashes: List[str] = []
     data = feedparser.parse(feed_url)
     feed_name: str = data.feed.title or feed_url
+    # Safeguard: do not process entries older than this cutoff in unordered feeds
+    entry_threshold = time.time() - opts.earliest_entry_age * 60 * 60 * 24
 
     for entry in data.entries:
         entry_hash = compute_entry_hash(entry)
-        # An entry has either been published or updated.
-        entry_time: Optional[Tuple[int, int]] = entry.get(
-            "published_parsed", entry.get("updated_parsed")
-        )
-        if (
-            entry_time is not None
-            and time.time() - calendar.timegm(entry_time) > opts.earliest_entry_age * 60 * 60 * 24
-        ):
+        entry_time, is_time_tagged = get_entry_time(entry)
+        if (is_time_tagged and entry_time < entry_threshold) or entry_hash in old_feed_hashes:
             # As a safeguard against misbehaving feeds, don't try to process
             # entries older than some threshold.
             continue

From 761e27d542acf24a83179232349284b36df15eef Mon Sep 17 00:00:00 2001
From: Niloth P <20315308+Niloth-p@users.noreply.github.com>
Date: Thu, 18 Dec 2025 10:45:04 +0530
Subject: [PATCH 6/7] rss-bot: Support unordered RSS feeds.

The logic is now split into two loops: one that processes all the
entries in the feed, and another that posts only the latest ones, in
chronological order.

Instead of tracking new_hashes in memory while processing the feed, we
now track unhashed_entries, since we only record hashes for the entries
that we actually post.

Fixes #831.
---
 zulip/integrations/rss/rss-bot | 56 ++++++++++++++++------------------
 1 file changed, 27 insertions(+), 29 deletions(-)

diff --git a/zulip/integrations/rss/rss-bot b/zulip/integrations/rss/rss-bot
index 414183b52..7cb02146a 100755
--- a/zulip/integrations/rss/rss-bot
+++ b/zulip/integrations/rss/rss-bot
@@ -226,8 +226,6 @@ client: zulip.Client = zulip.Client(
     client="ZulipRSS/" + VERSION,
 )
 
-first_message = True
-
 for feed_url in feed_urls:
     feed_hashes_file = os.path.join(
         opts.data_dir, urllib.parse.urlparse(feed_url).netloc
@@ -239,7 +237,7 @@ for feed_url in feed_urls:
     except OSError:
         old_feed_hashes = {}
 
-    new_hashes: List[str] = []
+    unhashed_entries: List[tuple[Any, str, float]] = []
     data = feedparser.parse(feed_url)
     feed_name: str = data.feed.title or feed_url
     # Safeguard: do not process entries older than this cutoff in unordered feeds
@@ -249,32 +247,32 @@ for feed_url in feed_urls:
         entry_hash = compute_entry_hash(entry)
         entry_time, is_time_tagged = get_entry_time(entry)
         if (is_time_tagged and entry_time < entry_threshold) or entry_hash in old_feed_hashes:
-            # As a safeguard against misbehaving feeds, don't try to process
-            # entries older than some threshold.
             continue
-        if entry_hash in old_feed_hashes:
-            # We've already seen this. No need to process any older entries.
-            break
-        if not old_feed_hashes and len(new_hashes) >= opts.max_batch_size:
-            # On a first run, pick up the opts.max_batch_size most recent entries.
-            # An RSS feed has entries in reverse chronological order.
-            break
-
-        response: Dict[str, Any] = send_zulip(entry, feed_name)
-        if response["result"] != "success":
-            logger.error("Error processing %s", feed_url)
-            logger.error("%s", response)
-            if first_message:
-                # This is probably some fundamental problem like the stream not
-                # existing or something being misconfigured, so bail instead of
-                # getting the same error for every RSS entry.
-                log_error_and_exit("Failed to process first message")
-            # Go ahead and move on -- perhaps this entry is corrupt.
-        new_hashes.append(entry_hash)
-        first_message = False
+        unhashed_entries.append((entry, entry_hash, entry_time))
 
-    with open(feed_hashes_file, "a") as f:
-        for hash in new_hashes:
-            f.write(hash + "\n")
+    # We process all entries to support unordered feeds,
+    # but post only the latest ones in chronological order.
+    sorted_entries = sorted(unhashed_entries, key=lambda x: x[2])[-opts.max_batch_size :]
 
-    logger.info("Sent zulips for %d %s entries", len(new_hashes), feed_url)
+    with open(feed_hashes_file, "a") as f:
+        for entry_tuple in sorted_entries:
+            entry, entry_hash, _ = entry_tuple
+
+            response: Dict[str, Any] = send_zulip(entry, feed_name)
+            if response["result"] != "success":
+                logger.error("Error processing %s", feed_url)
+                logger.error("%s", response)
+                if not old_feed_hashes and entry_tuple == sorted_entries[0]:
+                    # This is probably some fundamental problem like the stream not
+                    # existing or something being misconfigured, so bail instead of
+                    # getting the same error for every RSS entry.
+                    log_error_and_exit("Failed to process first message")
+                # Go ahead and move on -- perhaps this entry is corrupt.
+            f.write(entry_hash + "\n")
+
+    logger.info(
+        "Processed %d entries from %s and sent %d zulips",
+        len(unhashed_entries),
+        feed_url,
+        len(sorted_entries),
+    )

From 91b42ae18a73eadf286ee321ccf1c9341d6d5b73 Mon Sep 17 00:00:00 2001
From: Niloth P <20315308+Niloth-p@users.noreply.github.com>
Date: Tue, 20 Jan 2026 06:07:54 +0530
Subject: [PATCH 7/7] rss-bot: Handle RSS feeds without a title tag safely.

Co-authored-by: Pritesh-30
---
 zulip/integrations/rss/rss-bot | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/zulip/integrations/rss/rss-bot b/zulip/integrations/rss/rss-bot
index 7cb02146a..5ab920c24 100755
--- a/zulip/integrations/rss/rss-bot
+++ b/zulip/integrations/rss/rss-bot
@@ -239,7 +239,7 @@ for feed_url in feed_urls:
 
     unhashed_entries: List[tuple[Any, str, float]] = []
     data = feedparser.parse(feed_url)
-    feed_name: str = data.feed.title or feed_url
+    feed_name: str = getattr(data.feed, "title", None) or feed_url
     # Safeguard: do not process entries older than this cutoff in unordered feeds
    entry_threshold = time.time() - opts.earliest_entry_age * 60 * 60 * 24
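
Reviewer note (not part of the patches): a minimal standalone sketch of
how the batching from patch 6/7 is intended to behave, using
hypothetical entry data. get_entry_time() gives entries with no
published/updated timestamp a sort key of float("-inf"), so they sort
first and are the first to be dropped once a feed yields more than
max_batch_size new entries; whatever survives is posted
oldest-to-newest.

    # Hypothetical data: the tuples mirror (entry, entry_hash, entry_time)
    # as built by the bot, and max_batch_size mirrors the new CLI option.
    max_batch_size = 3
    unhashed_entries = [
        ("entry-a", "hash-a", 1700000000.0),   # time-tagged
        ("entry-b", "hash-b", float("-inf")),  # feed provided no timestamp
        ("entry-c", "hash-c", 1700000200.0),   # time-tagged, newest
        ("entry-d", "hash-d", 1700000100.0),   # time-tagged
    ]
    # Sort ascending by entry time, then keep only the newest
    # max_batch_size entries, as the patched loop does.
    sorted_entries = sorted(unhashed_entries, key=lambda x: x[2])[-max_batch_size:]
    # The untimed entry-b is dropped; the batch is posted oldest first.
    assert [e[0] for e in sorted_entries] == ["entry-a", "entry-d", "entry-c"]

One caveat worth flagging in review: --max-batch-size 0 would select
every entry rather than none, since lst[-0:] is the same slice as
lst[:].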