-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathingest.py
More file actions
292 lines (250 loc) · 12.6 KB
/
ingest.py
File metadata and controls
292 lines (250 loc) · 12.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
"""
Ingestion Module: Handles fetching data from GitHub and Notion, and managing processed state.
"""
import json
import logging
import os
import re
from datetime import datetime, timedelta, timezone
from typing import Optional

from github import Github, RateLimitExceededException, UnknownObjectException, GithubException
from notion_client import Client
from tenacity import retry, wait_exponential, stop_after_attempt, retry_if_exception_type, before_log, after_log
# Configure logging for this module
logger = logging.getLogger(__name__)
class Ingester:
    """
    Manages data ingestion from GitHub and Notion, and tracks processed items.

    Processed commit SHAs are persisted to a JSON state file so that
    incremental runs can skip commits handled by earlier runs.
    """

    def __init__(self, github_token: str, notion_token: Optional[str], state_file: str = 'processed_state.json'):
        """
        Initializes the Ingester with API tokens and state file path.

        Args:
            github_token (str): GitHub personal access token.
            notion_token (Optional[str]): Notion integration token; if falsy,
                Notion ingestion is disabled.
            state_file (str): Path to the JSON file storing processed item SHAs/IDs.
        """
        self.github_client = Github(github_token)
        # Notion is optional: without a token every Notion call is skipped.
        self.notion_client = Client(auth=notion_token) if notion_token else None
        self.state_file = state_file
        self.processed_shas = self._load_processed_shas()

    def _load_processed_shas(self) -> set:
        """
        Loads the set of already processed SHAs from the state file.

        Returns:
            set: The processed SHAs (empty if the file is missing or corrupt).
        """
        if not os.path.exists(self.state_file):
            return set()
        try:
            with open(self.state_file, 'r') as f:
                return set(json.load(f))
        except json.JSONDecodeError:
            # A truncated/corrupt state file must not abort ingestion;
            # fall back to reprocessing everything.
            logger.warning(f"Corrupted state file {self.state_file}. Starting with empty state.")
            return set()

    def mark_as_processed(self, sha: str):
        """Adds a SHA to the processed set and saves the state file."""
        self.processed_shas.add(sha)
        self._save_processed_shas()

    def _save_processed_shas(self):
        """
        Saves the current set of processed SHAs to the state file.

        The list is sorted so the on-disk file is deterministic and
        diff-friendly; failure to write is logged but not fatal.
        """
        try:
            with open(self.state_file, 'w') as f:
                json.dump(sorted(self.processed_shas), f, indent=4)
        except IOError as e:
            logger.error(f"Failed to save processed SHAs to {self.state_file}: {e}")

    def _get_notion_page_title(self, page: dict) -> str:
        """
        Extracts the title from a Notion page object by finding the 'title' property type.
        This is a robust way to get the title regardless of its property name.

        Args:
            page (dict): The Notion page object.

        Returns:
            str: The concatenated title text or "Untitled".
        """
        properties = page.get("properties", {})
        for prop_value in properties.values():
            if prop_value.get("type") == "title":
                # A title is a list of rich-text fragments; join their plain text.
                title_parts = [part.get("plain_text", "") for part in prop_value.get("title", [])]
                return "".join(title_parts)
        return "Untitled"

    # reraise=True makes tenacity re-raise the original exception after the
    # final attempt (instead of wrapping it in RetryError), so the specific
    # except clauses in fetch_github_commits can actually catch it.
    @retry(wait=wait_exponential(multiplier=1, min=4, max=10),
           stop=stop_after_attempt(5),
           retry=retry_if_exception_type((RateLimitExceededException, GithubException)),
           before_sleep=before_log(logger, logging.INFO),
           after=after_log(logger, logging.WARNING),
           reraise=True)
    def _get_github_repo(self, repo_name: str):
        """
        Helper to get GitHub repository with retry logic.
        """
        return self.github_client.get_repo(repo_name)

    @retry(wait=wait_exponential(multiplier=1, min=4, max=10),
           stop=stop_after_attempt(5),
           retry=retry_if_exception_type((RateLimitExceededException, GithubException)),
           before_sleep=before_log(logger, logging.INFO),
           after=after_log(logger, logging.WARNING),
           reraise=True)
    def _get_github_commit(self, repo, sha: str):
        """
        Helper to get a specific GitHub commit with retry logic.
        """
        return repo.get_commit(sha)

    def fetch_github_commits(self, repo_name: str, since_days: int = 0, batch_mode: bool = False) -> list:
        """
        Fetches commits from a GitHub repository.

        Args:
            repo_name (str): The full name of the GitHub repository (e.g., 'owner/repo').
            since_days (int): Number of days to look back for new commits. If 0, fetches all.
            batch_mode (bool): If True, fetches all historical commits regardless of `since_days`
                               and does not filter by processed SHAs. Used for initial setup.

        Returns:
            list: A list of dictionaries, each representing a commit with relevant details,
                  ordered oldest-first.
        """
        logger.info(f"Fetching GitHub commits for {repo_name}...")
        commits_data = []
        try:
            repo = self._get_github_repo(repo_name)
            if batch_mode or since_days <= 0:
                # Walk the full history; PyGithub's PaginatedList handles
                # pagination lazily. (since_days == 0 means "fetch all",
                # matching the documented contract.)
                commits = repo.get_commits()
            else:
                # Use an aware UTC timestamp: the GitHub API interprets
                # 'since' as UTC, so a naive local datetime would be off
                # by the local UTC offset.
                since_date = datetime.now(timezone.utc) - timedelta(days=since_days)
                commits = repo.get_commits(since=since_date)
            for commit in commits:
                if not batch_mode and commit.sha in self.processed_shas:
                    logger.info(f"Skipping already processed commit: {commit.sha[:8]}")
                    continue  # Skip already processed commits in incremental mode
                try:
                    # Fetch full commit details to get file changes (diffs)
                    full_commit = self._get_github_commit(repo, commit.sha)
                    files_changed = [{
                        'filename': file.filename,
                        'status': file.status,
                        'additions': file.additions,
                        'deletions': file.deletions,
                        'changes': file.changes,
                        'raw_url': file.raw_url,  # URL to fetch raw content for diff if needed
                        'patch': file.patch  # The actual diff content
                    } for file in full_commit.files]
                    commits_data.append({
                        'sha': commit.sha,
                        'message': commit.commit.message,
                        'author': commit.commit.author.name,
                        'date': commit.commit.author.date.isoformat(),
                        'url': commit.html_url,
                        'files': files_changed
                    })
                    # Track in memory only; persistence happens via
                    # mark_as_processed so callers control when state is saved.
                    self.processed_shas.add(commit.sha)
                except UnknownObjectException:
                    logger.warning(f"Commit {commit.sha} not found or accessible. Skipping.")
                except GithubException as e:
                    logger.error(f"GitHub API error processing commit {commit.sha}: {e}")
                except Exception as e:
                    logger.error(f"Unexpected error processing commit {commit.sha}: {e}", exc_info=True)
            logger.info(f"Fetched {len(commits_data)} new/unprocessed commits.")
            # Reverse the list to process commits from oldest to newest
            commits_data.reverse()
            logger.info("Commits sorted chronologically (oldest first).")
            return commits_data
        except GithubException as e:
            logger.critical(f"Failed to fetch GitHub repository {repo_name} due to API error: {e}")
            return []
        except Exception as e:
            logger.critical(f"An unexpected error occurred during GitHub ingestion: {e}", exc_info=True)
            return []

    def fetch_notion_notes(self, database_id: str, since_date: Optional[datetime]) -> dict:
        """
        Fetches notes (pages) from a Notion database.

        Only pages whose title contains a 7-character hex commit SHA are kept.

        Args:
            database_id (str): The ID of the Notion database.
            since_date (Optional[datetime]): Only fetch notes edited on/after this date.

        Returns:
            dict: Maps a lowercase 7-character short SHA to a dict with the
                  page's id, title, content, last_edited_time and url.
                  Empty on error or when Notion is not configured.
        """
        if not self.notion_client:
            logger.warning("Notion client not initialized. Skipping Notion ingestion.")
            return {}
        logger.info(f"Fetching Notion notes from database {database_id}...")
        notes_data = {}
        try:
            filter_obj = None
            if since_date:
                # Filter server-side on the page's last_edited_time timestamp.
                filter_obj = {
                    "timestamp": "last_edited_time",
                    "last_edited_time": {
                        "on_or_after": since_date.isoformat()
                    }
                }
            # Cursor-based pagination: keep querying until has_more is False.
            has_more = True
            start_cursor = None
            while has_more:
                response = self.notion_client.databases.query(
                    database_id=database_id,
                    filter=filter_obj,
                    start_cursor=start_cursor
                )
                for page in response["results"]:  # type: ignore
                    page_id = page["id"]
                    title = self._get_notion_page_title(page)
                    # Find a 7-character commit SHA in the title
                    match = re.search(r'\b([0-9a-f]{7})\b', title, re.IGNORECASE)
                    if not match:
                        logger.info(f"Skipping Notion note '{title}' as it does not contain a 7-character commit SHA.")
                        continue
                    commit_sha_short = match.group(1).lower()
                    logger.info(f"Found commit SHA '{commit_sha_short}' in Notion note '{title}'.")
                    # Fetch block content (paragraph blocks only)
                    content_blocks = self.notion_client.blocks.children.list(block_id=page_id)
                    page_content = ""
                    for block in content_blocks["results"]:  # type: ignore
                        if "type" in block and block["type"] == "paragraph" and "rich_text" in block["paragraph"]:
                            for text_obj in block["paragraph"]["rich_text"]:
                                if "plain_text" in text_obj:
                                    page_content += text_obj["plain_text"] + "\n"
                    notes_data[commit_sha_short] = {
                        'id': page_id,
                        'title': title,
                        'content': page_content,
                        'last_edited_time': page["last_edited_time"],
                        'url': page["url"]
                    }
                has_more = response["has_more"]  # type: ignore
                start_cursor = response["next_cursor"]  # type: ignore
            logger.info(f"Fetched {len(notes_data)} Notion notes linked to commit SHAs.")
            return notes_data
        except Exception as e:
            logger.error(f"Error fetching Notion notes from database {database_id}: {e}", exc_info=True)
            return {}

    def get_processed_shas(self) -> set:
        """
        Returns the set of SHAs that have been processed.
        """
        return self.processed_shas
if __name__ == '__main__':
    # Configuration is supplied via environment variables; the repo and
    # database IDs fall back to placeholder values.
    GITHUB_TOKEN = os.getenv('GITHUB_TOKEN')
    NOTION_TOKEN = os.getenv('NOTION_TOKEN')
    GITHUB_REPO = os.getenv('GITHUB_REPO', 'your_github_username/your_repo_name')
    NOTION_DATABASE_ID = os.getenv('NOTION_DATABASE_ID', 'your_notion_database_id')

    if not GITHUB_TOKEN:
        print("Please set GITHUB_TOKEN environment variable.")
    else:
        ingester = Ingester(github_token=GITHUB_TOKEN, notion_token=NOTION_TOKEN)  # type: ignore

        # Usage examples (disabled by default):
        #
        # Incremental fetch of the last week of commits:
        #   new_commits = ingester.fetch_github_commits(GITHUB_REPO, since_days=7)
        #
        # One-off batch fetch of the entire history:
        #   historical_commits = ingester.fetch_github_commits(GITHUB_REPO, batch_mode=True)
        #
        # Notion notes from the last 30 days (requires NOTION_TOKEN and a
        # real NOTION_DATABASE_ID):
        #   notion_notes = ingester.fetch_notion_notes(
        #       NOTION_DATABASE_ID, since_date=datetime.now() - timedelta(days=30))

        print("Processed SHAs:", ingester.get_processed_shas())