diff --git a/.github/workflows/dotnet.yml b/.github/workflows/dotnet.yml index 217f7cb..7fcd6dc 100644 --- a/.github/workflows/dotnet.yml +++ b/.github/workflows/dotnet.yml @@ -19,7 +19,7 @@ jobs: - name: Setup .NET uses: actions/setup-dotnet@v4 with: - dotnet-version: 8.0.x + dotnet-version: 9.0.x - name: Restore dependencies run: dotnet restore - name: Build diff --git a/LinkTracker.sln b/LinkTracker.sln index 802646f..141145b 100644 --- a/LinkTracker.sln +++ b/LinkTracker.sln @@ -11,6 +11,10 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "LinkTracker.Shared", "src\L EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "LinkTracker.Scrapper", "src\LinkTracker.Scrapper\LinkTracker.Scrapper.csproj", "{093435F1-2E61-4005-81AE-AC4CFE7C17AE}" EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "tests", "tests", "{0AB3BF05-4346-4AA6-1389-037BE0695223}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "LinkTracker.Scrapper.Tests", "tests\LinkTracker.Scrapper.Tests\LinkTracker.Scrapper.Tests.csproj", "{1789B984-4451-4B95-8101-AC885C600986}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -57,6 +61,18 @@ Global {093435F1-2E61-4005-81AE-AC4CFE7C17AE}.Release|x64.Build.0 = Release|Any CPU {093435F1-2E61-4005-81AE-AC4CFE7C17AE}.Release|x86.ActiveCfg = Release|Any CPU {093435F1-2E61-4005-81AE-AC4CFE7C17AE}.Release|x86.Build.0 = Release|Any CPU + {1789B984-4451-4B95-8101-AC885C600986}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {1789B984-4451-4B95-8101-AC885C600986}.Debug|Any CPU.Build.0 = Debug|Any CPU + {1789B984-4451-4B95-8101-AC885C600986}.Debug|x64.ActiveCfg = Debug|Any CPU + {1789B984-4451-4B95-8101-AC885C600986}.Debug|x64.Build.0 = Debug|Any CPU + {1789B984-4451-4B95-8101-AC885C600986}.Debug|x86.ActiveCfg = Debug|Any CPU + {1789B984-4451-4B95-8101-AC885C600986}.Debug|x86.Build.0 = Debug|Any CPU + {1789B984-4451-4B95-8101-AC885C600986}.Release|Any CPU.ActiveCfg = 
Release|Any CPU + {1789B984-4451-4B95-8101-AC885C600986}.Release|Any CPU.Build.0 = Release|Any CPU + {1789B984-4451-4B95-8101-AC885C600986}.Release|x64.ActiveCfg = Release|Any CPU + {1789B984-4451-4B95-8101-AC885C600986}.Release|x64.Build.0 = Release|Any CPU + {1789B984-4451-4B95-8101-AC885C600986}.Release|x86.ActiveCfg = Release|Any CPU + {1789B984-4451-4B95-8101-AC885C600986}.Release|x86.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE @@ -65,5 +81,6 @@ Global {5BDAA58E-C3C7-4A50-B867-04CD893D1D84} = {827E0CD3-B72D-47B6-A68D-7590B98EB39B} {58B21565-1CB0-407F-948D-05F51AF335DF} = {827E0CD3-B72D-47B6-A68D-7590B98EB39B} {093435F1-2E61-4005-81AE-AC4CFE7C17AE} = {827E0CD3-B72D-47B6-A68D-7590B98EB39B} + {1789B984-4451-4B95-8101-AC885C600986} = {0AB3BF05-4346-4AA6-1389-037BE0695223} EndGlobalSection EndGlobal diff --git a/README.md b/README.md index 8d33b81..4fa3454 100644 --- a/README.md +++ b/README.md @@ -1,40 +1,186 @@ -# LinkTracker Bot +# LinkTracker -LinkTracker is a microservice-based system that monitors GitHub repositories and StackOverflow questions, sending real-time update notifications via Telegram. 
+[![.NET](https://github.com/666mxvbee/LinkTracker/actions/workflows/dotnet.yml/badge.svg)](https://github.com/666mxvbee/LinkTracker/actions/workflows/dotnet.yml) +[![Docker Image CI](https://github.com/666mxvbee/LinkTracker/actions/workflows/docker-image.yml/badge.svg)](https://github.com/666mxvbee/LinkTracker/actions/workflows/docker-image.yml) +[![License](https://img.shields.io/github/license/666mxvbee/LinkTracker)](LICENSE) +[![Release](https://img.shields.io/github/v/release/666mxvbee/LinkTracker?include_prereleases&sort=semver)](https://github.com/666mxvbee/LinkTracker/releases) +[![.NET 9](https://img.shields.io/badge/.NET-9.0-512BD4?logo=dotnet)](https://dotnet.microsoft.com/) +[![PostgreSQL](https://img.shields.io/badge/PostgreSQL-16-4169E1?logo=postgresql&logoColor=white)](https://www.postgresql.org/) +[![Docker](https://img.shields.io/badge/Docker-Compose-2496ED?logo=docker&logoColor=white)](docker-compose.yml) +[![Tests](https://img.shields.io/badge/tests-xUnit%20%2B%20Testcontainers-5A2D82)](tests/LinkTracker.Scrapper.Tests) +[![Platform](https://img.shields.io/badge/platform-Windows%20%7C%20Linux%20%7C%20Docker-lightgrey)](docker-compose.yml) + +LinkTracker is a .NET 9 microservice application for tracking GitHub repositories and StackOverflow questions. The Bot service handles Telegram interaction, and the Scrapper service stores subscriptions, checks links on a schedule, and sends update notifications back to the Bot. ## Project Structure - src/LinkTracker.Bot - ASP.NET Core Web API for Telegram interaction. +```text +src/LinkTracker.Bot Telegram bot HTTP service +src/LinkTracker.Scrapper Subscription storage and scheduled update checker +src/LinkTracker.Shared Shared DTOs +migrations/ SQL migrations applied by Scrapper on startup +``` - src/LinkTracker.Scrapper - Quartz-based worker for resource monitoring. +## Prerequisites - src/LinkTracker.Shared - Common models and DTOs shared between services. 
+- .NET 9 SDK +- Docker Desktop +- Telegram bot token from BotFather -## Quick Start (Docker) +## Configuration -The easiest way to run the entire infrastructure is using Docker Compose. +Create `.env` in the repository root: -### 1. Prerequisites -* [Docker Desktop](https://www.docker.com/products/docker-desktop/) installed and running. -* A Telegram Bot Token from [@BotFather](https://t.me/botfather). +```env +TELEGRAM_BOT_TOKEN=your-telegram-bot-token -### 2. Configuration -Create a file named `.env` in the root directory of the project: +POSTGRES_DB=linktracker +POSTGRES_USER=linktracker +POSTGRES_PASSWORD=linktracker +``` -```env -TELEGRAM_BOT_TOKEN=your_token_here +`.env` is ignored by git. Use `.env.example` as the template. + +Scrapper database settings are in `src/LinkTracker.Scrapper/appsettings.json`: + +```json +"Database": { + "AccessType": "SQL", + "ConnectionString": "Host=localhost;Port=5433;Database=linktracker;Username=linktracker;Password=linktracker", + "RunMigrations": true +} ``` -Note: The .env file is ignored by git for security purposes. -### 3. Launch +`AccessType` can be: -Run the following command in the root folder: -Bash +```text +SQL raw SQL repositories via Npgsql +ORM EF Core repositories +``` + +Scheduler settings: + +```json +"Scrapper": { + "CheckIntervalSeconds": 30, + "BatchSize": 100, + "Parallelism": 4, + "GitHubBaseUrl": "https://api.github.com/", + "StackOverflowBaseUrl": "https://api.stackexchange.com/2.3/" +} +``` -docker-compose up --build +`BatchSize` is clamped by the application to `50..500`. `Parallelism` controls how many links are processed concurrently. 
-Once started: +## Run With Docker Compose + +Run all services: + +```powershell +docker compose up --build +``` + +Endpoints: + +```text +Bot API: http://localhost:5100 +Scrapper API: http://localhost:5000 +PostgreSQL: localhost:5433 +``` + +Inside Docker, Scrapper connects to PostgreSQL by service name: + +```text +Host=postgres;Port=5432 +``` + +## Run From IDE + +Start PostgreSQL first: + +```powershell +docker compose up postgres -d +``` + +Then run the services from IDE or terminal: + +```powershell +dotnet run --project src\LinkTracker.Scrapper +dotnet run --project src\LinkTracker.Bot +``` - Bot API: http://localhost:5100 +Scrapper applies SQL migrations from `migrations/` automatically when `Database:RunMigrations` is `true`. + +## Useful Manual Checks + +List database tables: + +```powershell +docker exec -e PGPASSWORD=linktracker linktracker-postgres psql -U linktracker -d linktracker -c "\dt" +``` + +Expected domain tables: + +```text +chats +links +chat_links +tags +chat_link_tags +``` + +DbUp also creates: + +```text +schemaversions +``` + +Open Scrapper Swagger: + +```text +http://localhost:5000/swagger +``` + +Basic API flow: + +```text +POST /tg-chat/{id} +POST /links with Tg-Chat-Id header +GET /links with Tg-Chat-Id header +DELETE /links with Tg-Chat-Id header +GET /tags +POST /tags +PUT /tags/{id} +DELETE /tags/{id} +``` + +## Update Checking + +Scrapper uses Quartz to periodically process tracked links in batches. + +For GitHub links, it detects new: + +```text +Issue +Pull request +``` + +For StackOverflow links, it detects new: + +```text +Answer +Question comment +Answer comment +``` + +Notifications include: + +```text +type of update +title +user name +creation time +text preview limited to 200 characters +``` - Scrapper API: http://localhost:5000 \ No newline at end of file +The notification sender is abstracted behind `IMessageSender`. 
The current implementation is HTTP from Scrapper to Bot; another implementation such as Kafka can be added later without changing the scheduler business logic. diff --git a/src/LinkTracker.Scrapper/Clients/GitHubClient.cs b/src/LinkTracker.Scrapper/Clients/GitHubClient.cs index b252e2a..9acda1f 100644 --- a/src/LinkTracker.Scrapper/Clients/GitHubClient.cs +++ b/src/LinkTracker.Scrapper/Clients/GitHubClient.cs @@ -1,35 +1,98 @@ using System.Text.Json; +using LinkTracker.Scrapper.Services.Updates; namespace LinkTracker.Scrapper.Clients; -public class GitHubClient(HttpClient httpClient, ILogger logger) +public class GitHubClient(HttpClient httpClient) : ILinkUpdateChecker { - public async Task GetLastUpdate(string owner, string repo) + public bool CanHandle(string url) { - try + return Uri.TryCreate(url, UriKind.Absolute, out var uri) + && uri.Host.Contains("github.com", StringComparison.OrdinalIgnoreCase); + } + + public async Task> CheckUpdatesAsync( + string url, + DateTimeOffset since, + CancellationToken cancellationToken) + { + if (!TryParseRepository(url, out var owner, out var repo)) { - httpClient.DefaultRequestHeaders.UserAgent.ParseAdd("LinkTrackerBot/1.0"); + return []; + } + + var issues = await FetchAsync(owner, repo, "issues", "Issue", since, cancellationToken); + var pulls = await FetchAsync(owner, repo, "pulls", "Pull request", since, cancellationToken); + + return issues + .Concat(pulls) + .OrderBy(update => update.CreatedAt) + .ToArray(); + } + + private async Task> FetchAsync( + string owner, + string repo, + string resource, + string kind, + DateTimeOffset since, + CancellationToken cancellationToken) + { + using var response = await httpClient.GetAsync( + $"repos/{owner}/{repo}/{resource}?state=all&sort=created&direction=desc&per_page=100", + cancellationToken); + + response.EnsureSuccessStatusCode(); + + var json = await response.Content.ReadFromJsonAsync(cancellationToken); - var response = await 
httpClient.GetAsync($"https://api.github.com/repos/{owner}/{repo}"); + var updates = new List(); - if (!response.IsSuccessStatusCode) + foreach (var item in json.EnumerateArray()) + { + if (resource == "issues" && item.TryGetProperty("pull_request", out _)) { - logger.LogWarning("GitHub API made error {Code} to {Owner}/{Repo}", response.StatusCode, owner, repo); - return null; + continue; } - var json = await response.Content.ReadFromJsonAsync(); + var createdAt = item.GetProperty("created_at").GetDateTimeOffset(); - if (json.TryGetProperty("pushed_at", out var dateProp)) + if (createdAt <= since) { - return dateProp.GetDateTimeOffset(); + continue; } + + updates.Add(new DetectedLinkUpdate( + Url: item.GetProperty("html_url").GetString() ?? string.Empty, + Kind: kind, + Title: item.GetProperty("title").GetString() ?? string.Empty, + UserName: item.GetProperty("user").GetProperty("login").GetString() ?? "unknown", + CreatedAt: createdAt, + Preview: PreviewBuilder.Build(item.GetProperty("body").GetString()))); } - catch (Exception ex) + + return updates; + } + + private static bool TryParseRepository(string url, out string owner, out string repo) + { + owner = string.Empty; + repo = string.Empty; + + if (!Uri.TryCreate(url, UriKind.Absolute, out var uri)) + { + return false; + } + + var parts = uri.AbsolutePath.Trim('/').Split('/', StringSplitOptions.RemoveEmptyEntries); + + if (parts.Length < 2) { - logger.LogError(ex, "Error GitHub API"); + return false; } - return null; + owner = parts[0]; + repo = parts[1]; + return true; } } \ No newline at end of file diff --git a/src/LinkTracker.Scrapper/Clients/StackOverflowClient.cs b/src/LinkTracker.Scrapper/Clients/StackOverflowClient.cs index 933a64c..f062e09 100644 --- a/src/LinkTracker.Scrapper/Clients/StackOverflowClient.cs +++ b/src/LinkTracker.Scrapper/Clients/StackOverflowClient.cs @@ -1,39 +1,188 @@ -using System.Text.Json; +using System.Text.Json; +using LinkTracker.Scrapper.Services.Updates; namespace 
LinkTracker.Scrapper.Clients; -public class StackOverflowClient(HttpClient httpClient, ILogger logger) +public class StackOverflowClient(HttpClient httpClient) : ILinkUpdateChecker { - public async Task GetLastUpdate(long questionId) + public bool CanHandle(string url) { - try + return Uri.TryCreate(url, UriKind.Absolute, out var uri) + && uri.Host.Contains("stackoverflow.com", StringComparison.OrdinalIgnoreCase); + } + + public async Task> CheckUpdatesAsync( + string url, + DateTimeOffset since, + CancellationToken cancellationToken) + { + if (!TryParseQuestionId(url, out var questionId)) + { + return []; + } + + var title = await GetQuestionTitle(questionId, cancellationToken); + + var answers = await FetchQuestionItems(questionId, "answers", "Answer", title, since, cancellationToken); + var questionComments = await FetchQuestionItems( + questionId, + "comments", + "Question comment", + title, + since, + cancellationToken); + var answerComments = await FetchAnswerComments(questionId, title, since, cancellationToken); + + return answers + .Concat(questionComments) + .Concat(answerComments) + .OrderBy(update => update.CreatedAt) + .ToArray(); + } + + private async Task GetQuestionTitle(long questionId, CancellationToken cancellationToken) + { + using var response = await httpClient.GetAsync( + $"questions/{questionId}?site=stackoverflow", + cancellationToken); + + response.EnsureSuccessStatusCode(); + + var json = await response.Content.ReadFromJsonAsync(cancellationToken); + var items = json.GetProperty("items"); + + return items.GetArrayLength() == 0 + ? $"StackOverflow question {questionId}" + : items[0].GetProperty("title").GetString() ?? 
$"StackOverflow question {questionId}"; + } + + private async Task> FetchQuestionItems( + long questionId, + string resource, + string kind, + string title, + DateTimeOffset since, + CancellationToken cancellationToken) + { + using var response = await httpClient.GetAsync( + $"questions/{questionId}/{resource}?site=stackoverflow&pagesize=100&order=desc&sort=creation&filter=withbody", + cancellationToken); + + response.EnsureSuccessStatusCode(); + + var json = await response.Content.ReadFromJsonAsync(cancellationToken); + var updates = new List(); + + foreach (var item in json.GetProperty("items").EnumerateArray()) { - var url = $"https://api.stackexchange.com/2.3/questions/{questionId}?site=stackoverflow"; - - var response = await httpClient.GetAsync(url); - - if (!response.IsSuccessStatusCode) - { - logger.LogWarning("StackOverflow API made error {Code} for {Id}", response.StatusCode, questionId); - return null; - } - - var json = await response.Content.ReadFromJsonAsync(); - - if (json.TryGetProperty("items", out var items) && items.GetArrayLength() > 0) - { - var firstItem = items[0]; - if (firstItem.TryGetProperty("last_activity_date", out var dateProp)) - { - return DateTimeOffset.FromUnixTimeSeconds(dateProp.GetInt64()); - } - } + AddUpdateIfNew(updates, item, questionId, kind, title, since); } - catch (Exception ex) + + return updates; + } + + private async Task> FetchAnswerComments( + long questionId, + string title, + DateTimeOffset since, + CancellationToken cancellationToken) + { + var answerIds = await GetAnswerIds(questionId, cancellationToken); + + if (answerIds.Length == 0) { - logger.LogError(ex, "Error StackOverflow API"); + return []; } - return null; + var ids = string.Join(';', answerIds); + + using var response = await httpClient.GetAsync( + $"answers/{ids}/comments?site=stackoverflow&pagesize=100&order=desc&sort=creation&filter=withbody", + cancellationToken); + + response.EnsureSuccessStatusCode(); + + var json = await 
response.Content.ReadFromJsonAsync(cancellationToken); + var updates = new List(); + + foreach (var item in json.GetProperty("items").EnumerateArray()) + { + AddUpdateIfNew(updates, item, questionId, "Answer comment", title, since); + } + + return updates; + } + + private async Task GetAnswerIds(long questionId, CancellationToken cancellationToken) + { + using var response = await httpClient.GetAsync( + $"questions/{questionId}/answers?site=stackoverflow&pagesize=100&order=desc&sort=creation", + cancellationToken); + + response.EnsureSuccessStatusCode(); + + var json = await response.Content.ReadFromJsonAsync(cancellationToken); + + return json.GetProperty("items") + .EnumerateArray() + .Where(item => item.TryGetProperty("answer_id", out _)) + .Select(item => item.GetProperty("answer_id").GetInt64()) + .ToArray(); + } + + private static void AddUpdateIfNew( + ICollection updates, + JsonElement item, + long questionId, + string kind, + string title, + DateTimeOffset since) + { + var createdAt = DateTimeOffset.FromUnixTimeSeconds(item.GetProperty("creation_date").GetInt64()); + + if (createdAt <= since) + { + return; + } + + updates.Add(new DetectedLinkUpdate( + Url: $"https://stackoverflow.com/questions/{questionId}", + Kind: kind, + Title: title, + UserName: GetOwnerName(item), + CreatedAt: createdAt, + Preview: PreviewBuilder.Build(GetStringOrDefault(item, "body")))); + } + + private static string GetOwnerName(JsonElement item) + { + return item.TryGetProperty("owner", out var owner) + && owner.TryGetProperty("display_name", out var displayName) + ? displayName.GetString() ?? "unknown" + : "unknown"; + } + + private static string? GetStringOrDefault(JsonElement item, string propertyName) + { + return item.TryGetProperty(propertyName, out var property) + ? 
property.GetString() + : null; + } + + private static bool TryParseQuestionId(string url, out long questionId) + { + questionId = 0; + + if (!Uri.TryCreate(url, UriKind.Absolute, out var uri)) + { + return false; + } + + var parts = uri.AbsolutePath.Split('/', StringSplitOptions.RemoveEmptyEntries); + var index = Array.IndexOf(parts, "questions"); + + return index >= 0 + && parts.Length > index + 1 + && long.TryParse(parts[index + 1], out questionId); } -} \ No newline at end of file +} diff --git a/src/LinkTracker.Scrapper/Configuration/ScrapperOptions.cs b/src/LinkTracker.Scrapper/Configuration/ScrapperOptions.cs new file mode 100644 index 0000000..6586c44 --- /dev/null +++ b/src/LinkTracker.Scrapper/Configuration/ScrapperOptions.cs @@ -0,0 +1,13 @@ +namespace LinkTracker.Scrapper.Configuration; + +public sealed class ScrapperOptions +{ + public const string SectionName = "Scrapper"; + + public int CheckIntervalSeconds { get; init; } = 30; + public int BatchSize { get; init; } = 100; + public int Parallelism { get; init; } = 4; + + public string GitHubBaseUrl { get; init; } = "https://api.github.com/"; + public string StackOverflowBaseUrl { get; init; } = "https://api.stackexchange.com/2.3/"; +} \ No newline at end of file diff --git a/src/LinkTracker.Scrapper/Jobs/LinkUpdaterJob.cs b/src/LinkTracker.Scrapper/Jobs/LinkUpdaterJob.cs index feea4c9..e707074 100644 --- a/src/LinkTracker.Scrapper/Jobs/LinkUpdaterJob.cs +++ b/src/LinkTracker.Scrapper/Jobs/LinkUpdaterJob.cs @@ -1,93 +1,18 @@ -using Quartz; -using LinkTracker.Scrapper.Repositories; -using System.Net.Http; -using LinkTracker.Scrapper.Clients; -using LinkTracker.Shared.Models; +using LinkTracker.Scrapper.Services.Updates; +using Quartz; namespace LinkTracker.Scrapper.Jobs; public class LinkUpdaterJob( - ILinkRepository repo, - IHttpClientFactory httpClientFactory, - GitHubClient github, - StackOverflowClient stackOverflow, + LinkUpdateProcessor processor, ILogger logger) : IJob { public async Task 
Execute(IJobExecutionContext context) { - var botClient = httpClientFactory.CreateClient("BotClient"); + logger.LogInformation("Link update check started"); - foreach (var (url, chatIds, lastUpdate) in repo.GetLinksForUpdate()) - { - if (chatIds.Length == 0) - { - continue; - } + await processor.ProcessAsync(context.CancellationToken); - DateTimeOffset? currentUpdateFromApi = null; - - try - { - if (url.Contains("github.com")) - { - var parts = url.TrimEnd('/').Split('/'); - if (parts.Length >= 2) - { - currentUpdateFromApi = await github.GetLastUpdate(parts[^2], parts[^1]); - } - } - else if (url.Contains("stackoverflow.com")) - { - var parts = url.Split('/'); - var questionsIndex = Array.IndexOf(parts, "questions"); - if (questionsIndex != -1 && parts.Length > questionsIndex + 1) - { - if (long.TryParse(parts[questionsIndex + 1], out var questionId)) - { - currentUpdateFromApi = await stackOverflow.GetLastUpdate(questionId); - } - } - } - - if (currentUpdateFromApi.HasValue) - { - if (currentUpdateFromApi > lastUpdate) - { - logger.LogInformation("Found a new update for {Url}. Was: {Old}, Now: {New}", url, lastUpdate, - currentUpdateFromApi); - var updateReq = new LinkUpdate( - Id: 0, - Url: url, - Description: - $"there is a new activity in repo or in question! (Date: {currentUpdateFromApi:g})", - TgChatIds: chatIds); - - var response = await botClient.PostAsJsonAsync("/updates", updateReq); - - if (response.IsSuccessStatusCode) - { - repo.UpdateLastCheckTime(url, currentUpdateFromApi.Value); - } - } - else - { - repo.UpdateLastCheckTime(url, DateTimeOffset.Now); - } - } - } - catch (Refit.ApiException ex) when (ex.StatusCode == System.Net.HttpStatusCode.NotFound) - { - logger.LogWarning("Link {Url} is dead (404). Cleaning up...", url); - foreach (var chatId in chatIds) - { - repo.RemoveLink(chatId, url); - } - } - catch (Exception ex) - { - logger.LogError(ex, "Transient error for {Url}. 
Skipping this tick.", url); - repo.UpdateLastCheckTime(url, DateTimeOffset.UtcNow); - } - } + logger.LogInformation("Link update check finished"); } } \ No newline at end of file diff --git a/src/LinkTracker.Scrapper/Program.cs b/src/LinkTracker.Scrapper/Program.cs index 17d9a0b..dc99152 100644 --- a/src/LinkTracker.Scrapper/Program.cs +++ b/src/LinkTracker.Scrapper/Program.cs @@ -1,26 +1,36 @@ -using Quartz; -using LinkTracker.Scrapper.Repositories; using LinkTracker.Scrapper.Clients; -using LinkTracker.Scrapper.Jobs; using LinkTracker.Scrapper.Configuration; using LinkTracker.Scrapper.Database; -using LinkTracker.Scrapper.Repositories.Sql; +using LinkTracker.Scrapper.Jobs; +using LinkTracker.Scrapper.Repositories; using LinkTracker.Scrapper.Repositories.Orm; +using LinkTracker.Scrapper.Repositories.Sql; +using LinkTracker.Scrapper.Services.Notifications; +using LinkTracker.Scrapper.Services.Updates; using Microsoft.EntityFrameworkCore; using Npgsql; +using Quartz; var builder = WebApplication.CreateBuilder(args); builder.Services.AddControllers(); builder.Services.AddEndpointsApiExplorer(); builder.Services.AddSwaggerGen(); + builder.Services.Configure( builder.Configuration.GetSection(DatabaseOptions.SectionName)); +builder.Services.Configure( + builder.Configuration.GetSection(ScrapperOptions.SectionName)); + var databaseOptions = builder.Configuration .GetSection(DatabaseOptions.SectionName) .Get() ?? new DatabaseOptions(); +var scrapperOptions = builder.Configuration + .GetSection(ScrapperOptions.SectionName) + .Get() ?? 
new ScrapperOptions(); + builder.Services.AddSingleton(_ => NpgsqlDataSource.Create(databaseOptions.ConnectionString)); builder.Services.AddDbContext(options => @@ -37,8 +47,22 @@ builder.Services.AddScoped(); } -builder.Services.AddHttpClient(); -builder.Services.AddHttpClient(); +builder.Services.AddHttpClient(client => +{ + client.BaseAddress = new Uri(scrapperOptions.GitHubBaseUrl); + client.DefaultRequestHeaders.UserAgent.ParseAdd("LinkTrackerBot/1.0"); +}); + +builder.Services.AddHttpClient(client => +{ + client.BaseAddress = new Uri(scrapperOptions.StackOverflowBaseUrl); +}); + +builder.Services.AddScoped(provider => + provider.GetRequiredService()); + +builder.Services.AddScoped(provider => + provider.GetRequiredService()); builder.Services.AddHttpClient("BotClient", client => { @@ -46,6 +70,9 @@ client.BaseAddress = new Uri(botUrl); }); +builder.Services.AddScoped(); +builder.Services.AddScoped(); + builder.Services.AddQuartz(q => { var jobKey = new JobKey("LinkUpdaterJob"); @@ -56,7 +83,7 @@ .ForJob(jobKey) .WithIdentity("LinkUpdaterJob-trigger") .WithSimpleSchedule(x => x - .WithIntervalInSeconds(30) + .WithIntervalInSeconds(Math.Max(1, scrapperOptions.CheckIntervalSeconds)) .RepeatForever())); }); @@ -74,4 +101,4 @@ app.MapControllers(); -app.Run(); +app.Run(); \ No newline at end of file diff --git a/src/LinkTracker.Scrapper/Services/Notifications/HttpMessageSender.cs b/src/LinkTracker.Scrapper/Services/Notifications/HttpMessageSender.cs new file mode 100644 index 0000000..281d488 --- /dev/null +++ b/src/LinkTracker.Scrapper/Services/Notifications/HttpMessageSender.cs @@ -0,0 +1,14 @@ +using LinkTracker.Shared.Models; + +namespace LinkTracker.Scrapper.Services.Notifications; + +public class HttpMessageSender(IHttpClientFactory httpClientFactory) : IMessageSender +{ + public async Task SendAsync(LinkUpdate update, CancellationToken cancellationToken) + { + var client = httpClientFactory.CreateClient("BotClient"); + + var response = await 
client.PostAsJsonAsync("/updates", update, cancellationToken); + response.EnsureSuccessStatusCode(); + } +} \ No newline at end of file diff --git a/src/LinkTracker.Scrapper/Services/Notifications/IMessageSender.cs b/src/LinkTracker.Scrapper/Services/Notifications/IMessageSender.cs new file mode 100644 index 0000000..3cbee88 --- /dev/null +++ b/src/LinkTracker.Scrapper/Services/Notifications/IMessageSender.cs @@ -0,0 +1,8 @@ +using LinkTracker.Shared.Models; + +namespace LinkTracker.Scrapper.Services.Notifications; + +public interface IMessageSender +{ + Task SendAsync(LinkUpdate update, CancellationToken cancellationToken); +} \ No newline at end of file diff --git a/src/LinkTracker.Scrapper/Services/Updates/DetectedLinkUpdate.cs b/src/LinkTracker.Scrapper/Services/Updates/DetectedLinkUpdate.cs new file mode 100644 index 0000000..a9f2180 --- /dev/null +++ b/src/LinkTracker.Scrapper/Services/Updates/DetectedLinkUpdate.cs @@ -0,0 +1,9 @@ +namespace LinkTracker.Scrapper.Services.Updates; + +public record DetectedLinkUpdate( + string Url, + string Kind, + string Title, + string UserName, + DateTimeOffset CreatedAt, + string Preview); \ No newline at end of file diff --git a/src/LinkTracker.Scrapper/Services/Updates/ILinkUpdateChecker.cs b/src/LinkTracker.Scrapper/Services/Updates/ILinkUpdateChecker.cs new file mode 100644 index 0000000..c879c07 --- /dev/null +++ b/src/LinkTracker.Scrapper/Services/Updates/ILinkUpdateChecker.cs @@ -0,0 +1,11 @@ +namespace LinkTracker.Scrapper.Services.Updates; + +public interface ILinkUpdateChecker +{ + bool CanHandle(string url); + + Task> CheckUpdatesAsync( + string url, + DateTimeOffset since, + CancellationToken cancellationToken); +} \ No newline at end of file diff --git a/src/LinkTracker.Scrapper/Services/Updates/LinkUpdateProcessor.cs b/src/LinkTracker.Scrapper/Services/Updates/LinkUpdateProcessor.cs new file mode 100644 index 0000000..8d5aac7 --- /dev/null +++ b/src/LinkTracker.Scrapper/Services/Updates/LinkUpdateProcessor.cs 
@@ -0,0 +1,186 @@
+using LinkTracker.Scrapper.Configuration;
+using LinkTracker.Scrapper.Repositories;
+using LinkTracker.Scrapper.Services.Notifications;
+using LinkTracker.Shared.Models;
+using Microsoft.Extensions.DependencyInjection;
+using Microsoft.Extensions.Options;
+
+namespace LinkTracker.Scrapper.Services.Updates;
+
+/// <summary>
+/// Pages through the tracked links, asks the first <see cref="ILinkUpdateChecker"/> that can
+/// handle each URL for updates and forwards every detected update, addressed to the link's
+/// chat ids, through <see cref="IMessageSender"/>.
+/// </summary>
+public class LinkUpdateProcessor(
+    IServiceScopeFactory scopeFactory,
+    IOptions<ScrapperOptions> options,
+    ILogger<LinkUpdateProcessor> logger)
+{
+    // The configured batch size is clamped into this range.
+    private const int MinBatchSize = 50;
+    private const int MaxBatchSize = 500;
+
+    /// <summary>
+    /// Runs one pass over the tracked links. The pass ends after the first empty or short
+    /// batch, or once cancellation has been requested.
+    /// </summary>
+    /// <param name="cancellationToken">Token that stops the pass.</param>
+    /// <exception cref="OperationCanceledException">
+    /// Cancellation was requested while a batch was being processed.
+    /// </exception>
+    public async Task ProcessAsync(CancellationToken cancellationToken)
+    {
+        var batchSize = Math.Clamp(options.Value.BatchSize, MinBatchSize, MaxBatchSize);
+        var parallelism = Math.Max(1, options.Value.Parallelism);
+
+        // NOTE(review): offset paging assumes GetLinksForUpdate keeps a stable order while
+        // UpdateLastCheckTime is written during the pass - confirm against the repository.
+        for (var offset = 0; !cancellationToken.IsCancellationRequested; offset += batchSize)
+        {
+            var batch = GetBatch(offset, batchSize);
+
+            if (batch.Length == 0)
+            {
+                break;
+            }
+
+            await ProcessBatchAsync(batch, parallelism, cancellationToken);
+
+            // A short batch means there are no further pages.
+            if (batch.Length < batchSize)
+            {
+                break;
+            }
+        }
+    }
+
+    // Loads one page inside its own DI scope; the result is materialized before the scope ends.
+    private (string Url, long[] ChatIds, DateTimeOffset LastUpdate)[] GetBatch(int offset, int batchSize)
+    {
+        using var scope = scopeFactory.CreateScope();
+        var repository = scope.ServiceProvider.GetRequiredService<ILinkRepository>();
+
+        return repository.GetLinksForUpdate(offset, batchSize).ToArray();
+    }
+
+    // Starts every link of the batch; the semaphore caps how many are checked at once.
+    private async Task ProcessBatchAsync(
+        IEnumerable<(string Url, long[] ChatIds, DateTimeOffset LastUpdate)> batch,
+        int parallelism,
+        CancellationToken cancellationToken)
+    {
+        using var semaphore = new SemaphoreSlim(parallelism);
+
+        var tasks = batch
+            .Select(link => ProcessLinkWithSemaphoreAsync(link, semaphore, cancellationToken))
+            .ToArray();
+
+        await Task.WhenAll(tasks);
+    }
+
+    // Waits for a free slot, processes the link and always gives the slot back.
+    private async Task ProcessLinkWithSemaphoreAsync(
+        (string Url, long[] ChatIds, DateTimeOffset LastUpdate) link,
+        SemaphoreSlim semaphore,
+        CancellationToken cancellationToken)
+    {
+        await semaphore.WaitAsync(cancellationToken);
+
+        try
+        {
+            await ProcessLinkAsync(link.Url, link.ChatIds, link.LastUpdate, cancellationToken);
+        }
+        finally
+        {
+            semaphore.Release();
+        }
+    }
+
+    // Checks one link. Failures are logged and reported to the link's chats instead of being
+    // rethrown, so a single bad link does not fail the rest of the batch.
+    private async Task ProcessLinkAsync(
+        string url,
+        long[] chatIds,
+        DateTimeOffset lastUpdate,
+        CancellationToken cancellationToken)
+    {
+        try
+        {
+            using var scope = scopeFactory.CreateScope();
+            var repository = scope.ServiceProvider.GetRequiredService<ILinkRepository>();
+            var checkers = scope.ServiceProvider.GetServices<ILinkUpdateChecker>();
+            var sender = scope.ServiceProvider.GetRequiredService<IMessageSender>();
+
+            var checker = checkers.FirstOrDefault(candidate => candidate.CanHandle(url));
+
+            if (checker is null)
+            {
+                logger.LogWarning("Unsupported link type: {Url}", url);
+                await SendFailureReport(sender, url, chatIds, "Unsupported link type", cancellationToken);
+                repository.UpdateLastCheckTime(url, DateTimeOffset.UtcNow);
+                return;
+            }
+
+            var updates = await checker.CheckUpdatesAsync(url, lastUpdate, cancellationToken);
+
+            foreach (var update in updates)
+            {
+                await sender.SendAsync(new LinkUpdate(
+                    Id: 0,
+                    Url: url,
+                    Description: UpdateMessageFormatter.Format(update),
+                    TgChatIds: chatIds), cancellationToken);
+            }
+
+            repository.UpdateLastCheckTime(url, DateTimeOffset.UtcNow);
+        }
+        catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
+        {
+            // The pass is being stopped. That is not a link failure, so no
+            // "Failed to check link" report may reach the chats.
+            throw;
+        }
+        catch (Exception ex)
+        {
+            logger.LogError(ex, "Failed to process link {Url}", url);
+
+            await TrySendFailureReport(url, chatIds, ex.Message, cancellationToken);
+        }
+    }
+
+    // Sends the failure report from a fresh scope; a failing send is only logged.
+    private async Task TrySendFailureReport(
+        string url,
+        long[] chatIds,
+        string reason,
+        CancellationToken cancellationToken)
+    {
+        try
+        {
+            using var scope = scopeFactory.CreateScope();
+            var sender = scope.ServiceProvider.GetRequiredService<IMessageSender>();
+
+            await SendFailureReport(sender, url, chatIds, reason, cancellationToken);
+        }
+        catch (Exception ex)
+        {
+            logger.LogError(ex, "Failed to send failure report for {Url}", url);
+        }
+    }
+
+    // Builds the failure message for the link and sends it to all of its chats.
+    private static Task SendFailureReport(
+        IMessageSender sender,
+        string url,
+        long[] chatIds,
+        string reason,
+        CancellationToken cancellationToken)
+    {
+        return sender.SendAsync(new LinkUpdate(
+            Id: 0,
+            Url: url,
+            Description: UpdateMessageFormatter.FormatFailure(url, reason),
+            TgChatIds: chatIds), cancellationToken);
+    }
+}
diff --git a/src/LinkTracker.Scrapper/Services/Updates/PreviewBuilder.cs b/src/LinkTracker.Scrapper/Services/Updates/PreviewBuilder.cs
new file mode 100644
index 0000000..e50ad2a
--- /dev/null
+++ b/src/LinkTracker.Scrapper/Services/Updates/PreviewBuilder.cs
@@ -0,0 +1,40 @@
+using System.Net;
+using System.Text.RegularExpressions;
+
+namespace LinkTracker.Scrapper.Services.Updates;
+
+/// <summary>
+/// Reduces an HTML or plain-text body to a short plain-text preview.
+/// </summary>
+public static partial class PreviewBuilder
+{
+    // Maximum preview length in UTF-16 code units.
+    private const int MaxLength = 200;
+
+    /// <summary>
+    /// Removes markup, decodes HTML entities, trims the result and cuts it to 200 characters.
+    /// </summary>
+    /// <param name="text">Raw body; <c>null</c> is treated as empty.</param>
+    /// <returns>The trimmed plain text, at most 200 characters long.</returns>
+    public static string Build(string? text)
+    {
+        // Tags are removed before entities are decoded, so escaped text such as
+        // "List&lt;int&gt;" is kept as "List<int>" instead of being stripped as a tag.
+        var withoutTags = HtmlTagRegex().Replace(text ?? string.Empty, string.Empty);
+        var plainText = WebUtility.HtmlDecode(withoutTags).Trim();
+
+        if (plainText.Length <= MaxLength)
+        {
+            return plainText;
+        }
+
+        // Never cut a surrogate pair in half.
+        var length = char.IsHighSurrogate(plainText[MaxLength - 1]) ? MaxLength - 1 : MaxLength;
+
+        return plainText[..length];
+    }
+
+    // [^>] also matches line breaks, so tags that span several lines are removed as well.
+    [GeneratedRegex("<[^>]*>")]
+    private static partial Regex HtmlTagRegex();
+}
\ No newline at end of file
diff --git a/src/LinkTracker.Scrapper/Services/Updates/UpdateMessageFormatter.cs b/src/LinkTracker.Scrapper/Services/Updates/UpdateMessageFormatter.cs
new file mode 100644
index 0000000..96c1890
--- /dev/null
+++ b/src/LinkTracker.Scrapper/Services/Updates/UpdateMessageFormatter.cs
@@ -0,0 +1,24 @@
+namespace LinkTracker.Scrapper.Services.Updates;
+
+public static class UpdateMessageFormatter
+{
+    public static string Format(DetectedLinkUpdate update)
+    {
+        return $"""
+                Type: {update.Kind}
+                Title: {update.Title}
+                User: {update.UserName}
+                Created at: {update.CreatedAt:u}
+                Preview: {update.Preview}
+                """;
+    }
+
+    public static string FormatFailure(string url, string reason)
+    {
+        return $"""
+                Failed to check link.
+ Url: {url} + Reason: {reason} + """; + } +} \ No newline at end of file diff --git a/src/LinkTracker.Scrapper/appsettings.json b/src/LinkTracker.Scrapper/appsettings.json index 63e7dc9..08b02d0 100644 --- a/src/LinkTracker.Scrapper/appsettings.json +++ b/src/LinkTracker.Scrapper/appsettings.json @@ -10,5 +10,12 @@ "ConnectionString": "Host=localhost;Port=5433;Database=linktracker;Username=linktracker;Password=linktracker", "RunMigrations": true }, + "Scrapper": { + "CheckIntervalSeconds": 30, + "BatchSize": 100, + "Parallelism": 4, + "GitHubBaseUrl": "https://api.github.com/", + "StackOverflowBaseUrl": "https://api.stackexchange.com/2.3/" + }, "AllowedHosts": "*" } \ No newline at end of file diff --git a/tests/LinkTracker.Scrapper.Tests/LinkTracker.Scrapper.Tests.csproj b/tests/LinkTracker.Scrapper.Tests/LinkTracker.Scrapper.Tests.csproj new file mode 100644 index 0000000..ed868bc --- /dev/null +++ b/tests/LinkTracker.Scrapper.Tests/LinkTracker.Scrapper.Tests.csproj @@ -0,0 +1,34 @@ + + + + net9.0 + enable + enable + false + + + + + + + + + + + + + + + + + + migrations\%(RecursiveDir)%(Filename)%(Extension) + PreserveNewest + + + + + + + + diff --git a/tests/LinkTracker.Scrapper.Tests/Postgres/MigrationTests.cs b/tests/LinkTracker.Scrapper.Tests/Postgres/MigrationTests.cs new file mode 100644 index 0000000..4c7eb5a --- /dev/null +++ b/tests/LinkTracker.Scrapper.Tests/Postgres/MigrationTests.cs @@ -0,0 +1,34 @@ +using Npgsql; + +namespace LinkTracker.Scrapper.Tests.Postgres; + +[Collection(PostgresCollection.Name)] +public class MigrationTests(PostgresFixture fixture) +{ + [Fact] + public async Task Migrations_CreateExpectedTables() + { + await using var connection = await fixture.DataSource.OpenConnectionAsync(); + await using var command = new NpgsqlCommand(""" + SELECT table_name + FROM information_schema.tables + WHERE table_schema = 'public' + ORDER BY table_name; + """, connection); + + await using var reader = await command.ExecuteReaderAsync(); + var 
tables = new List<string>();
+
+        while (await reader.ReadAsync())
+        {
+            tables.Add(reader.GetString(0));
+        }
+
+        Assert.Contains("chats", tables);
+        Assert.Contains("links", tables);
+        Assert.Contains("chat_links", tables);
+        Assert.Contains("tags", tables);
+        Assert.Contains("chat_link_tags", tables);
+        Assert.Contains("schemaversions", tables);
+    }
+}
diff --git a/tests/LinkTracker.Scrapper.Tests/Postgres/PostgresCollection.cs b/tests/LinkTracker.Scrapper.Tests/Postgres/PostgresCollection.cs
new file mode 100644
index 0000000..9239c7b
--- /dev/null
+++ b/tests/LinkTracker.Scrapper.Tests/Postgres/PostgresCollection.cs
@@ -0,0 +1,7 @@
+namespace LinkTracker.Scrapper.Tests.Postgres;
+
+[CollectionDefinition(Name)]
+public sealed class PostgresCollection : ICollectionFixture<PostgresFixture>
+{
+    public const string Name = "postgres";
+}
diff --git a/tests/LinkTracker.Scrapper.Tests/Postgres/PostgresFixture.cs b/tests/LinkTracker.Scrapper.Tests/Postgres/PostgresFixture.cs
new file mode 100644
index 0000000..d557289
--- /dev/null
+++ b/tests/LinkTracker.Scrapper.Tests/Postgres/PostgresFixture.cs
@@ -0,0 +1,92 @@
+using DbUp;
+using LinkTracker.Scrapper.Database;
+using Microsoft.EntityFrameworkCore;
+using Npgsql;
+using Testcontainers.PostgreSql;
+
+namespace LinkTracker.Scrapper.Tests.Postgres;
+
+/// <summary>
+/// Starts a PostgreSQL container, applies the migration scripts to it and exposes the
+/// connection details to the tests.
+/// </summary>
+public sealed class PostgresFixture : IAsyncLifetime
+{
+    private readonly PostgreSqlContainer _container = new PostgreSqlBuilder()
+        .WithImage("postgres:16")
+        .WithDatabase("linktracker_tests")
+        .WithUsername("linktracker")
+        .WithPassword("linktracker")
+        .Build();
+
+    public string ConnectionString => _container.GetConnectionString();
+
+    // Assigned in InitializeAsync; stays null when the container failed to start.
+    public NpgsqlDataSource DataSource { get; private set; } = null!;
+
+    /// <summary>Starts the container, creates the data source and runs the migrations.</summary>
+    public async Task InitializeAsync()
+    {
+        await _container.StartAsync();
+        DataSource = NpgsqlDataSource.Create(ConnectionString);
+        RunMigrations();
+    }
+
+    /// <summary>Releases the data source (when one was created) and the container.</summary>
+    public async Task DisposeAsync()
+    {
+        // InitializeAsync may have failed before DataSource was assigned; disposing must not
+        // throw a NullReferenceException on top of the original error.
+        if (DataSource is not null)
+        {
+            await DataSource.DisposeAsync();
+        }
+
+        await _container.DisposeAsync();
+    }
+
+    /// <summary>Creates a new EF Core context that uses the fixture's connection string.</summary>
+    public LinkTrackerDbContext CreateDbContext()
+    {
+        var options = new DbContextOptionsBuilder<LinkTrackerDbContext>()
+            .UseNpgsql(ConnectionString)
+            .Options;
+
+        return new LinkTrackerDbContext(options);
+    }
+
+    /// <summary>Truncates the chat, link and tag tables and restarts their identities.</summary>
+    public async Task ResetDatabaseAsync()
+    {
+        await using var connection = await DataSource.OpenConnectionAsync();
+        await using var command = new NpgsqlCommand("""
+            TRUNCATE TABLE
+                chat_link_tags,
+                chat_links,
+                tags,
+                links,
+                chats
+            RESTART IDENTITY CASCADE;
+            """, connection);
+
+        await command.ExecuteNonQueryAsync();
+    }
+
+    // Applies the scripts found in the "migrations" folder under the application base directory.
+    private void RunMigrations()
+    {
+        var migrationsPath = Path.Combine(AppContext.BaseDirectory, "migrations");
+
+        var result = DeployChanges.To
+            .PostgresqlDatabase(ConnectionString)
+            .WithScriptsFromFileSystem(migrationsPath)
+            .LogToConsole()
+            .Build()
+            .PerformUpgrade();
+
+        if (!result.Successful)
+        {
+            throw new InvalidOperationException("Failed to run test migrations.", result.Error);
+        }
+    }
+}
diff --git a/tests/LinkTracker.Scrapper.Tests/Postgres/RepositoryTests.cs b/tests/LinkTracker.Scrapper.Tests/Postgres/RepositoryTests.cs
new file mode 100644
index 0000000..33893ec
--- /dev/null
+++ b/tests/LinkTracker.Scrapper.Tests/Postgres/RepositoryTests.cs
@@ -0,0 +1,112 @@
+using LinkTracker.Scrapper.Database;
+using LinkTracker.Scrapper.Repositories;
+using LinkTracker.Scrapper.Repositories.Orm;
+using LinkTracker.Scrapper.Repositories.Sql;
+
+namespace LinkTracker.Scrapper.Tests.Postgres;
+
+[Collection(PostgresCollection.Name)]
+public class RepositoryTests(PostgresFixture fixture)
+{
+    [Theory]
+    [InlineData("SQL")]
+    [InlineData("ORM")]
+    public async Task ChatCrud_Works(string accessType)
+    {
+        await fixture.ResetDatabaseAsync();
+
+        using var context = CreateRepositoryContext(accessType);
+
+        context.Links.AddChat(101);
+
+        Assert.True(context.Links.ChatExists(101));
+
+        context.Links.RemoveChat(101);
+
+        Assert.False(context.Links.ChatExists(101));
+    }
+
+    [Theory]
+    [InlineData("SQL")]
+    [InlineData("ORM")]
+    public async Task
LinkSubscriptionCrud_Works(string accessType) + { + await fixture.ResetDatabaseAsync(); + + using var context = CreateRepositoryContext(accessType); + var url = $"https://github.com/dotnet/runtime-{Guid.NewGuid():N}"; + + context.Links.AddChat(202); + + var added = context.Links.AddLink(202, url, ["dotnet", "github"]); + var duplicate = context.Links.AddLink(202, url, ["dotnet"]); + var allLinks = context.Links.GetLinks(202).ToArray(); + var dotnetLinks = context.Links.GetLinks(202, "dotnet").ToArray(); + var missingTagLinks = context.Links.GetLinks(202, "missing").ToArray(); + var updateBatch = context.Links.GetLinksForUpdate().ToArray(); + var removed = context.Links.RemoveLink(202, url); + + Assert.NotNull(added); + Assert.Null(duplicate); + Assert.Single(allLinks); + Assert.Single(dotnetLinks); + Assert.Empty(missingTagLinks); + Assert.Contains(updateBatch, item => item.Url == url && item.ChatIds.Contains(202)); + Assert.True(removed); + Assert.Empty(context.Links.GetLinks(202)); + } + + [Theory] + [InlineData("SQL")] + [InlineData("ORM")] + public async Task TagCrud_Works(string accessType) + { + await fixture.ResetDatabaseAsync(); + + using var context = CreateRepositoryContext(accessType); + + var created = context.Tags.Create("backend"); + var duplicate = context.Tags.Create("backend"); + var all = context.Tags.GetAll().ToArray(); + var updated = context.Tags.Update(created.Id, "backend-updated"); + var deleted = context.Tags.Delete(created.Id); + + Assert.Equal(created.Id, duplicate.Id); + Assert.Single(all); + Assert.Equal("backend-updated", updated?.Name); + Assert.True(deleted); + Assert.Null(context.Tags.Get(created.Id)); + } + + private RepositoryContext CreateRepositoryContext(string accessType) + { + if (accessType.Equals("ORM", StringComparison.OrdinalIgnoreCase)) + { + var dbContext = fixture.CreateDbContext(); + + return new RepositoryContext( + new OrmLinkRepository(dbContext), + new OrmTagRepository(dbContext), + dbContext); + } + + return new 
RepositoryContext( + new SqlLinkRepository(fixture.DataSource), + new SqlTagRepository(fixture.DataSource)); + } + + private sealed class RepositoryContext( + ILinkRepository links, + ITagRepository tags, + LinkTrackerDbContext? dbContext = null) : IDisposable + { + public ILinkRepository Links { get; } = links; + + public ITagRepository Tags { get; } = tags; + + public void Dispose() + { + dbContext?.Dispose(); + } + } +} diff --git a/tests/LinkTracker.Scrapper.Tests/Updates/GitHubClientTests.cs b/tests/LinkTracker.Scrapper.Tests/Updates/GitHubClientTests.cs new file mode 100644 index 0000000..a678e75 --- /dev/null +++ b/tests/LinkTracker.Scrapper.Tests/Updates/GitHubClientTests.cs @@ -0,0 +1,83 @@ +using LinkTracker.Scrapper.Clients; +using WireMock.RequestBuilders; +using WireMock.ResponseBuilders; +using WireMock.Server; + +namespace LinkTracker.Scrapper.Tests.Updates; + +public class GitHubClientTests : IDisposable +{ + private readonly WireMockServer _server = WireMockServer.Start(); + + [Fact] + public async Task CheckUpdatesAsync_ReturnsNewIssueAndPullRequestWithRequiredFields() + { + _server + .Given(Request.Create().WithPath("/repos/octo/demo/issues").UsingGet()) + .RespondWith(Response.Create() + .WithStatusCode(200) + .WithHeader("Content-Type", "application/json") + .WithBody($$""" + [ + { + "html_url": "https://github.com/octo/demo/issues/1", + "title": "New issue", + "created_at": "2026-01-02T12:00:00Z", + "body": "{{new string('i', 250)}}", + "user": { "login": "issue-author" } + }, + { + "html_url": "https://github.com/octo/demo/pull/2", + "title": "PR hidden in issues endpoint", + "created_at": "2026-01-02T13:00:00Z", + "body": "skip", + "user": { "login": "pull-author" }, + "pull_request": {} + } + ] + """)); + + _server + .Given(Request.Create().WithPath("/repos/octo/demo/pulls").UsingGet()) + .RespondWith(Response.Create() + .WithStatusCode(200) + .WithHeader("Content-Type", "application/json") + .WithBody(""" + [ + { + "html_url": 
"https://github.com/octo/demo/pull/2", + "title": "New PR", + "created_at": "2026-01-03T12:00:00Z", + "body": "Pull request description", + "user": { "login": "pull-author" } + } + ] + """)); + + using var httpClient = new HttpClient { BaseAddress = new Uri(_server.Url!) }; + var client = new GitHubClient(httpClient); + + var updates = await client.CheckUpdatesAsync( + "https://github.com/octo/demo", + DateTimeOffset.Parse("2026-01-01T00:00:00Z"), + CancellationToken.None); + + Assert.Equal(2, updates.Count); + + var issue = Assert.Single(updates, update => update.Kind == "Issue"); + Assert.Equal("New issue", issue.Title); + Assert.Equal("issue-author", issue.UserName); + Assert.Equal(200, issue.Preview.Length); + + var pullRequest = Assert.Single(updates, update => update.Kind == "Pull request"); + Assert.Equal("New PR", pullRequest.Title); + Assert.Equal("pull-author", pullRequest.UserName); + Assert.Contains("Pull request description", pullRequest.Preview); + } + + public void Dispose() + { + _server.Stop(); + _server.Dispose(); + } +} diff --git a/tests/LinkTracker.Scrapper.Tests/Updates/LinkUpdateProcessorTests.cs b/tests/LinkTracker.Scrapper.Tests/Updates/LinkUpdateProcessorTests.cs new file mode 100644 index 0000000..6245564 --- /dev/null +++ b/tests/LinkTracker.Scrapper.Tests/Updates/LinkUpdateProcessorTests.cs @@ -0,0 +1,199 @@ +using LinkTracker.Scrapper.Configuration; +using LinkTracker.Scrapper.Repositories; +using LinkTracker.Scrapper.Services.Notifications; +using LinkTracker.Scrapper.Services.Updates; +using LinkTracker.Shared.Models; +using System.Collections.Concurrent; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging.Abstractions; +using Microsoft.Extensions.Options; + +namespace LinkTracker.Scrapper.Tests.Updates; + +public class LinkUpdateProcessorTests +{ + [Fact] + public async Task ProcessAsync_UsesConfiguredBatchAndProcessesLinksInParallel() + { + var links = new[] + { + 
Link("https://github.com/octo/one"), + Link("https://github.com/octo/two"), + Link("https://github.com/octo/three") + }; + var repository = new FakeLinkRepository(links); + var checker = new FakeChecker(delay: TimeSpan.FromMilliseconds(50)); + var sender = new CapturingMessageSender(); + var processor = CreateProcessor(repository, checker, sender, batchSize: 10, parallelism: 2); + + await processor.ProcessAsync(CancellationToken.None); + + Assert.Equal((0, 50), repository.BatchRequests.Single()); + Assert.Equal(3, sender.Sent.Count); + Assert.Equal(3, repository.UpdatedUrls.Count); + Assert.InRange(checker.MaxConcurrentCalls, 2, 2); + } + + [Fact] + public async Task ProcessAsync_IsolatesLinkFailuresAndSendsFailureReport() + { + var links = new[] + { + Link("https://github.com/octo/good"), + Link("https://github.com/octo/bad") + }; + var repository = new FakeLinkRepository(links); + var checker = new FakeChecker(url => url.Contains("/bad", StringComparison.OrdinalIgnoreCase)); + var sender = new CapturingMessageSender(); + var processor = CreateProcessor(repository, checker, sender, batchSize: 50, parallelism: 2); + + await processor.ProcessAsync(CancellationToken.None); + + Assert.Contains(sender.Sent, update => update.Url.EndsWith("/good") && update.Description.Contains("Title:")); + Assert.Contains(sender.Sent, update => update.Url.EndsWith("/bad") && update.Description.Contains("Failed to check link.")); + Assert.Contains("https://github.com/octo/good", repository.UpdatedUrls); + Assert.DoesNotContain("https://github.com/octo/bad", repository.UpdatedUrls); + } + + private static LinkUpdateProcessor CreateProcessor( + FakeLinkRepository repository, + FakeChecker checker, + CapturingMessageSender sender, + int batchSize, + int parallelism) + { + var services = new ServiceCollection(); + services.AddSingleton(repository); + services.AddSingleton(checker); + services.AddSingleton(sender); + + var provider = services.BuildServiceProvider(); + + return new 
LinkUpdateProcessor( + provider.GetRequiredService(), + Options.Create(new ScrapperOptions + { + BatchSize = batchSize, + Parallelism = parallelism + }), + NullLogger.Instance); + } + + private static (string Url, long[] ChatIds, DateTimeOffset LastUpdate) Link(string url) + { + return (url, [1001], DateTimeOffset.Parse("2026-01-01T00:00:00Z")); + } + + private sealed class FakeLinkRepository( + IReadOnlyList<(string Url, long[] ChatIds, DateTimeOffset LastUpdate)> links) : ILinkRepository + { + public List<(int Offset, int Limit)> BatchRequests { get; } = []; + + public ConcurrentBag UpdatedUrls { get; } = []; + + public void AddChat(long chatId) => throw new NotSupportedException(); + + public void RemoveChat(long chatId) => throw new NotSupportedException(); + + public bool ChatExists(long chatId) => throw new NotSupportedException(); + + public LinkResponse? AddLink(long chatId, string url, string[]? tags) => throw new NotSupportedException(); + + public bool RemoveLink(long chatId, string url) => throw new NotSupportedException(); + + public IEnumerable GetLinks(long chatId, string? 
tag = null, int offset = 0, int limit = 100) => + throw new NotSupportedException(); + + public IEnumerable<(string Url, long[] ChatIds, DateTimeOffset LastUpdate)> GetLinksForUpdate( + int offset = 0, + int limit = 100) + { + BatchRequests.Add((offset, limit)); + + return links + .Skip(offset) + .Take(limit) + .ToArray(); + } + + public void UpdateLastCheckTime(string url, DateTimeOffset lastUpdate) + { + UpdatedUrls.Add(url); + } + } + + private sealed class FakeChecker : ILinkUpdateChecker + { + private readonly Func _shouldFail; + private readonly TimeSpan _delay; + private int _currentCalls; + + public FakeChecker(TimeSpan delay) + : this(_ => false, delay) + { + } + + public FakeChecker(Func shouldFail) + : this(shouldFail, TimeSpan.Zero) + { + } + + private FakeChecker(Func shouldFail, TimeSpan delay) + { + _shouldFail = shouldFail; + _delay = delay; + } + + public int MaxConcurrentCalls { get; private set; } + + public bool CanHandle(string url) => true; + + public async Task> CheckUpdatesAsync( + string url, + DateTimeOffset since, + CancellationToken cancellationToken) + { + var current = Interlocked.Increment(ref _currentCalls); + MaxConcurrentCalls = Math.Max(MaxConcurrentCalls, current); + + try + { + if (_delay > TimeSpan.Zero) + { + await Task.Delay(_delay, cancellationToken); + } + + if (_shouldFail(url)) + { + throw new HttpRequestException("External API unavailable"); + } + + return + [ + new DetectedLinkUpdate( + Url: url, + Kind: "Issue", + Title: "Updated link", + UserName: "user", + CreatedAt: DateTimeOffset.UtcNow, + Preview: "preview") + ]; + } + finally + { + Interlocked.Decrement(ref _currentCalls); + } + } + } + + private sealed class CapturingMessageSender : IMessageSender + { + public ConcurrentBag Sent { get; } = []; + + public Task SendAsync(LinkUpdate update, CancellationToken cancellationToken) + { + Sent.Add(update); + return Task.CompletedTask; + } + } +} diff --git a/tests/LinkTracker.Scrapper.Tests/Updates/PreviewBuilderTests.cs 
b/tests/LinkTracker.Scrapper.Tests/Updates/PreviewBuilderTests.cs new file mode 100644 index 0000000..bb5562d --- /dev/null +++ b/tests/LinkTracker.Scrapper.Tests/Updates/PreviewBuilderTests.cs @@ -0,0 +1,18 @@ +using LinkTracker.Scrapper.Services.Updates; + +namespace LinkTracker.Scrapper.Tests.Updates; + +public class PreviewBuilderTests +{ + [Fact] + public void Build_StripsHtmlDecodesEntitiesAndLimitsTo200Characters() + { + var text = $"

{new string('a', 210)}&

"; + + var preview = PreviewBuilder.Build(text); + + Assert.Equal(200, preview.Length); + Assert.DoesNotContain("

", preview); + Assert.DoesNotContain("&", preview); + } +} diff --git a/tests/LinkTracker.Scrapper.Tests/Updates/StackOverflowClientTests.cs b/tests/LinkTracker.Scrapper.Tests/Updates/StackOverflowClientTests.cs new file mode 100644 index 0000000..6ad62ed --- /dev/null +++ b/tests/LinkTracker.Scrapper.Tests/Updates/StackOverflowClientTests.cs @@ -0,0 +1,100 @@ +using LinkTracker.Scrapper.Clients; +using WireMock.RequestBuilders; +using WireMock.ResponseBuilders; +using WireMock.Server; + +namespace LinkTracker.Scrapper.Tests.Updates; + +public class StackOverflowClientTests : IDisposable +{ + private readonly WireMockServer _server = WireMockServer.Start(); + + [Fact] + public async Task CheckUpdatesAsync_ReturnsAnswersQuestionCommentsAndAnswerComments() + { + _server + .Given(Request.Create().WithPath("/questions/123").UsingGet()) + .RespondWith(Response.Create() + .WithStatusCode(200) + .WithHeader("Content-Type", "application/json") + .WithBody(""" + { + "items": [ + { "title": "How to test StackOverflow updates?" } + ] + } + """)); + + _server + .Given(Request.Create().WithPath("/questions/123/answers").UsingGet()) + .RespondWith(Response.Create() + .WithStatusCode(200) + .WithHeader("Content-Type", "application/json") + .WithBody(""" + { + "items": [ + { + "answer_id": 777, + "creation_date": 1767268800, + "body": "

Answer body

", + "owner": { "display_name": "answer-user" } + } + ] + } + """)); + + _server + .Given(Request.Create().WithPath("/questions/123/comments").UsingGet()) + .RespondWith(Response.Create() + .WithStatusCode(200) + .WithHeader("Content-Type", "application/json") + .WithBody(""" + { + "items": [ + { + "creation_date": 1767272400, + "body": "

Question comment

", + "owner": { "display_name": "question-comment-user" } + } + ] + } + """)); + + _server + .Given(Request.Create().WithPath("/answers/777/comments").UsingGet()) + .RespondWith(Response.Create() + .WithStatusCode(200) + .WithHeader("Content-Type", "application/json") + .WithBody(""" + { + "items": [ + { + "creation_date": 1767276000, + "body": "

Answer comment

", + "owner": { "display_name": "answer-comment-user" } + } + ] + } + """)); + + using var httpClient = new HttpClient { BaseAddress = new Uri(_server.Url!) }; + var client = new StackOverflowClient(httpClient); + + var updates = await client.CheckUpdatesAsync( + "https://stackoverflow.com/questions/123/how-to-test", + DateTimeOffset.FromUnixTimeSeconds(1767260000), + CancellationToken.None); + + Assert.Equal(3, updates.Count); + Assert.Contains(updates, update => update.Kind == "Answer" && update.UserName == "answer-user"); + Assert.Contains(updates, update => update.Kind == "Question comment" && update.Preview == "Question comment"); + Assert.Contains(updates, update => update.Kind == "Answer comment" && update.UserName == "answer-comment-user"); + Assert.All(updates, update => Assert.Equal("How to test StackOverflow updates?", update.Title)); + } + + public void Dispose() + { + _server.Stop(); + _server.Dispose(); + } +}