name: Daily Go Scraper

on:
  schedule:
    - cron: '30 4 * * *' # 4:30 AM UTC, runs shortly after the Python version
  workflow_dispatch: # Allow manual runs

# The last step pushes a commit, so the GITHUB_TOKEN needs write access to
# repository contents (the default token may be read-only).
permissions:
  contents: write

# Never let two runs of this workflow race each other to push.
concurrency:
  group: daily-scraper-go
  cancel-in-progress: false

jobs:
  scrape:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version: 'stable'
          cache-dependency-path: scraper_go/go.sum

      - name: Run Scraper
        env:
          TMDB_API_KEY: ${{ secrets.TMDB_API_KEY }}
        # NOTE(review): the scraper runs with scraper_go/ as its working
        # directory, while the commit step below stages
        # frontend/public/data_go.json relative to the repository root.
        # Confirm the exporter resolves its output path to the repository root.
        run: |
          cd scraper_go
          # Build the whole package rather than a single file so additional
          # files in cmd/scraper are always compiled in.
          go run ./cmd/scraper

      - name: Commit and push changes
        run: |
          git config --global user.name "github-actions[bot]"
          git config --global user.email "github-actions[bot]@users.noreply.github.com"
          git add frontend/public/data_go.json
          # Only the staged file matters: skip commit and push when it is unchanged.
          if git diff --staged --quiet; then
            echo "No showtime changes to commit."
            exit 0
          fi
          git commit -m "chore: daily showtime update (Go)"
          # Another workflow may have pushed since checkout; replay our commit
          # on top of the remote branch instead of failing the push.
          git pull --rebase --autostash
          git push
**Data Validation & Modeling:** Pydantic (Python) / Custom Go schemas with validation +- **String Matching & Normalization:** RapidFuzz (Python) / Custom slug-matching & GenerateSlug (Go) +- **Testing:** Pytest (Python) / Go testing toolchain (Go) # CI/CD & Deployment - **Automation:** GitHub Actions (daily scraper runs, formatting checks, deployment) diff --git a/conductor/tracks.md b/conductor/tracks.md index 16f46c1..cb238de 100644 --- a/conductor/tracks.md +++ b/conductor/tracks.md @@ -11,3 +11,8 @@ This file tracks all major tracks for the project. Each track has its own detail - [x] **Track: UX revamp when user clicks cinema map points and typeaheads for search bar** *Link: [./tracks/map_search_ux_20260619/](./tracks/map_search_ux_20260619/)* + +--- + +- [x] **Track: Implement the kinobok scraper in Golang using Colly with Goroutines/Channels for concurrency, running in parallel with the Python scraper.** +*Link: [./tracks/golang_scraper_20260702/](./tracks/golang_scraper_20260702/)* diff --git a/conductor/tracks/golang_scraper_20260702/index.md b/conductor/tracks/golang_scraper_20260702/index.md new file mode 100644 index 0000000..75d6de7 --- /dev/null +++ b/conductor/tracks/golang_scraper_20260702/index.md @@ -0,0 +1,5 @@ +# Track golang_scraper_20260702 Context + +- [Specification](./spec.md) +- [Implementation Plan](./plan.md) +- [Metadata](./metadata.json) \ No newline at end of file diff --git a/conductor/tracks/golang_scraper_20260702/metadata.json b/conductor/tracks/golang_scraper_20260702/metadata.json new file mode 100644 index 0000000..4d883c4 --- /dev/null +++ b/conductor/tracks/golang_scraper_20260702/metadata.json @@ -0,0 +1,8 @@ +{ + "track_id": "golang_scraper_20260702", + "type": "feature", + "status": "new", + "created_at": "2026-07-02T00:00:00Z", + "updated_at": "2026-07-02T00:00:00Z", + "description": "Implement the kinobok scraper in Golang using Colly with Goroutines/Channels for concurrency, running in parallel with the Python scraper." 
+} \ No newline at end of file diff --git a/conductor/tracks/golang_scraper_20260702/plan.md b/conductor/tracks/golang_scraper_20260702/plan.md new file mode 100644 index 0000000..d9a6d27 --- /dev/null +++ b/conductor/tracks/golang_scraper_20260702/plan.md @@ -0,0 +1,34 @@ +# Implementation Plan: Golang Scraper + +## Phase 1: Filmweb Scraper (Concurrent) +- [x] Task: Filmweb Models and Colly Setup + - [x] Write Tests (Red Phase): Define mock server responses and test basic Colly initialization. + - [x] Implement (Green Phase): Configure Colly collector and set up Goroutine/Channel architecture. +- [x] Task: Filmweb Parsing Logic + - [x] Write Tests (Red Phase): Test parsing logic for extracting titles, cinemas, and showtimes from mock HTML. + - [x] Implement (Green Phase): Implement Colly callbacks, parse HTML, and feed results through channels. +- [x] Task: Conductor - User Manual Verification 'Phase 1: Filmweb Scraper (Concurrent)' (Protocol in workflow.md) + +## Phase 2: Letterboxd and TMDB Integrations +- [x] Task: TMDB API Integration + - [x] Write Tests (Red Phase): Test concurrent fetching of metadata and posters using a mock HTTP client. + - [x] Implement (Green Phase): Write concurrent HTTP requests to TMDB and merge with movie data. +- [x] Task: Letterboxd Integration + - [x] Write Tests (Red Phase): Test extraction/parsing of Letterboxd watchlists. + - [x] Implement (Green Phase): Build Letterboxd scraping/parsing logic. +- [x] Task: Conductor - User Manual Verification 'Phase 2: Letterboxd and TMDB Integrations' (Protocol in workflow.md) + +## Phase 3: Data Aggregation & Export +- [x] Task: Orchestration in Main + - [x] Write Tests (Red Phase): Test the synchronization and merging of data from Filmweb, TMDB, and Letterboxd. + - [x] Implement (Green Phase): Coordinate channels and Goroutines in the main entrypoint (`cmd/scraper/main.go`). 
+- [x] Task: Strict Parity JSON Export + - [x] Write Tests (Red Phase): Assert that the generated `data_go.json` strictly adheres to the existing Next.js frontend schema. + - [x] Implement (Green Phase): Write the final JSON export logic in `internal/export`. +- [x] Task: Conductor - User Manual Verification 'Phase 3: Data Aggregation & Export' (Protocol in workflow.md) + +## Phase 4: CI/CD Integration +- [x] Task: GitHub Actions Updates + - [x] Write Tests (Red Phase): (Skip logic tests, test via dry-run or local action simulator if possible). + - [x] Implement (Green Phase): Configure `daily-scraper-go.yml` to execute the Go scraper concurrently with Python and upload `data_go.json` as an artifact or commit it. +- [x] Task: Conductor - User Manual Verification 'Phase 4: CI/CD Integration' (Protocol in workflow.md) \ No newline at end of file diff --git a/conductor/tracks/golang_scraper_20260702/spec.md b/conductor/tracks/golang_scraper_20260702/spec.md new file mode 100644 index 0000000..e7618ae --- /dev/null +++ b/conductor/tracks/golang_scraper_20260702/spec.md @@ -0,0 +1,29 @@ +# Specification: Golang Scraper Implementation + +## Overview +This track focuses on implementing the backend scraper for kinobok in Golang using the Colly framework, intended to eventually replace the existing Python scraper. The initial deployment will run in parallel with the Python scraper to ensure data parity before a full transition. + +## Functional Requirements +1. **Filmweb Scraper:** Implement the logic to scrape movie showtimes and cinema details from Filmweb using the Colly framework. +2. **Letterboxd Scraper:** Implement the logic to process Letterboxd watchlists/data. +3. **TMDB Scraper:** Implement integration with the TMDB API/scraper to fetch posters and metadata for movies. +4. **Data Export:** Generate the final JSON file (`data_go.json`) containing all parsed and matched data. + +## Non-Functional Requirements +1. 
**Strict Parity:** The exported JSON file (`data_go.json`) MUST perfectly match the schema of the current Python scraper's `data.json` to ensure Next.js frontend compatibility. +2. **Parallel Execution:** The new Golang scraper must be integrated into the existing CI/CD (GitHub Actions) to run alongside the Python scraper, outputting to a separate file (`data_go.json`) without breaking the current production build. +3. **Language/Framework:** Use Golang and the Colly web scraping framework as defined in the `scraper_go` directory. +4. **Concurrency:** Heavily utilize Goroutines and Channels within the scraping process to maximize throughput and enhance the overall execution speed. + +## Acceptance Criteria +- [ ] `FilmwebScraper` correctly parses cinemas, times, and movie titles utilizing concurrent processing. +- [ ] `Letterboxd` integration correctly extracts watchlist data. +- [ ] `TMDB` integration accurately fetches required movie metadata concurrently. +- [ ] `export` package correctly generates `data_go.json` with strict schema parity. +- [ ] Concurrency patterns (Goroutines/Channels) are demonstrably used for performance. +- [ ] GitHub Actions workflow is updated to run the Golang scraper and output `data_go.json` alongside `data.json`. +- [ ] The Next.js frontend can flawlessly consume `data_go.json` if swapped (to be tested locally or manually). + +## Out of Scope +- Modifying the frontend application code to permanently switch to `data_go.json`. +- Removing or disabling the Python scraper. 
// stringSlice is a flag.Value that collects every occurrence of a repeatable
// string flag, in the order given on the command line.
type stringSlice []string

// String renders the collected values as a comma-separated list.
func (ss *stringSlice) String() string {
	values := []string(*ss)
	return strings.Join(values, ",")
}

// Set appends one more occurrence of the flag; it never fails.
func (ss *stringSlice) Set(value string) error {
	updated := append(*ss, value)
	*ss = updated
	return nil
}

// intSlice is a flag.Value that collects every occurrence of a repeatable
// integer flag, in the order given on the command line.
type intSlice []int

// String renders the collected integers as a comma-separated list.
func (is *intSlice) String() string {
	var parts []string
	for idx := 0; idx < len(*is); idx++ {
		parts = append(parts, strconv.Itoa((*is)[idx]))
	}
	return strings.Join(parts, ",")
}

// Set parses value as a base-10 integer and appends it. A value that does not
// parse is rejected with the strconv error and the slice is left untouched.
func (is *intSlice) Set(value string) error {
	parsed, err := strconv.Atoi(value)
	if err == nil {
		*is = append(*is, parsed)
	}
	return err
}
filmwebScraper := filmweb.NewFilmwebScraper() + letterboxdScraper := letterboxd.NewLetterboxdScraper() + tmdbApi := tmdb.NewTMDBApi(tmdbAPIKey) + + outputPath := filepath.Join("frontend", "public", "data_go.json") + + // Load existing data to support sliding window and ID consistency + existingData := export.ExportSchema{ + Showtimes: make(map[string][]export.ShowtimeModel), + } + if _, err := os.Stat(outputPath); err == nil { + data, err := os.ReadFile(outputPath) + if err == nil { + if err := json.Unmarshal(data, &existingData); err != nil { + log.Printf("⚠️ Could not load existing data: %v. Starting fresh.\n", err) + } + } + } + + today := time.Now().Format("2006-01-02") + weekday := time.Now().Weekday() + isRefreshDay := weekday == time.Wednesday || weekday == time.Thursday || weekday == time.Friday + + var daysToScrape []int + var finalShowtimes = make(map[string][]export.ShowtimeModel) + + wereShowtimesPopulated := len(existingData.Showtimes) >= 7 + + if len(days) > 0 { + log.Printf("📅 Scraping user-specified days offsets: %v...\n", days) + daysToScrape = days + } else if isRefreshDay || !wereShowtimesPopulated { + log.Println("📅 Full refresh day (Wednesday, Thursday, Friday or first run). Scraping 7 days...") + for i := 0; i < 7; i++ { + daysToScrape = append(daysToScrape, i) + } + } else { + log.Println("📅 Incremental update day. 
Scraping day offset 6...") + daysToScrape = []int{6} + // Keep current and future days + for dateStr, stList := range existingData.Showtimes { + if dateStr >= today { + finalShowtimes[dateStr] = stList + } + } + } + + moviesMap := make(map[string]export.MovieModel) + for _, m := range existingData.Movies { + moviesMap[m.BoxdURI] = m + } + + cinemasMap := make(map[string]export.CinemaModel) + for _, c := range existingData.Cinemas { + cinemasMap[c.Name] = c + } + + // Helper to find starting ID counters + getMaxID := func(items []string, prefix string) int { + maxVal := 0 + for _, id := range items { + if strings.HasPrefix(id, prefix) { + if val, err := strconv.Atoi(id[len(prefix):]); err == nil && val > maxVal { + maxVal = val + } + } + } + return maxVal + } + + var movieIDs []string + for _, m := range moviesMap { + movieIDs = append(movieIDs, m.ID) + } + var cinemaIDs []string + for _, c := range cinemasMap { + cinemaIDs = append(cinemaIDs, c.ID) + } + + movieIDCounter := getMaxID(movieIDs, "m") + 1 + cinemaIDCounter := getMaxID(cinemaIDs, "c") + 1 + + var movieMutex sync.Mutex + var cinemaMutex sync.Mutex + var failures []export.FailureModel + var failuresMutex sync.Mutex + + // Cache to avoid double fetching the same movie metadata during parallel runs + type movieResolution struct { + boxdURI string + movie export.MovieModel + err error + } + resolvedCache := make(map[string]*movieResolution) + var cacheMutex sync.Mutex + + // We'll scrape days sequentially, but inside each day scrape we process cities concurrently, + // and fetch TMDB/Letterboxd concurrently using a worker pool. + for _, dayOffset := range daysToScrape { + log.Printf("📡 Scraping day offset %d...\n", dayOffset) + + // 1. 
Scrape all cities concurrently for this day offset + type cityResult struct { + city string + result *filmweb.FilmwebResult + err error + } + cityChan := make(chan cityResult, len(cities)) + var wg sync.WaitGroup + + for _, city := range cities { + wg.Add(1) + go func(c string) { + defer wg.Done() + res, err := filmwebScraper.Scrape(c, dayOffset, *limitMovies) + cityChan <- cityResult{city: c, result: res, err: err} + }(city) + } + + wg.Wait() + close(cityChan) + + // Collect results from city scraping + var pageDate string + var rawMoviesByCity = make(map[string][]*filmweb.FilmwebMovie) + for res := range cityChan { + if res.err != nil { + log.Printf("❌ Error scraping city %s for offset %d: %v\n", res.city, dayOffset, res.err) + continue + } + if res.result != nil { + if pageDate == "" { + pageDate = res.result.Date + } + rawMoviesByCity[res.city] = res.result.Movies + log.Printf("✅ Found %d movies for %s on %s.\n", len(res.result.Movies), res.city, res.result.Date) + } + } + + if pageDate == "" { + log.Printf("⚠️ No date resolved for offset %d. Skipping.\n", dayOffset) + continue + } + + // 2. 
Resolve TMDB and Letterboxd concurrently for all movie showtimes in this day + type resolveJob struct { + city string + fwMovie *filmweb.FilmwebMovie + } + + jobs := make(chan resolveJob, 200) + var dayShowtimes []export.ShowtimeModel + var showtimesMutex sync.Mutex + + workerWg := sync.WaitGroup{} + numWorkers := 8 + + for w := 0; w < numWorkers; w++ { + workerWg.Add(1) + go func() { + defer workerWg.Done() + for job := range jobs { + fwMovie := job.fwMovie + city := job.city + title := fwMovie.Title + + // Generate cache key + cacheKey := fmt.Sprintf("%s_%d", strings.ToLower(title), fwMovie.Year) + + // Check Cache first + cacheMutex.Lock() + cached, found := resolvedCache[cacheKey] + if !found { + // Add negative placeholder to prevent duplicate concurrent hits while fetching + resolvedCache[cacheKey] = nil + cacheMutex.Unlock() + + // Fetch metadata + log.Printf("🎬 Fetching metadata for: %s (Year: %d)...", title, fwMovie.Year) + + var tmdbMovie *tmdb.TMDBMovie + var err error + + // Try original title first + searchTitle := fwMovie.OriginalTitle + if searchTitle == "" { + searchTitle = title + } + + tmdbMovie, err = tmdbApi.SearchMovie(searchTitle, fwMovie.Year) + if err != nil || tmdbMovie == nil { + // Try with polish title if different + if fwMovie.OriginalTitle != "" && fwMovie.OriginalTitle != title { + tmdbMovie, err = tmdbApi.SearchMovie(title, fwMovie.Year) + } + } + + if err != nil || tmdbMovie == nil { + failuresMutex.Lock() + failures = append(failures, export.FailureModel{ + Title: title, + Reason: "TMDB search failed", + Details: func() *string { + s := fmt.Sprintf("No matches found for '%s' (year: %d)", title, fwMovie.Year) + return &s + }(), + }) + failuresMutex.Unlock() + + cacheMutex.Lock() + resolvedCache[cacheKey] = &movieResolution{err: fmt.Errorf("TMDB search failed")} + cacheMutex.Unlock() + continue + } + + // Generate Letterboxd slug + boxdSlug := slug.GenerateSlug(tmdbMovie.Title, tmdbMovie.Year) + boxdURI, err := 
letterboxdScraper.GetShortURI(boxdSlug) + if err != nil { + failuresMutex.Lock() + failures = append(failures, export.FailureModel{ + Title: title, + Reason: "Letterboxd URI resolution failed", + Details: func() *string { + s := fmt.Sprintf("Slug: %s, Error: %v", boxdSlug, err) + return &s + }(), + }) + failuresMutex.Unlock() + + cacheMutex.Lock() + resolvedCache[cacheKey] = &movieResolution{err: fmt.Errorf("Letterboxd short link failed: %w", err)} + cacheMutex.Unlock() + continue + } + + movieMutex.Lock() + movieModel, exists := moviesMap[boxdURI] + if !exists { + mid := fmt.Sprintf("m%d", movieIDCounter) + movieIDCounter++ + movieModel = export.MovieModel{ + ID: mid, + Title: tmdbMovie.Title, + BoxdURI: boxdURI, + } + if tmdbMovie.PosterPath != "" { + posterURL := fmt.Sprintf("https://image.tmdb.org/t/p/w500%s", tmdbMovie.PosterPath) + movieModel.Poster = &posterURL + } + moviesMap[boxdURI] = movieModel + } + movieMutex.Unlock() + + cachedVal := &movieResolution{ + boxdURI: boxdURI, + movie: movieModel, + } + cacheMutex.Lock() + resolvedCache[cacheKey] = cachedVal + cacheMutex.Unlock() + cached = cachedVal + } else { + cacheMutex.Unlock() + // Wait if it was started concurrently but not finished yet + for { + cacheMutex.Lock() + cached = resolvedCache[cacheKey] + cacheMutex.Unlock() + if cached != nil { + break + } + time.Sleep(50 * time.Millisecond) + } + } + + if cached.err != nil { + continue + } + + mid := cached.movie.ID + + // Process cinemas and showtimes + for cinemaName, cinemaInfo := range fwMovie.Cinemas { + displayName := cinemaName + if !strings.Contains(strings.ToLower(cinemaName), strings.ToLower(city)) { + displayName = fmt.Sprintf("%s", cinemaName) + } + + cinemaMutex.Lock() + cinemaModel, exists := cinemasMap[displayName] + if !exists { + cid := fmt.Sprintf("c%d", cinemaIDCounter) + cinemaIDCounter++ + + var latVal, lngVal float64 + var coordsModel *export.CoordsModel + if cinemaInfo.Coords != nil { + if lat, err := 
strconv.ParseFloat(cinemaInfo.Coords.Lat, 64); err == nil { + if lng, err := strconv.ParseFloat(cinemaInfo.Coords.Lng, 64); err == nil { + latVal = lat + lngVal = lng + coordsModel = &export.CoordsModel{ + Lat: latVal, + Lng: lngVal, + } + } + } + } + + cinemaModel = export.CinemaModel{ + ID: cid, + Name: displayName, + Address: cinemaInfo.Address, + Coords: coordsModel, + } + cinemasMap[displayName] = cinemaModel + } + cinemaMutex.Unlock() + + showtimesMutex.Lock() + dayShowtimes = append(dayShowtimes, export.ShowtimeModel{ + MovieID: mid, + CinemaID: cinemaModel.ID, + Times: cinemaInfo.Times, + }) + showtimesMutex.Unlock() + } + } + }() + } + + // Push jobs to workers + for city, movieList := range rawMoviesByCity { + for _, m := range movieList { + jobs <- resolveJob{city: city, fwMovie: m} + } + } + close(jobs) + + workerWg.Wait() + + if len(dayShowtimes) > 0 { + finalShowtimes[pageDate] = dayShowtimes + log.Printf("✅ Day offset %d completed with %d showtimes.\n", dayOffset, len(dayShowtimes)) + } + } + + // Prepare final lists + var moviesList []export.MovieModel + for _, m := range moviesMap { + moviesList = append(moviesList, m) + } + // Sort by ID to keep output consistent and clean + sort.Slice(moviesList, func(i, j int) bool { + idI, _ := strconv.Atoi(moviesList[i].ID[1:]) + idJ, _ := strconv.Atoi(moviesList[j].ID[1:]) + return idI < idJ + }) + + var cinemasList []export.CinemaModel + for _, c := range cinemasMap { + cinemasList = append(cinemasList, c) + } + sort.Slice(cinemasList, func(i, j int) bool { + idI, _ := strconv.Atoi(cinemasList[i].ID[1:]) + idJ, _ := strconv.Atoi(cinemasList[j].ID[1:]) + return idI < idJ + }) + + // Deduplicate failures by title and reason + seenFailures := make(map[string]bool) + var uniqueFailures []export.FailureModel + for _, f := range failures { + key := fmt.Sprintf("%s_%s", f.Title, f.Reason) + if !seenFailures[key] { + uniqueFailures = append(uniqueFailures, f) + seenFailures[key] = true + } + } + + var availableDates 
[]string + for d := range finalShowtimes { + availableDates = append(availableDates, d) + } + sort.Strings(availableDates) + + metadata := export.MetadataModel{ + LastScrape: time.Now().UTC().Format(time.RFC3339), + TotalMovies: len(moviesList), + AvailableDates: availableDates, + Failures: uniqueFailures, + } + + log.Printf("💾 Exporting data to %s...\n", outputPath) + if err := export.ExportToJSON(moviesList, cinemasList, finalShowtimes, metadata, outputPath); err != nil { + log.Fatalf("❌ Export failed: %v\n", err) + } + + log.Println("✨ Scraping and export completed successfully!") +} diff --git a/scraper_go/go.mod b/scraper_go/go.mod new file mode 100644 index 0000000..ab25250 --- /dev/null +++ b/scraper_go/go.mod @@ -0,0 +1,27 @@ +module cmd/scraper/main.go + +go 1.25.11 + +require ( + github.com/gocolly/colly/v2 v2.3.0 + golang.org/x/text v0.38.0 +) + +require ( + github.com/PuerkitoBio/goquery v1.11.0 // indirect + github.com/andybalholm/cascadia v1.3.3 // indirect + github.com/antchfx/htmlquery v1.3.5 // indirect + github.com/antchfx/xmlquery v1.5.0 // indirect + github.com/antchfx/xpath v1.3.5 // indirect + github.com/bits-and-blooms/bitset v1.24.4 // indirect + github.com/gobwas/glob v0.2.3 // indirect + github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8 // indirect + github.com/golang/protobuf v1.5.4 // indirect + github.com/kennygrant/sanitize v1.2.4 // indirect + github.com/nlnwa/whatwg-url v0.6.2 // indirect + github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d // indirect + github.com/temoto/robotstxt v1.1.2 // indirect + golang.org/x/net v0.47.0 // indirect + google.golang.org/appengine v1.6.8 // indirect + google.golang.org/protobuf v1.36.10 // indirect +) diff --git a/scraper_go/go.sum b/scraper_go/go.sum new file mode 100644 index 0000000..1acf69f --- /dev/null +++ b/scraper_go/go.sum @@ -0,0 +1,123 @@ +github.com/PuerkitoBio/goquery v1.11.0 h1:jZ7pwMQXIITcUXNH83LLk+txlaEy6NVOfTuP43xxfqw= +github.com/PuerkitoBio/goquery 
v1.11.0/go.mod h1:wQHgxUOU3JGuj3oD/QFfxUdlzW6xPHfqyHre6VMY4DQ= +github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM= +github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA= +github.com/antchfx/htmlquery v1.3.5 h1:aYthDDClnG2a2xePf6tys/UyyM/kRcsFRm+ifhFKoU0= +github.com/antchfx/htmlquery v1.3.5/go.mod h1:5oyIPIa3ovYGtLqMPNjBF2Uf25NPCKsMjCnQ8lvjaoA= +github.com/antchfx/xmlquery v1.5.0 h1:uAi+mO40ZWfyU6mlUBxRVvL6uBNZ6LMU4M3+mQIBV4c= +github.com/antchfx/xmlquery v1.5.0/go.mod h1:lJfWRXzYMK1ss32zm1GQV3gMIW/HFey3xDZmkP1SuNc= +github.com/antchfx/xpath v1.3.5 h1:PqbXLC3TkfeZyakF5eeh3NTWEbYl4VHNVeufANzDbKQ= +github.com/antchfx/xpath v1.3.5/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs= +github.com/bits-and-blooms/bitset v1.20.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= +github.com/bits-and-blooms/bitset v1.24.4 h1:95H15Og1clikBrKr/DuzMXkQzECs1M6hhoGXLwLQOZE= +github.com/bits-and-blooms/bitset v1.24.4/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= +github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y= +github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8= +github.com/gocolly/colly/v2 v2.3.0 h1:HSFh0ckbgVd2CSGRE+Y/iA4goUhGROJwyQDCMXGFBWM= +github.com/gocolly/colly/v2 v2.3.0/go.mod h1:Qp54s/kQbwCQvFVx8KzKCSTXVJ1wWT4QeAKEu33x1q8= +github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8 h1:f+oWsMOmNPc8JmEHVZIycC7hBoQxHH9pNKQORJNozsQ= +github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8/go.mod h1:wcDNUvekVysuuOpQKo3191zZyTpiI6se1N1ULghS0sw= +github.com/golang/protobuf v1.5.0/go.mod 
h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= +github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= +github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= +github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= +github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= +github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o= +github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak= +github.com/nlnwa/whatwg-url v0.6.2 h1:jU61lU2ig4LANydbEJmA2nPrtCGiKdtgT0rmMd2VZ/Q= +github.com/nlnwa/whatwg-url v0.6.2/go.mod h1:x0FPXJzzOEieQtsBT/AKvbiBbQ46YlL6Xa7m02M1ECk= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d h1:hrujxIzL1woJ7AwssoOcM/tq5JjjG2yYOc8odClEiXA= +github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg= +github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +golang.org/x/crypto 
v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc= +golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU= +golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8= +golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk= +golang.org/x/crypto v0.32.0/go.mod h1:ZnnJkOaASj8g0AjIduWNlq2NRxL0PlBrbKVyZ6V/Ugc= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/mod v0.15.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= +golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk= +golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= +golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= +golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4= +golang.org/x/net v0.34.0/go.mod h1:di0qlW3YNM5oh6GqDGQr92MyTozJPmybPK4Ev/Gm31k= +golang.org/x/net v0.47.0 
h1:Mx+4dIFzqraBXUugkia1OOvlD6LemFo1ALMHjrXDOhY= +golang.org/x/net v0.47.0/go.mod h1:/jNxtkgq5yWUGYkaZGqo27cfGZ1c5Nen03aYrrKpVRU= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= +golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.29.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE= +golang.org/x/term 
v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= +golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= +golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU= +golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk= +golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY= +golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM= +golang.org/x/term v0.28.0/go.mod h1:Sw/lC2IAUZ92udQNf3WodGtn4k/XoLyZoh8v/8uiwek= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ= +golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= +golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= +golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= +golang.org/x/text v0.38.0 h1:sXmwo9DwP3OK9EZ7PqAdaooSGozfl/3a6/xJcbzPRhE= +golang.org/x/text v0.38.0/go.mod h1:YXZt3QhHUKYT53r2lLKFIVi6Ao1jdzrTR/KQ09qyxF4= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.1.12/go.mod 
// MovieModel is one film entry in the exported dataset.
type MovieModel struct {
	ID    string `json:"id"`
	Title string `json:"title"`
	// Poster is optional; a nil pointer is written as JSON null.
	Poster *string `json:"poster"`
	// BoxdURI must start with "https://boxd.it/" (enforced by Validate).
	BoxdURI string `json:"boxd_uri"`
}

// CoordsModel is a latitude/longitude pair.
type CoordsModel struct {
	Lat float64 `json:"lat"`
	Lng float64 `json:"lng"`
}

// CinemaModel is one cinema entry in the exported dataset.
type CinemaModel struct {
	ID      string `json:"id"`
	Name    string `json:"name"`
	Address string `json:"address"`
	// Coords is optional; a nil pointer is written as JSON null.
	Coords *CoordsModel `json:"coords"`
}

// ShowtimeModel lists the screening times of one movie at one cinema.
type ShowtimeModel struct {
	MovieID  string   `json:"movie_id"`
	CinemaID string   `json:"cinema_id"`
	Times    []string `json:"times"`
}

// FailureModel records a title that could not be processed and why.
type FailureModel struct {
	Title  string `json:"title"`
	Reason string `json:"reason"`
	// Details is optional; a nil pointer is written as JSON null.
	Details *string `json:"details"`
}

// MetadataModel carries run-level information about an export.
type MetadataModel struct {
	LastScrape     string         `json:"last_scrape"`
	TotalMovies    int            `json:"total_movies"`
	AvailableDates []string       `json:"available_dates"`
	Failures       []FailureModel `json:"failures"`
}

// ExportSchema is the root object of the exported JSON document.
// Showtimes is keyed by date string.
type ExportSchema struct {
	Movies    []MovieModel               `json:"movies"`
	Cinemas   []CinemaModel              `json:"cinemas"`
	Showtimes map[string][]ShowtimeModel `json:"showtimes"`
	Metadata  MetadataModel              `json:"metadata"`
}

// Validate checks the required fields of every movie, cinema and showtime
// and returns an error describing the first violation it finds, or nil when
// the schema is acceptable. Metadata is not inspected.
func (s *ExportSchema) Validate() error {
	for _, m := range s.Movies {
		if m.ID == "" {
			return fmt.Errorf("movie ID cannot be empty")
		}
		if m.Title == "" {
			return fmt.Errorf("movie title cannot be empty")
		}
		// Checked separately from the prefix test so an absent URI gets its
		// own, clearer message.
		if m.BoxdURI == "" {
			return fmt.Errorf("movie boxd_uri cannot be empty")
		}
		if !strings.HasPrefix(m.BoxdURI, "https://boxd.it/") {
			return fmt.Errorf("movie boxd_uri must start with https://boxd.it/, got: %s", m.BoxdURI)
		}
	}

	for _, c := range s.Cinemas {
		if c.ID == "" {
			return fmt.Errorf("cinema ID cannot be empty")
		}
		if c.Name == "" {
			return fmt.Errorf("cinema name cannot be empty")
		}
		if c.Address == "" {
			return fmt.Errorf("cinema address cannot be empty")
		}
	}

	for date, list := range s.Showtimes {
		if date == "" {
			return fmt.Errorf("showtime date cannot be empty")
		}
		for _, st := range list {
			if st.MovieID == "" {
				return fmt.Errorf("showtime movie_id cannot be empty")
			}
			if st.CinemaID == "" {
				return fmt.Errorf("showtime cinema_id cannot be empty")
			}
			if len(st.Times) == 0 {
				return fmt.Errorf("showtime times list cannot be empty")
			}
		}
	}

	return nil
}

// ExportToJSON validates the given data and writes it to outputFile as
// indented JSON followed by a newline, creating parent directories as needed.
//
// Nil top-level collections (movies, cinemas, showtimes, and the metadata
// AvailableDates and Failures lists) are written as empty arrays/objects
// rather than null. The caller's values are not modified.
//
// The document is fully encoded in memory before the file is opened, so a
// validation or encoding failure never truncates an existing export.
//
// It returns an error when validation fails, when the data cannot be encoded,
// or when the directory or file cannot be written.
func ExportToJSON(movies []MovieModel, cinemas []CinemaModel, showtimes map[string][]ShowtimeModel, metadata MetadataModel, outputFile string) error {
	// encoding/json renders nil slices and maps as null; readers that expect
	// arrays should always receive arrays. All parameters are local copies,
	// so these assignments are invisible to the caller.
	if movies == nil {
		movies = []MovieModel{}
	}
	if cinemas == nil {
		cinemas = []CinemaModel{}
	}
	if showtimes == nil {
		showtimes = map[string][]ShowtimeModel{}
	}
	if metadata.AvailableDates == nil {
		metadata.AvailableDates = []string{}
	}
	if metadata.Failures == nil {
		metadata.Failures = []FailureModel{}
	}

	schema := ExportSchema{
		Movies:    movies,
		Cinemas:   cinemas,
		Showtimes: showtimes,
		Metadata:  metadata,
	}

	if err := schema.Validate(); err != nil {
		return fmt.Errorf("validation failed: %w", err)
	}

	// NOTE(review): the indent literal is reproduced exactly as it appears in
	// the reviewed text; confirm its width against the checked-in file.
	data, err := json.MarshalIndent(schema, "", " ")
	if err != nil {
		return fmt.Errorf("encoding export: %w", err)
	}
	// json.Encoder terminated the document with a newline; keep that so the
	// generated file stays byte-compatible.
	data = append(data, '\n')

	if err := os.MkdirAll(filepath.Dir(outputFile), 0o755); err != nil {
		return fmt.Errorf("creating output directory: %w", err)
	}

	// os.WriteFile also reports the error from closing the file, which the
	// previous deferred Close silently dropped.
	if err := os.WriteFile(outputFile, data, 0o644); err != nil {
		return fmt.Errorf("writing export: %w", err)
	}
	return nil
}
Invalid schema (empty Movie ID) + schema.Movies[0].ID = "" + if err := schema.Validate(); err == nil { + t.Error("Expected validation error for empty movie ID, got nil") + } +} + +func TestExportToJSON(t *testing.T) { + tmpDir, err := os.MkdirTemp("", "export-test") + if err != nil { + t.Fatalf("Failed to create temp dir: %v", err) + } + defer os.RemoveAll(tmpDir) + + movies := []MovieModel{ + {ID: "m1", Title: "Title 1", BoxdURI: "https://boxd.it/abc"}, + } + cinemas := []CinemaModel{ + {ID: "c1", Name: "Cinema 1", Address: "Addr 1"}, + } + showtimes := map[string][]ShowtimeModel{ + "2026-07-02": {{MovieID: "m1", CinemaID: "c1", Times: []string{"12:00"}}}, + } + metadata := MetadataModel{ + LastScrape: "2026-07-02T12:00:00Z", + TotalMovies: 1, + AvailableDates: []string{"2026-07-02"}, + } + + outputFile := filepath.Join(tmpDir, "data_go.json") + + err = ExportToJSON(movies, cinemas, showtimes, metadata, outputFile) + if err != nil { + t.Fatalf("ExportToJSON failed: %v", err) + } + + // Read and verify the file content + data, err := os.ReadFile(outputFile) + if err != nil { + t.Fatalf("Failed to read output file: %v", err) + } + + var parsed ExportSchema + if err := json.Unmarshal(data, &parsed); err != nil { + t.Fatalf("Failed to unmarshal output: %v", err) + } + + if len(parsed.Movies) != 1 || parsed.Movies[0].Title != "Title 1" { + t.Errorf("Unexpected movie in exported JSON: %+v", parsed.Movies) + } +} diff --git a/scraper_go/internal/filmweb/filmweb.go b/scraper_go/internal/filmweb/filmweb.go new file mode 100644 index 0000000..2cae7e0 --- /dev/null +++ b/scraper_go/internal/filmweb/filmweb.go @@ -0,0 +1,191 @@ +package filmweb + +import ( + "fmt" + "strconv" + "strings" + "time" + + "github.com/gocolly/colly/v2" +) + +type FilmwebCinema struct { + Address string `json:"address"` + Times []string `json:"times"` + Coords *Coords `json:"coords"` +} + +type Coords struct { + Lat string `json:"lat"` + Lng string `json:"lng"` +} + +type FilmwebMovie struct { + 
Title string `json:"title"` + OriginalTitle string `json:"original_title"` + Year int `json:"year"` + Cinemas map[string]*FilmwebCinema `json:"cinemas"` +} + +type FilmwebResult struct { + Date string `json:"date"` + Movies []*FilmwebMovie `json:"movies"` +} + +type FilmwebScraper struct { + BaseURL string +} + +func NewFilmwebScraper() *FilmwebScraper { + return &FilmwebScraper{ + BaseURL: "https://www.filmweb.pl", + } +} + +// Scrape fetches showtimes and cinema details from Filmweb for a given city and day offset. +func (s *FilmwebScraper) Scrape(city string, dayOffset int, limit int) (*FilmwebResult, error) { + url := fmt.Sprintf("%s/showtimes/%s", s.BaseURL, city) + if dayOffset > 0 { + url = fmt.Sprintf("%s?day=%d", url, dayOffset) + } + + c := colly.NewCollector( + colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"), + ) + + type scrapedMovieLink struct { + Title string + URL string + } + + var movieLinks []scrapedMovieLink + + c.OnHTML(".preview__title a", func(e *colly.HTMLElement) { + title := strings.TrimSpace(e.Text) + href := e.Attr("href") + if title != "" && href != "" { + movieLinks = append(movieLinks, scrapedMovieLink{Title: title, URL: href}) + } + }) + + err := c.Visit(url) + if err != nil { + return nil, fmt.Errorf("failed to visit city page: %w", err) + } + + if len(movieLinks) == 0 { + pageDate := time.Now().AddDate(0, 0, dayOffset).Format("2006-01-02") + return &FilmwebResult{ + Date: pageDate, + Movies: []*FilmwebMovie{}, + }, nil + } + + if limit > 0 && len(movieLinks) > limit { + movieLinks = movieLinks[:limit] + } + + type movieResult struct { + movie *FilmwebMovie + err error + } + + resultsChan := make(chan movieResult, len(movieLinks)) + + for _, link := range movieLinks { + go func(title, movieURL string) { + fullMovieURL := movieURL + if !strings.HasPrefix(movieURL, "http://") && !strings.HasPrefix(movieURL, "https://") { + fullMovieURL = s.BaseURL + 
movieURL + } + showtimesURL := fmt.Sprintf("%s/showtimes/%s", fullMovieURL, city) + if dayOffset > 0 { + showtimesURL = fmt.Sprintf("%s?day=%d", showtimesURL, dayOffset) + } + + movieCollector := colly.NewCollector( + colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"), + ) + + movie := &FilmwebMovie{ + Title: title, + Cinemas: make(map[string]*FilmwebCinema), + } + + movieCollector.OnHTML(".preview__alternateTitle", func(e *colly.HTMLElement) { + movie.OriginalTitle = strings.TrimSpace(e.Text) + }) + + movieCollector.OnHTML(".preview__year", func(e *colly.HTMLElement) { + yearStr := strings.TrimSpace(e.Text) + if val, err := strconv.Atoi(yearStr); err == nil { + movie.Year = val + } + }) + + movieCollector.OnHTML(".seanceTiles", func(e *colly.HTMLElement) { + cinemaName := strings.TrimSpace(e.ChildText(".seanceTiles__title")) + if cinemaName == "" { + return + } + + address := strings.TrimSpace(e.ChildText(".seanceTiles__address")) + if address == "" { + address = fmt.Sprintf("%s, Poland", city) + } + + lat := e.Attr("data-cinema-latitude") + lng := e.Attr("data-cinema-longitude") + + var times []string + e.ForEach(".seanceTile__value", func(i int, item *colly.HTMLElement) { + timeVal := strings.TrimSpace(item.Text) + if timeVal != "" { + times = append(times, timeVal) + } + }) + + if len(times) > 0 { + var coords *Coords + if lat != "" && lng != "" { + coords = &Coords{ + Lat: lat, + Lng: lng, + } + } + movie.Cinemas[cinemaName] = &FilmwebCinema{ + Address: address, + Times: times, + Coords: coords, + } + } + }) + + err := movieCollector.Visit(showtimesURL) + if err != nil { + resultsChan <- movieResult{err: fmt.Errorf("failed to visit movie %s: %w", title, err)} + return + } + + resultsChan <- movieResult{movie: movie} + }(link.Title, link.URL) + } + + movies := make([]*FilmwebMovie, 0, len(movieLinks)) + for i := 0; i < len(movieLinks); i++ { + res := <-resultsChan + if res.err 
!= nil { + fmt.Printf("Error scraping movie showtimes: %v\n", res.err) + continue + } + if res.movie != nil { + movies = append(movies, res.movie) + } + } + + pageDate := time.Now().AddDate(0, 0, dayOffset).Format("2006-01-02") + return &FilmwebResult{ + Date: pageDate, + Movies: movies, + }, nil +} diff --git a/scraper_go/internal/filmweb/filmweb_test.go b/scraper_go/internal/filmweb/filmweb_test.go new file mode 100644 index 0000000..c226328 --- /dev/null +++ b/scraper_go/internal/filmweb/filmweb_test.go @@ -0,0 +1,201 @@ +package filmweb + +import ( + "fmt" + "net/http" + "net/http/httptest" + "testing" + "time" +) + +func TestFilmwebScraper_Scrape(t *testing.T) { + // Setup a mock HTTP server to simulate Filmweb + mux := http.NewServeMux() + + // 1. Mock the city showtimes page + mux.HandleFunc("/showtimes/Warszawa", func(w http.ResponseWriter, r *http.Request) { + // Respect optional day query param + day := r.URL.Query().Get("day") + if day != "" && day != "0" { + fmt.Fprintf(w, ` + +
+ + + + `) + return + } + + fmt.Fprintf(w, ` + + + + + + + `) + }) + + // 2. Mock individual movie showtimes pages + mux.HandleFunc("/film/Projekt+Hail+Mary-2026-10047841/showtimes/Warszawa", func(w http.ResponseWriter, r *http.Request) { + fmt.Fprintf(w, ` + + +