serpapi · nurtoltor · Jan 26, 2026 · Jan 26, 2026 · Jan 26, 2026 · Jan 26, 2026
diff --git a/Gemfile b/Gemfile
@@ -0,0 +1,13 @@
+source "https://rubygems.org"
+
+ruby "3.4.6"
+
+gem 'nokogiri'
+
+group :development do
+  gem 'debug'
+end
+
+group :test do
+  gem 'rspec'
+end
diff --git a/Gemfile.lock b/Gemfile.lock
@@ -0,0 +1,79 @@
+GEM
+  remote: https://rubygems.org/
+  specs:
+    date (3.5.1)
+    debug (1.11.1)
+      irb (~> 1.10)
+      reline (>= 0.3.8)
+    diff-lcs (1.6.2)
+    erb (6.0.1)
+    io-console (0.8.2)
+    irb (1.16.0)
+      pp (>= 0.6.0)
+      rdoc (>= 4.0.0)
+      reline (>= 0.4.2)
+    nokogiri (1.19.0-aarch64-linux-gnu)
+      racc (~> 1.4)
+    nokogiri (1.19.0-aarch64-linux-musl)
+      racc (~> 1.4)
+    nokogiri (1.19.0-arm-linux-gnu)
+      racc (~> 1.4)
+    nokogiri (1.19.0-arm-linux-musl)
+      racc (~> 1.4)
+    nokogiri (1.19.0-arm64-darwin)
+      racc (~> 1.4)
+    nokogiri (1.19.0-x86_64-darwin)
+      racc (~> 1.4)
+    nokogiri (1.19.0-x86_64-linux-gnu)
+      racc (~> 1.4)
+    nokogiri (1.19.0-x86_64-linux-musl)
+      racc (~> 1.4)
+    pp (0.6.3)
+      prettyprint
+    prettyprint (0.2.0)
+    psych (5.3.1)
+      date
+      stringio
+    racc (1.8.1)
+    rdoc (7.1.0)
+      erb
+      psych (>= 4.0.0)
+      tsort
+    reline (0.6.3)
+      io-console (~> 0.5)
+    rspec (3.13.2)
+      rspec-core (~> 3.13.0)
+      rspec-expectations (~> 3.13.0)
+      rspec-mocks (~> 3.13.0)
+    rspec-core (3.13.6)
+      rspec-support (~> 3.13.0)
+    rspec-expectations (3.13.5)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.13.0)
+    rspec-mocks (3.13.7)
+      diff-lcs (>= 1.2.0, < 2.0)
+      rspec-support (~> 3.13.0)
+    rspec-support (3.13.6)
+    stringio (3.2.0)
+    tsort (0.2.0)
+
+PLATFORMS
+  aarch64-linux-gnu
+  aarch64-linux-musl
+  arm-linux-gnu
+  arm-linux-musl
+  arm64-darwin
+  x86_64-darwin
+  x86_64-linux-gnu
+  x86_64-linux-musl
+
+DEPENDENCIES
+  debug
+  nokogiri
+  rspec
+
+RUBY VERSION
+   ruby 3.4.6p54
+
+BUNDLED WITH
+   2.6.9
diff --git a/README.md b/README.md
@@ -21,8 +21,53 @@ Parse directly the HTML result page ([html file]) in this repository. No extra H
 [html file]: https://raw.githubusercontent.com/serpapi/code-challenge/master/files/van-gogh-paintings.html
 [expected array]: https://raw.githubusercontent.com/serpapi/code-challenge/master/files/expected-array.json
 
-Add also to your array the painting thumbnails present in the result page file (not the ones where extra requests are needed). 
+Add also to your array the painting thumbnails present in the result page file (not the ones where extra requests are needed).
 
 Test against 2 other similar result pages to make sure it works against different layouts. (Pages that contain the same kind of carrousel. Don't necessarily have to be paintings.)
 
 The suggested time for this challenge is 4 hours. But, you can take your time and work more on it if you want.
+
+## Solution
+
+I built a small carousel scraper using `Nokogiri`, plus a regular expression to pick up the `data:image` thumbnails that are assigned to images from inline scripts. It finds the Knowledge Graph carousel container by looking at the `data-attrid` sections and picking the one with the most links that include `stick=`, then extracts the fields for each item. I prioritised semantic HTML (`role`, `aria-label`, `alt`, `title`) over class names. It also skips "show more" items and only includes images already present in the HTML (`data:image`, `encrypted-tbn`, or `knowledgecard` icons).
+
+The output is a hash with a single key: the lowercased label of the selected search-results tab (e.g., `artworks`, `cast`, `albums`). If no tab is selected, the key defaults to `results`.
+
+### Structure
+
+- `lib/carousel_scraper.rb`: Orchestrates the extraction and chooses the correct carousel scope.
+- `lib/carousel_item_extractor.rb`: Extracts name, extensions, link, and image from a single item link.
+
+I tested against 3 other result pages to find common patterns:
+
+- "David Bowie albums" search: files/david-bowie-albums.html
+- "George Orwell books" search: files/george-orwell-books.html
+- "Lord of the Rings cast" search: files/lord-of-the-rings-cast.html
+
+### How to run
+
+Install dependencies:
+
+```
+bundle install
+```
+
+Run with the default Van Gogh paintings HTML (outputs to `files/van-gogh-paintings-expected-array.json`):
+
+```
+ruby main.rb
+```
+
+Run with a specific HTML file (outputs JSON to the same directory):
+
+```
+ruby main.rb files/david-bowie-albums.html   # files/david-bowie-albums-expected-array.json
+ruby main.rb files/george-orwell-books.html  # files/george-orwell-books-expected-array.json
+ruby main.rb files/lord-of-the-rings-cast.html  # files/lord-of-the-rings-cast-expected-array.json
+```
+
+Run the tests:
+
+```
+bundle exec rspec
+```
diff --git a/files/david-bowie-albums.html b/files/david-bowie-albums.html
diff --git a/files/george-orwell-books.html b/files/george-orwell-books.html
diff --git a/files/lord-of-the-rings-cast.html b/files/lord-of-the-rings-cast.html
diff --git a/lib/carousel_item_extractor.rb b/lib/carousel_item_extractor.rb
@@ -0,0 +1,167 @@
+class CarouselItemExtractor
+  def initialize(raw_html)
+    @raw_html = raw_html
+    @image_cache = nil
+  end
+
+  def extract(link)
+    name = extract_name(link)
+    return nil if name.nil? || is_action_text?(name)
+
+    {
+      name: name,
+      extensions: extract_extensions(link, name),
+      link: link['href']&.start_with?('http') ? link['href'] : "https://www.google.com#{link['href']}",
+      image: extract_image(link)
+    }.compact
+  end
+
+  def carousel_item_link?(link)
+    href = link['href'].to_s
+    return false if href.empty? || is_action_text?(link.text)
+    return false unless href.include?('stick=')
+
+    has_visual = link.at_css('img') || link.at_css('[role="heading"]')
+    has_label = link['aria-label'] && !link['aria-label'].strip.empty?
+    has_text = link.text && !link.text.strip.empty?
+
+    has_visual || has_label || has_text
+  end
+
+  private
+
+  def extract_name(link)
+    candidates = []
+
+    heading = link.at_css('[role="heading"]')
+    candidates << heading.text if heading && !heading.text.strip.empty?
+
+    candidates << link['aria-label'] if link['aria-label']
+
+    img = link.at_css('img[alt]')
+    candidates << img['alt'] if img && img['alt'] && !img['alt'].empty?
+
+    candidates << link['title'] if link['title']
+
+    title_el = link.at_css('.JjtOHd')
+    candidates << title_el.text if title_el
+
+    pg_el = link.at_css('.pgNMRc')
+    candidates << pg_el.text if pg_el
+
+    direct_text = link.children.select(&:text?).map { |n| n.text.strip }.reject(&:empty?).first
+    candidates << direct_text if direct_text
+
+    candidates.each do |text|
+      cleaned = clean_text(text)
+      next if cleaned.empty? || is_year?(cleaned)
+
+      return cleaned
+    end
+
+    link.traverse do |node|
+      next unless node.text?
+
+      cleaned = clean_text(node.text)
+      next if cleaned.empty? || is_year?(cleaned)
+
+      return cleaned
+    end
+
+    nil
+  end
+
+  def extract_extensions(link, name)
+    extensions = []
+
+    text_nodes = link.xpath('.//text()').map { |node| clean_text(node.text) }.reject(&:empty?).uniq
+    text_nodes.each do |text|
+      next if is_action_text?(text)
+      next if name && clean_text(name) == text
+      extensions << text unless text.empty? || extensions.include?(text)
+    end
+
+    link.css('.ellip, .cxzHyb').each do |el|
+      text = clean_text(el.text)
+      extensions << text unless text.empty? || extensions.include?(text)
+    end
+
+    if extensions.empty?
+      link.traverse do |node|
+        next unless node.text?
+
+        text = node.text.strip
+        if text.match?(/\A\d{4}\z/) && text.to_i.between?(1000, 2100)
+          extensions << text unless extensions.include?(text)
+        end
+      end
+    end
+
+    extensions.uniq
+  end
+
+  def extract_image(link)
+    img = link.at_css('img')
+    return nil unless img
+
+    src = img['src'].to_s
+    data_src = img['data-src'].to_s
+
+    if src.start_with?('data:image/jpeg', 'data:image/png', 'data:image/webp')
+      return src
+    end
+
+    if src.start_with?('https://encrypted-tbn', 'http://encrypted-tbn')
+      return src
+    end
+
+    if data_src.start_with?('data:image/jpeg', 'data:image/png', 'data:image/webp')
+      return data_src
+    end
+
+    if data_src.start_with?('https://encrypted-tbn', 'http://encrypted-tbn')
+      return data_src
+    end
+
+    if img['id'] && (img['data-deferred'] || src.include?('gif;base64'))
+      @image_cache ||= build_image_cache
+      deferred = @image_cache[img['id']]
+      return deferred if deferred
+    end
+
+    if src.include?('gstatic.com/knowledgecard/') && src.end_with?('.png')
+      return src.start_with?('//') ? "https:#{src}" : src
+    end
+
+    if data_src.include?('gstatic.com/knowledgecard/') && data_src.end_with?('.png')
+      return data_src.start_with?('//') ? "https:#{data_src}" : data_src
+    end
+
+    nil
+  end
+
+  def build_image_cache
+    cache = {}
+    @raw_html.scan(/\(function\(\)\{var s='(data:image\/[^']+)';var ii=\[([^\]]+)\]/) do |match|
+      base64_image = match[0]
+      ids_string = match[1]
+      ids_string.scan(/'([^']+)'/).flatten.each do |id|
+        cache[id] = base64_image
+      end
+    end
+    cache
+  end
+
+  def clean_text(text)
+    text.to_s.gsub(/\s+/, ' ').strip
+  end
+
+  def is_action_text?(text)
+    normalized = clean_text(text).downcase
+    normalized == 'show more' || normalized == 'see more' || normalized == 'more'
+  end
+
+  def is_year?(text)
+    text.match?(/\A\d{4}\z/) && text.to_i.between?(1000, 2100)
+  end
+end
diff --git a/lib/carousel_scraper.rb b/lib/carousel_scraper.rb
@@ -0,0 +1,47 @@
+require 'nokogiri'
+require_relative 'carousel_item_extractor'
+
+class CarouselScraper
+
+  def initialize(html)
+    @raw_html = html
+    @document = Nokogiri::HTML(html)
+  end
+
+  def extract
+    { section_key => carousel_items }
+  end
+
+  private
+
+  def section_key
+    selected_tab = @document.at_css('[role="tab"][aria-selected="true"]')
+    return 'results' unless selected_tab
+
+    label = selected_tab.text.gsub(/\s+/, ' ').strip.downcase
+
+    label.empty? ? 'results' : label
+  end
+
+  def carousel_items
+    items = []
+    item_extractor = CarouselItemExtractor.new(@raw_html)
+
+    carousel_item_links(@document).each do |link|
+      next unless item_extractor.carousel_item_link?(link)
+
+      item = item_extractor.extract(link)
+      items << item if item
+    end
+
+    items
+  end
+
+  def carousel_item_links(container)
+    kc_container_nodes = container.css('[data-attrid^="kc:/"]')
+    kc_container_nodes = container.css('[data-attrid*="kc:/"]') if kc_container_nodes.empty?
+
+    carousel_container = kc_container_nodes.max_by { |node| node.css('a[href*="stick="]').length }
+    (carousel_container || container).css('a[href*="stick="]').to_a
+  end
+end
diff --git a/main.rb b/main.rb
@@ -0,0 +1,13 @@
+require_relative 'lib/carousel_scraper'
+require 'json'
+
+# Usage: ruby main.rb [path/to/page.html]
+#
+# Reads the given result page (defaults to the Van Gogh paintings file), runs
+# the carousel scraper on it and writes the result as pretty-printed JSON next
+# to the input file.
+file = ARGV[0] || 'files/van-gogh-paintings.html'
+
+html = File.read(file)
+scraper = CarouselScraper.new(html)
+result = scraper.extract
+
+# Strip only a trailing ".html" before appending the output suffix. A plain
+# `sub('.html', ...)` rewrites the first ".html" anywhere in the path, and
+# returns the input path unchanged when there is none, which would make the
+# write below overwrite the input file.
+output_file = "#{file.delete_suffix('.html')}-expected-array.json"
+File.write(output_file, JSON.pretty_generate(result))
+
+puts "Expected array available in #{output_file}"