diff --git a/Gemfile b/Gemfile new file mode 100644 index 00000000..e8cf82d2 --- /dev/null +++ b/Gemfile @@ -0,0 +1,13 @@ +source "https://rubygems.org" + +ruby "3.4.6" + +gem 'nokogiri' + +group :development do + gem 'debug' +end + +group :test do + gem 'rspec' +end diff --git a/Gemfile.lock b/Gemfile.lock new file mode 100644 index 00000000..2e177784 --- /dev/null +++ b/Gemfile.lock @@ -0,0 +1,79 @@ +GEM + remote: https://rubygems.org/ + specs: + date (3.5.1) + debug (1.11.1) + irb (~> 1.10) + reline (>= 0.3.8) + diff-lcs (1.6.2) + erb (6.0.1) + io-console (0.8.2) + irb (1.16.0) + pp (>= 0.6.0) + rdoc (>= 4.0.0) + reline (>= 0.4.2) + nokogiri (1.19.0-aarch64-linux-gnu) + racc (~> 1.4) + nokogiri (1.19.0-aarch64-linux-musl) + racc (~> 1.4) + nokogiri (1.19.0-arm-linux-gnu) + racc (~> 1.4) + nokogiri (1.19.0-arm-linux-musl) + racc (~> 1.4) + nokogiri (1.19.0-arm64-darwin) + racc (~> 1.4) + nokogiri (1.19.0-x86_64-darwin) + racc (~> 1.4) + nokogiri (1.19.0-x86_64-linux-gnu) + racc (~> 1.4) + nokogiri (1.19.0-x86_64-linux-musl) + racc (~> 1.4) + pp (0.6.3) + prettyprint + prettyprint (0.2.0) + psych (5.3.1) + date + stringio + racc (1.8.1) + rdoc (7.1.0) + erb + psych (>= 4.0.0) + tsort + reline (0.6.3) + io-console (~> 0.5) + rspec (3.13.2) + rspec-core (~> 3.13.0) + rspec-expectations (~> 3.13.0) + rspec-mocks (~> 3.13.0) + rspec-core (3.13.6) + rspec-support (~> 3.13.0) + rspec-expectations (3.13.5) + diff-lcs (>= 1.2.0, < 2.0) + rspec-support (~> 3.13.0) + rspec-mocks (3.13.7) + diff-lcs (>= 1.2.0, < 2.0) + rspec-support (~> 3.13.0) + rspec-support (3.13.6) + stringio (3.2.0) + tsort (0.2.0) + +PLATFORMS + aarch64-linux-gnu + aarch64-linux-musl + arm-linux-gnu + arm-linux-musl + arm64-darwin + x86_64-darwin + x86_64-linux-gnu + x86_64-linux-musl + +DEPENDENCIES + debug + nokogiri + rspec + +RUBY VERSION + ruby 3.4.6p54 + +BUNDLED WITH + 2.6.9 diff --git a/README.md b/README.md index 4d5a093f..05d30602 100644 --- a/README.md +++ b/README.md @@ -21,8 +21,53 @@ 
Parse directly the HTML result page ([html file]) in this repository. No extra H [html file]: https://raw.githubusercontent.com/serpapi/code-challenge/master/files/van-gogh-paintings.html [expected array]: https://raw.githubusercontent.com/serpapi/code-challenge/master/files/expected-array.json -Add also to your array the painting thumbnails present in the result page file (not the ones where extra requests are needed). +Add also to your array the painting thumbnails present in the result page file (not the ones where extra requests are needed). Test against 2 other similar result pages to make sure it works against different layouts. (Pages that contain the same kind of carousel. Don't necessarily have to be paintings.) The suggested time for this challenge is 4 hours. But, you can take your time and work more on it if you want. + +## Solution + +I built a small carousel scraper using `Nokogiri` and regex. It finds the best Knowledge Graph carousel container by looking for `data-attrid` sections and links that include `stick=`, then extracts fields for each item. I prioritised semantic HTML (`role`, `aria-label`, `alt`, `title`) over class names. It also skips "show more" items and only includes images already present in the HTML (`data:image`, `encrypted-tbn`, or `knowledgecard` icons). + +The output is a hash whose key matches the selected tab of the search results (e.g., `artworks`, `cast`, `albums`). If no tab is selected, it defaults to `results`. + +### Structure + +- `lib/carousel_scraper.rb`: Orchestrates the extraction and chooses the correct carousel scope. +- `lib/carousel_item_extractor.rb`: Extracts name, extensions, link, and image from a single item link. 
+ +I tested against 3 other result pages to find common patterns: + +- "David Bowie albums" search: files/david-bowie-albums.html +- "George Orwell books" search: files/george-orwell-books.html +- "Lord of the Rings cast" search: files/lord-of-the-rings-cast.html + +### How to run + +Install dependencies: + +``` +bundle install +``` + +Run with the default Van Gogh paintings HTML (outputs to `files/van-gogh-paintings-expected-array.json`): + +``` +ruby main.rb +``` + +Run with a specific HTML file (outputs JSON to the same directory): + +``` +ruby main.rb files/david-bowie-albums.html # files/david-bowie-albums-expected-array.json +ruby main.rb files/george-orwell-books.html # files/george-orwell-books-expected-array.json +ruby main.rb files/lord-of-the-rings-cast.html # files/lord-of-the-rings-cast-expected-array.json +``` + +Run the tests: + +``` +bundle exec rspec +``` diff --git a/files/david-bowie-albums.html b/files/david-bowie-albums.html new file mode 100644 index 00000000..0e0e788a --- /dev/null +++ b/files/david-bowie-albums.html @@ -0,0 +1,49 @@ +david bowie albums - Google Search

Accessibility links

Skip to main contentAccessibility help
Accessibility feedback
Quick Settings
David Bowie
English singer-songwriter and actor

Notices about Filtered Results

In response to a complaint that we received under the US Digital Millennium Copyright Act, we have removed 1 result(s) from this page. If you wish, you may read the DMCA complaint that caused the removal(s) at LumenDatabase.org.
Some results may have been removed under data protection law in Europe. Learn more

Page navigation

About
Feedback
People also search for
Google apps
diff --git a/files/george-orwell-books.html b/files/george-orwell-books.html new file mode 100644 index 00000000..f0e29631 --- /dev/null +++ b/files/george-orwell-books.html @@ -0,0 +1,40 @@ +george orwell books - Google Search

Accessibility links

Skip to main contentAccessibility help
Accessibility feedback
Quick Settings
George Orwell
English novelist and poet
Google apps
diff --git a/files/lord-of-the-rings-cast.html b/files/lord-of-the-rings-cast.html new file mode 100644 index 00000000..703e22f8 --- /dev/null +++ b/files/lord-of-the-rings-cast.html @@ -0,0 +1,40 @@ +lord of the rings cast - Google Search

Accessibility links

Skip to main contentAccessibility help
Accessibility feedback
Quick Settings
The Lord of the Rings: The Fellowship of the Ring
12
2001 ‧ Fantasy/Adventure ‧ 3h 28m

Search Results

Watch movie
EDIT SERVICES
Watched
Already watched
Thanks! This improves movie and show recommendations from Google
Something went wrong and your action wasn't saved
About
86% liked this film
Reviews aren't verified by Google
Google users
Review updated to 'like'
Review updated to 'dislike'
Feedback
People also search for
Google apps
# =============================================================================
# lib/carousel_item_extractor.rb
# =============================================================================

# Extracts the fields of one carousel item (name, extensions, link, image)
# from the anchor node that wraps it.
#
# The extractor only relies on the node API it is handed (`[]`, `text`,
# `children`, `css`, `at_css`, `xpath`), which Nokogiri nodes provide.
class CarouselItemExtractor
  # Labels of "load more" tiles that must never be reported as items.
  ACTION_LABELS = ['show more', 'see more', 'more'].freeze
  # Image sources that are already embedded in the page.
  INLINE_IMAGE_PREFIXES = %w[data:image/jpeg data:image/png data:image/webp].freeze
  THUMBNAIL_PREFIXES = %w[https://encrypted-tbn http://encrypted-tbn].freeze
  KNOWLEDGE_CARD_MARKER = 'gstatic.com/knowledgecard/'
  GOOGLE_ORIGIN = 'https://www.google.com'
  # Inline script that assigns one data URI (`s`) to a list of image ids (`ii`).
  DEFERRED_IMAGE_SCRIPT = /\(function\(\)\{var s='(data:image\/[^']+)';var ii=\[([^\]]+)\]/
  # JavaScript `\xNN` escape inside a string literal (e.g. `\x3d` for `=`).
  JS_HEX_ESCAPE = /\\x(\h{2})/
  YEAR_RANGE = (1000..2100)

  # @param raw_html [String] full page source; scanned lazily for deferred images
  def initialize(raw_html)
    @raw_html = raw_html
    @image_cache = nil
  end

  # Builds the item hash for a carousel link.
  #
  # @param link [#at_css, #[]] anchor node of the item
  # @return [Hash, nil] hash with :name and :extensions plus :link / :image when
  #   available; nil when no usable name exists or the link is an action tile
  def extract(link)
    name = extract_name(link)
    return nil if name.nil? || action_text?(name)

    {
      name: name,
      extensions: extract_extensions(link, name),
      link: absolute_url(link['href']),
      image: extract_image(link)
    }.compact
  end

  # Tells whether an anchor looks like a carousel item: it must carry a
  # `stick=` href, must not be an action tile, and must expose something to
  # name it by (an image, a heading, an aria-label or text).
  #
  # @param link [#at_css, #[], #text] anchor node
  # @return [Boolean]
  def carousel_item_link?(link)
    href = link['href'].to_s
    return false if href.empty? || action_text?(link.text)
    return false unless href.include?('stick=')

    has_visual = !link.at_css('img, [role="heading"]').nil?
    has_label = !link['aria-label'].to_s.strip.empty?
    has_text = !link.text.to_s.strip.empty?

    has_visual || has_label || has_text
  end

  private

  # Picks the item name: semantic sources first, then any text node.
  def extract_name(link)
    first_usable_name(name_candidates(link)) ||
      first_usable_name(link.xpath('.//text()').map(&:text))
  end

  # Name sources in priority order; semantic attributes win over class names.
  def name_candidates(link)
    image = link.at_css('img[alt]')
    direct_text = link.children.select(&:text?).map { |node| node.text.strip }.reject(&:empty?).first

    [
      link.at_css('[role="heading"]')&.text,
      link['aria-label'],
      image && image['alt'],
      link['title'],
      link.at_css('.JjtOHd')&.text,
      link.at_css('.pgNMRc')&.text,
      direct_text
    ].compact
  end

  # First candidate that is neither blank nor a bare year (a year is an
  # extension, never a name).
  def first_usable_name(texts)
    texts.each do |text|
      cleaned = clean_text(text)
      return cleaned unless cleaned.empty? || year?(cleaned)
    end
    nil
  end

  # Every distinct text of the item except its name and action labels.
  # The same filters apply to text nodes and to the `.ellip` / `.cxzHyb`
  # elements, so the name can no longer leak in through the second source.
  def extract_extensions(link, name)
    cleaned_name = clean_text(name)
    texts = link.xpath('.//text()').map { |node| clean_text(node.text) }
    texts.concat(link.css('.ellip, .cxzHyb').map { |element| clean_text(element.text) })

    texts.reject { |text| text.empty? || text == cleaned_name || action_text?(text) }.uniq
  end

  # Resolves the image without extra requests. Priority: embedded `src`,
  # embedded `data-src`, image injected by an inline script, knowledge-card icon.
  def extract_image(link)
    img = link.at_css('img')
    return nil unless img

    sources = [img['src'].to_s, img['data-src'].to_s]

    embedded = sources.find { |source| source.start_with?(*INLINE_IMAGE_PREFIXES, *THUMBNAIL_PREFIXES) }
    return embedded if embedded

    deferred = deferred_image(img, sources.first)
    return deferred if deferred

    icon = sources.find { |source| source.include?(KNOWLEDGE_CARD_MARKER) && source.end_with?('.png') }
    return nil unless icon

    icon.start_with?('//') ? "https:#{icon}" : icon
  end

  # Looks up an image that the page assigns by element id from an inline script.
  def deferred_image(img, src)
    return nil unless img['id'] && (img['data-deferred'] || src.include?('gif;base64'))

    @image_cache ||= build_image_cache
    @image_cache[img['id']]
  end

  # Maps image element ids to the data URI assigned to them by inline scripts.
  # The URIs sit inside JavaScript string literals, so `\xNN` escapes are
  # decoded; otherwise the base64 payload would keep a literal `\x3d`.
  def build_image_cache
    @raw_html.to_s.scan(DEFERRED_IMAGE_SCRIPT).each_with_object({}) do |(encoded_image, id_list), cache|
      image = unescape_js(encoded_image)
      id_list.scan(/'([^']+)'/).flatten.each { |id| cache[id] = image }
    end
  end

  def unescape_js(text)
    text.gsub(JS_HEX_ESCAPE) { Regexp.last_match(1).hex.chr(Encoding::UTF_8) }
  end

  # Turns an href into an absolute URL. Returns nil for a missing href so the
  # caller's `compact` drops the key instead of emitting a bare origin.
  def absolute_url(href)
    href = href.to_s.strip
    return nil if href.empty?
    return href if href.start_with?('http')
    return "https:#{href}" if href.start_with?('//')

    href.start_with?('/') ? "#{GOOGLE_ORIGIN}#{href}" : "#{GOOGLE_ORIGIN}/#{href}"
  end

  # Collapses whitespace runs to one space and trims; nil becomes ''.
  def clean_text(text)
    text.to_s.gsub(/\s+/, ' ').strip
  end

  def action_text?(text)
    ACTION_LABELS.include?(clean_text(text).downcase)
  end

  def year?(text)
    text.match?(/\A\d{4}\z/) && YEAR_RANGE.cover?(text.to_i)
  end
end

# =============================================================================
# lib/carousel_scraper.rb
# =============================================================================

require 'nokogiri'
require_relative 'carousel_item_extractor'

# Finds the carousel in a search result page and extracts its items.
class CarouselScraper
  STICK_LINK_SELECTOR = 'a[href*="stick="]'
  DEFAULT_SECTION_KEY = 'results'

  # @param html [String] page source
  def initialize(html)
    @raw_html = html
    @document = Nokogiri::HTML(html)
  end

  # @return [Hash{String => Array<Hash>}] the items keyed by the selected tab
  #   label (downcased), or by 'results' when no tab is selected
  def extract
    { section_key => carousel_items }
  end

  private

  def section_key
    selected_tab = @document.at_css('[role="tab"][aria-selected="true"]')
    return DEFAULT_SECTION_KEY unless selected_tab

    label = selected_tab.text.gsub(/\s+/, ' ').strip.downcase
    label.empty? ? DEFAULT_SECTION_KEY : label
  end

  def carousel_items
    item_extractor = CarouselItemExtractor.new(@raw_html)

    carousel_item_links(@document).filter_map do |link|
      item_extractor.extract(link) if item_extractor.carousel_item_link?(link)
    end
  end

  # Picks the `kc:/` section holding the most `stick=` links and returns those
  # links; falls back to the whole container when no such section exists.
  def carousel_item_links(container)
    sections = container.css('[data-attrid^="kc:/"]')
    sections = container.css('[data-attrid*="kc:/"]') if sections.empty?

    scope = sections.max_by { |node| node.css(STICK_LINK_SELECTOR).length } || container
    scope.css(STICK_LINK_SELECTOR).to_a
  end
end

# =============================================================================
# main.rb
# =============================================================================

require_relative 'lib/carousel_scraper'
require 'json'

file = ARGV[0] || 'files/van-gogh-paintings.html'
abort "Input file not found: #{file}" unless File.file?(file)

html = File.read(file)
result = CarouselScraper.new(html).extract

# Swap only the real extension for the suffix. Substituting the first '.html'
# anywhere in the path would leave a path without that extension unchanged,
# and the JSON would then overwrite the input file.
output_file = "#{file.delete_suffix(File.extname(file))}-expected-array.json"
File.write(output_file, JSON.pretty_generate(result))

puts "Expected array available in #{output_file}"

# =============================================================================
# spec/carousel_scraper_spec.rb
# =============================================================================

require 'json'
require_relative '../lib/carousel_scraper'

RSpec.describe 'Carousel Scraper' do
  # Plain helper method rather than `let`, so the before(:all) hooks can call it.
  def scrape_fixture(filename)
    fixtures_path = File.expand_path('../files', __dir__)
    CarouselScraper.new(File.read(File.join(fixtures_path, filename))).extract
  end

  describe 'Van Gogh Paintings' do
    before :all do
      @result = scrape_fixture('van-gogh-paintings.html')
      @items = @result['artworks']
    end

    it 'returns a hash with artworks key' do
      expect(@result).to be_a(Hash)
      expect(@result).to have_key('artworks')
    end

    it 'extracts artworks array' do
      expect(@items).to be_an(Array)
      expect(@items).to_not be_empty
    end

    it 'artworks - name' do
      expect(@items[0][:name]).to be_a(String)
      expect(@items[0][:name]).to_not be_empty
    end

    it 'artworks - extensions' do
      expect(@items[0][:extensions]).to be_a(Array)
      expect(@items[0][:extensions]).to_not be_empty
    end

    it 'artworks - link' do
      expect(@items[0][:link]).to be_a(String)
      expect(@items[0][:link]).to_not be_empty
    end

    it 'artworks - image' do
      expect(@items[0][:image]).to be_a(String)
      expect(@items[0][:image]).to_not be_empty
    end

    it 'includes a known artwork' do
      names = @items.map { |a| a[:name] }.compact
      expect(names).to include('The Starry Night')
    end

    it 'does not include action tiles' do
      names = @items.map { |a| a[:name].to_s.downcase }
      expect(names).not_to include('more', 'show more', 'see more')
    end
  end

  describe 'David Bowie Albums' do
    before :all do
      @result = scrape_fixture('david-bowie-albums.html')
      @items = @result['albums']
    end

    it 'returns a hash with albums key' do
      expect(@result).to be_a(Hash)
      expect(@result).to have_key('albums')
    end

    it 'extracts albums array' do
      expect(@items).to be_an(Array)
    end

    it 'albums - name' do
      expect(@items[0][:name]).to be_a(String)
      expect(@items[0][:name]).to_not be_empty
    end

    it 'albums - link' do
      expect(@items[0][:link]).to be_a(String)
      expect(@items[0][:link]).to_not be_empty
    end

    it 'includes a known album' do
      names = @items.map { |a| a[:name] }.compact
      expect(names).to include('The Man Who Sold the World')
    end

    it 'does not include action tiles' do
      names = @items.map { |a| a[:name].to_s.downcase }
      expect(names).not_to include('more', 'show more', 'see more')
    end
  end

  describe 'Lord of the Rings Cast' do
    before :all do
      @result = scrape_fixture('lord-of-the-rings-cast.html')
      @items = @result['cast']
    end

    it 'returns a hash with cast key' do
      expect(@result).to be_a(Hash)
      expect(@result).to have_key('cast')
    end

    it 'extracts cast array' do
      expect(@items).to be_an(Array)
      expect(@items).to_not be_empty
    end

    it 'cast - name' do
      expect(@items[0][:name]).to be_a(String)
      expect(@items[0][:name]).to_not be_empty
    end

    it 'cast - link' do
      expect(@items[0][:link]).to be_a(String)
      expect(@items[0][:link]).to_not be_empty
    end

    it 'includes a known cast member' do
      names = @items.map { |a| a[:name] }.compact
      expect(names).to include('Elijah Wood')
    end

    it 'does not include action tiles' do
      names = @items.map { |a| a[:name].to_s.downcase }
      expect(names).not_to include('more', 'show more', 'see more')
    end
  end

  describe 'George Orwell Books' do
    before :all do
      @result = scrape_fixture('george-orwell-books.html')
      @items = @result['books']
    end

    it 'returns a hash with books key' do
      expect(@result).to be_a(Hash)
      expect(@result).to have_key('books')
    end

    it 'extracts books array' do
      expect(@items).to be_an(Array)
      expect(@items).to_not be_empty
    end

    it 'books - name' do
      expect(@items[0][:name]).to be_a(String)
      expect(@items[0][:name]).to_not be_empty
    end

    it 'books - link' do
      expect(@items[0][:link]).to be_a(String)
      expect(@items[0][:link]).to_not be_empty
    end

    it 'books - image' do
      expect(@items[0][:image]).to be_a(String)
      expect(@items[0][:image]).to_not be_empty
    end

    it 'includes a known book' do
      names = @items.map { |a| a[:name] }.compact
      expect(names).to include('Homage to Catalonia')
    end

    it 'does not include action tiles' do
      names = @items.map { |a| a[:name].to_s.downcase }
      expect(names).not_to include('more', 'show more', 'see more')
    end
  end

  describe 'Output format' do
    before :all do
      @result = scrape_fixture('van-gogh-paintings.html')
    end

    it 'produces valid JSON output' do
      expect { JSON.generate(@result) }.not_to raise_error
    end
  end
end