Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .rspec
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
--require spec_helper
--format documentation
--color
6 changes: 6 additions & 0 deletions Gemfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# frozen_string_literal: true

source 'https://rubygems.org'

# HTML parser used by lib/google_carousel_parser.rb.
gem 'nokolexbor'
# Gemfile.lock records pry under DEPENDENCIES; declaring it here keeps the
# Gemfile and the lockfile in agreement so a frozen `bundle install` succeeds.
gem 'pry'
# Test framework (see .rspec and spec/).
gem 'rspec'
48 changes: 48 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
GEM
remote: https://rubygems.org/
specs:
coderay (1.1.3)
diff-lcs (1.6.2)
io-console (0.8.2)
method_source (1.1.0)
nokolexbor (0.6.2)
nokolexbor (0.6.2-arm64-darwin)
nokolexbor (0.6.2-x86_64-darwin)
nokolexbor (0.6.2-x86_64-linux)
pry (0.16.0)
coderay (~> 1.1)
method_source (~> 1.0)
reline (>= 0.6.0)
reline (0.6.3)
io-console (~> 0.5)
rspec (3.13.2)
rspec-core (~> 3.13.0)
rspec-expectations (~> 3.13.0)
rspec-mocks (~> 3.13.0)
rspec-core (3.13.6)
rspec-support (~> 3.13.0)
rspec-expectations (3.13.5)
diff-lcs (>= 1.2.0, < 2.0)
rspec-support (~> 3.13.0)
rspec-mocks (3.13.7)
diff-lcs (>= 1.2.0, < 2.0)
rspec-support (~> 3.13.0)
rspec-support (3.13.6)

PLATFORMS
aarch64-linux-gnu
aarch64-linux-musl
arm-linux-gnu
arm-linux-musl
arm64-darwin
x86_64-darwin
x86_64-linux-gnu
x86_64-linux-musl

DEPENDENCIES
nokolexbor
pry
rspec

BUNDLED WITH
2.7.2
71 changes: 71 additions & 0 deletions files/jackson5-members/expected-array.json

Large diffs are not rendered by default.

61 changes: 61 additions & 0 deletions files/jackson5-members/index.html

Large diffs are not rendered by default.

313 changes: 313 additions & 0 deletions files/kanye-albums/expected-array.json

Large diffs are not rendered by default.

54 changes: 54 additions & 0 deletions files/kanye-albums/index.html

Large diffs are not rendered by default.

141 changes: 141 additions & 0 deletions files/lakers-roaster/expected-array.json

Large diffs are not rendered by default.

54 changes: 54 additions & 0 deletions files/lakers-roaster/index.html

Large diffs are not rendered by default.

412 changes: 412 additions & 0 deletions files/stranger-things-cast/expected-array.json

Large diffs are not rendered by default.

50 changes: 50 additions & 0 deletions files/stranger-things-cast/index.html

Large diffs are not rendered by default.

File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes
59 changes: 59 additions & 0 deletions lib/google_carousel_parser.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
require 'nokolexbor'

# Extracts the entries of a Google result-page carousel from raw HTML.
#
# The carousel container is located through its `data-attrid` attribute, whose
# value is discovered by scanning the raw markup with ATTR_ID_REGEX. Every
# anchor inside the container that wraps an `<img>` becomes one result hash.
#
# Usage:
#   GoogleCarouselParser.new(html_string).call
#   # => { "heading" => [{ "link" => ..., "name" => ..., "image" => ... }, ...] }
class GoogleCarouselParser
  attr_reader :html, :attr_id

  # Shape of the carousel's `data-attrid` value: `kc:/<word>/<word>:<word>`.
  ATTR_ID_REGEX = %r{kc:/\w*/\w*:\w*}

  # Prefix applied to hrefs that do not already start with "http".
  GOOGLE_BASE_URL = 'https://www.google.com'.freeze

  # Captures (1) an inline `data:image...` payload and (2) the element id it is
  # assigned to, from a script that calls `_setImagesSrc`.
  IMAGE_SCRIPT_REGEX = /.*(data:image.*)';.*\['([\w-]*)'\];/

  # A JavaScript `\xNN` escape; group 1 holds the two hex digits.
  JS_HEX_ESCAPE_REGEX = /\\x([0-9a-fA-F]{2})/

  private_constant :GOOGLE_BASE_URL, :IMAGE_SCRIPT_REGEX, :JS_HEX_ESCAPE_REGEX

  # @param html [String] raw HTML of the result page
  def initialize(html)
    # The attr id must be read from the raw string before it is parsed.
    @attr_id = html[ATTR_ID_REGEX]
    @html = Nokolexbor::HTML(html)
  end

  # Parses the carousel.
  #
  # @return [Hash] a single-pair hash `{ heading => [item, ...] }`, or an empty
  #   hash when the document contains no carousel container. `heading` is the
  #   lower-cased text of the selected tab if there is one, otherwise the
  #   nearest heading found by #extract_heading (which may be nil).
  def call
    carousel_block = html.at_css("div[data-attrid='#{attr_id}']")
    # Without a container there is nothing to parse; previously this raised
    # NoMethodError on nil.
    return {} unless carousel_block

    selected_tab = html.at_css('[role="tab"][aria-selected="true"]')
    heading = selected_tab&.text&.downcase || extract_heading(carousel_block)
    data = carousel_block.css('a').map { |el| parse_element(el) }.compact

    { heading => data }
  end

  private

  # Builds the result hash for one anchor.
  #
  # @param el [#at_css, #css, #attributes] an `<a>` node from the carousel
  # @return [Hash, nil] nil when the anchor contains no `<img>`; otherwise a
  #   hash with 'link', 'name', 'image' and, when any text nodes follow the
  #   first one, 'extensions'
  def parse_element(el)
    img_attrs = el.at_css('img')&.attributes
    return unless img_attrs

    # First text node is the name; any remaining ones are extensions.
    name, *extensions = el.css('::text').map(&:text)

    attrs = { 'link' => construct_link(el), 'name' => name, 'image' => image_source(img_attrs) }
    attrs['extensions'] = extensions unless extensions.empty?
    attrs
  end

  # Resolves the image for an `<img>`: images carrying an `id` are looked up in
  # the inline-script table, all others use their `data-src` attribute.
  #
  # @param img_attrs [Hash] attribute name => attribute object
  # @return [String, nil] nil when neither source is available
  def image_source(img_attrs)
    return image_sources[img_attrs['id'].value] if img_attrs.key?('id')

    img_attrs['data-src']&.value
  end

  # @param el [#attributes] an `<a>` node
  # @return [String, nil] the href, prefixed with the Google origin unless it
  #   already starts with "http"; nil when the anchor has no href
  def construct_link(el)
    href = el.attributes['href']&.value
    return unless href

    href.start_with?('http') ? href : "#{GOOGLE_BASE_URL}#{href}"
  end

  # Walks up from the carousel, and at each ancestor looks for a
  # `div[role='heading']` inside it.
  #
  # @param carousel [#parent] node to start from
  # @return [String, nil] lower-cased text of the heading's last text node, or
  #   nil when no ancestor contains a heading (previously this recursed past
  #   the root and raised NoMethodError)
  def extract_heading(carousel)
    node = carousel
    while (node = node.parent)
      heading = node.at_css("div[role='heading']")
      return heading.css('::text').last&.text&.downcase if heading
    end

    nil
  end

  # Memoized table of element id => inline image data, collected from every
  # `<script>` whose text mentions `_setImagesSrc`.
  #
  # @return [Hash{String => String}]
  def image_sources
    @image_sources ||= begin
      scripts = html.css('script').map(&:text).select { |text| text.include?('_setImagesSrc') }
      scripts.each_with_object({}) do |script, acc|
        src, id = script.match(IMAGE_SCRIPT_REGEX)&.captures
        acc[id] = replace_js_escape_sequences(src) if src && id
      end
    end
  end

  # Replaces each `\xNN` escape with the character whose code is hex NN.
  #
  # @param src [String]
  # @return [String]
  def replace_js_escape_sequences(src)
    src.gsub(JS_HEX_ESCAPE_REGEX) { Regexp.last_match(1).hex.chr }
  end
end
17 changes: 17 additions & 0 deletions spec/lib/google_carousel_parser_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
require 'spec_helper'
require_relative '../../lib/google_carousel_parser'

RSpec.describe GoogleCarouselParser do
  subject(:results) { described_class.new(html).call }

  # Fixtures live in <repo>/files/<name>/{index.html,expected-array.json}.
  # The glob is anchored to this file's directory so the suite finds them no
  # matter which working directory rspec is started from; a cwd-relative glob
  # would match nothing elsewhere and silently define zero examples.
  fixture_dirs = Dir[File.expand_path('../../files/*', __dir__)]
                 .select { |path| File.directory?(path) } # ignore stray files next to the fixture folders
                 .sort                                     # stable example order across platforms

  fixture_dirs.each do |folder|
    context "when carousel is #{File.basename(folder)}" do
      let(:html) { File.read(File.join(folder, 'index.html')) }
      let(:json) { JSON.parse(File.read(File.join(folder, 'expected-array.json'))) }

      it 'parses correctly' do
        expect(results).to eq json
      end
    end
  end
end
85 changes: 85 additions & 0 deletions spec/spec_helper.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
require 'json'

RSpec.configure do |config|
  # rspec-expectations settings. An alternate assertion/expectation library
  # (such as wrong or the stdlib/minitest assertions) could be selected here.
  #
  # Including chain clauses makes the `description` and `failure_message` of
  # custom matchers contain the text of helper methods defined with `chain`:
  #     be_bigger_than(2).and_smaller_than(4).description
  #     # => "be bigger than 2 and smaller than 4"
  # rather than:
  #     # => "be bigger than 2"
  # This will default to `true` in RSpec 4.
  config.expect_with(:rspec) do |expect_config|
    expect_config.include_chain_clauses_in_custom_matcher_descriptions = true
  end

  # rspec-mocks settings. An alternate test double library (such as bogus or
  # mocha) could be selected by changing the `mock_with` option.
  #
  # Verifying partial doubles prevents mocking or stubbing a method that does
  # not exist on a real object. Generally recommended; will default to `true`
  # in RSpec 4.
  config.mock_with(:rspec) do |mock_config|
    mock_config.verify_partial_doubles = true
  end

  # Shared context metadata is inherited by the metadata hash of host groups
  # and examples, instead of triggering implicit auto-inclusion in groups with
  # matching metadata. This will be the only behavior in RSpec 4; the option
  # exists solely for backwards compatibility in RSpec 3.
  config.shared_context_metadata_behavior = :apply_to_host_groups

# The settings below are suggested to provide a good initial experience
# with RSpec, but feel free to customize to your heart's content.
=begin
  # This allows you to limit a spec run to individual examples or groups
  # you care about by tagging them with `:focus` metadata. When nothing
  # is tagged with `:focus`, all examples get run. RSpec also provides
  # aliases for `it`, `describe`, and `context` that include `:focus`
  # metadata: `fit`, `fdescribe` and `fcontext`, respectively.
  config.filter_run_when_matching :focus

  # Allows RSpec to persist some state between runs in order to support
  # the `--only-failures` and `--next-failure` CLI options. We recommend
  # you configure your source control system to ignore this file.
  config.example_status_persistence_file_path = "spec/examples.txt"

  # Limits the available syntax to the non-monkey patched syntax that is
  # recommended. For more details, see:
  # https://rspec.info/features/3-12/rspec-core/configuration/zero-monkey-patching-mode/
  config.disable_monkey_patching!

  # This setting enables warnings. It's recommended, but in some cases may
  # be too noisy due to issues in dependencies.
  config.warnings = true

  # Many RSpec users commonly either run the entire suite or an individual
  # file, and it's useful to allow more verbose output when running an
  # individual spec file.
  if config.files_to_run.one?
    # Use the documentation formatter for detailed output,
    # unless a formatter has already been configured
    # (e.g. via a command-line flag).
    config.default_formatter = "doc"
  end

  # Print the 10 slowest examples and example groups at the
  # end of the spec run, to help surface which specs are running
  # particularly slow.
  config.profile_examples = 10

  # Run specs in random order to surface order dependencies. If you find an
  # order dependency and want to debug it, you can fix the order by providing
  # the seed, which is printed after each run.
  #     --seed 1234
  config.order = :random

  # Seed global randomization in this process using the `--seed` CLI option.
  # Setting this allows you to use `--seed` to deterministically reproduce
  # test failures related to randomization by passing the same `--seed` value
  # as the one that triggered the failure.
  Kernel.srand config.seed
=end
end