From 078c2ba17c55d7f58428ccc3d64f38fbd5d0fe84 Mon Sep 17 00:00:00 2001
From: Shibayan95
Date: Mon, 30 Mar 2026 16:07:21 +0530
Subject: [PATCH] Resolve conflict in cherry-pick of
 667262992a3609e9ff295e2a96c0887a10ca8df4 and change the commit message

---
 server/Gemfile                                     |   6 +
 server/Gemfile.lock                                |  21 +++++----
 .../reverse_etl/extractors/web_scraping.rb         |  55 +++-
 .../processors/text/chunk_processor.rb             |  60 ++
 .../processors/text/token_chunker.rb               |  60 ++
 server/lib/utils/constants.rb                      |  15 +-
 .../reports/data_app_summary_spec.rb               | 524 ++++++++++++++++++
 .../create_visual_component_spec.rb                | 100 ++++
 .../extractors/web_scraping_spec.rb                | 180 +++++-
 .../processors/text/chunk_processor_spec.rb        | 172 ++++++
 .../processors/text/token_chunker_spec.rb          | 176 ++++++
 11 files changed, 1356 insertions(+), 13 deletions(-)
 create mode 100644 server/lib/reverse_etl/processors/text/chunk_processor.rb
 create mode 100644 server/lib/reverse_etl/processors/text/token_chunker.rb
 create mode 100644 server/spec/enterprise/interactors/reports/data_app_summary_spec.rb
 create mode 100644 server/spec/enterprise/interactors/visual_components/create_visual_component_spec.rb
 create mode 100644 server/spec/lib/reverse_etl/processors/text/chunk_processor_spec.rb
 create mode 100644 server/spec/lib/reverse_etl/processors/text/token_chunker_spec.rb

diff --git a/server/Gemfile b/server/Gemfile
index bb94e30ae..fb03fcd3d 100644
--- a/server/Gemfile
+++ b/server/Gemfile
@@ -118,4 +118,10 @@ gem "rexml", "~> 3.4.2"
 
 gem "git", "~> 4.0"
 
 gem "pdf-reader", "~> 2.15"
+
+gem "ruby-openai", "~> 8.3"
+
+gem "paper_trail", "~> 17.0"
+
+gem "tokenizers", "~> 0.6.3"
diff --git a/server/Gemfile.lock b/server/Gemfile.lock
index 8b7d05ce5..c2ee9cb45 100644
--- a/server/Gemfile.lock
+++ b/server/Gemfile.lock
@@ -2251,14 +2251,18 @@ GEM
       activerecord (>= 5.2)
     thor (1.4.0)
     timecop (0.9.8)
-    timeout (0.4.3)
-    tiny_tds (3.2.0)
-      bigdecimal (~> 3)
-    tiny_tds (3.2.0-aarch64-linux-gnu)
-      bigdecimal (~> 3)
-    tiny_tds (3.2.0-x86_64-linux-gnu)
-      bigdecimal (~> 3)
-    traces (0.15.2)
+    timeout (0.6.1)
+    tiny_tds (3.4.0)
+      bigdecimal (>= 2.0.0)
+    tiny_tds (3.4.0-aarch64-linux-gnu)
+      bigdecimal (>= 2.0.0)
+    tiny_tds (3.4.0-x86_64-linux-gnu)
+      bigdecimal (>= 2.0.0)
+    tokenizers (0.6.3-aarch64-linux)
+    tokenizers (0.6.3-arm64-darwin)
+    tokenizers (0.6.3-x86_64-darwin)
+    tokenizers (0.6.3-x86_64-linux)
+    traces (0.18.2)
     track_open_instances (0.1.15)
     trailblazer-option (0.1.2)
     ttfunk (1.8.0)
@@ -2369,6 +2373,7 @@ DEPENDENCIES
   strong_migrations
   temporal-ruby!
   timecop
+  tokenizers (~> 0.6.3)
   tzinfo-data
   webrick
   xmlrpc (~> 0.3.3)
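A quick sketch of the two tokenizer backends behind the new dependencies, as
TokenChunker drives them further down in this patch (the sample text is
illustrative; the Hugging Face tokenizer is fetched over the network on first
use, and Tiktoken is assumed to already be available — the tiktoken gem is not
added here, but the code below calls it):

    require "tokenizers"

    hf = Tokenizers::Tokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
    hf.encode("Firecrawl page content").tokens.length   # counted against the 256-token limit

    openai = Tiktoken.encoding_for_model("text-embedding-ada-002")
    openai.encode("Firecrawl page content").length      # counted against the 8191-token limit
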
diff --git a/server/lib/reverse_etl/extractors/web_scraping.rb b/server/lib/reverse_etl/extractors/web_scraping.rb
index 0d93e715b..40d4d82b7 100644
--- a/server/lib/reverse_etl/extractors/web_scraping.rb
+++ b/server/lib/reverse_etl/extractors/web_scraping.rb
@@ -2,7 +2,10 @@
 
 module ReverseEtl
   module Extractors
+    class ChunkProcessingError < StandardError; end
+
     class WebScraping < Base
+      include ::Utils::Constants
       # TODO: Make it as class method
       def read(sync_run_id, activity)
         sync_run = SyncRun.find(sync_run_id)
@@ -21,6 +24,33 @@ def read(sync_run_id, activity)
 
       private
 
+      def generate_chunk_config(sync_run)
+        chunk_config = { chunk_size: 1000, chunk_overlap: 200 }
+        mappings = sync_run.sync.configuration
+        vector_mappings = mappings.select { |mapping| mapping["mapping_type"] == "vector" }
+        unless vector_mappings.empty?
+          # Pick the supported embedding model with the smallest token limit
+          vector_config = vector_mappings
+                          .select { |mapping| EMBEDDING_MODEL_TOKEN_LIMITS.key?(mapping["embedding_config"]["model"]) }
+                          .min_by { |mapping| EMBEDDING_MODEL_TOKEN_LIMITS[mapping["embedding_config"]["model"]] }
+          unless vector_config.nil?
+            chunk_config = {
+              model: vector_config["embedding_config"]["model"],
+              provider: vector_config["embedding_config"]["mode"],
+              chunk_size: EMBEDDING_MODEL_TOKEN_LIMITS[vector_config["embedding_config"]["model"]]
+            }
+          end
+        end
+        chunk_config
+      end
+
+      def generate_chunks(sync_run, markdown_content)
+        chunk_config = generate_chunk_config(sync_run)
+        ReverseEtl::Processors::Text::ChunkProcessor.new.process(chunk_config, markdown_content)
+      rescue StandardError => e
+        raise ReverseEtl::Extractors::ChunkProcessingError, "Failed to process scraped content: #{e.message}"
+      end
+
       def fetch_records(sync_run)
         source_client = setup_source_client(sync_run.sync)
         result = source_client.read(sync_run.sync.to_protocol)
@@ -44,9 +74,14 @@ def process_result(result, sync_run)
         model = sync_run.sync.model
         result.each do |res|
           record = res.record
-          fingerprint = generate_fingerprint(record.data)
-          sync_record = process_record(record, sync_run, model)
-          skipped_rows += update_or_create_sync_record(sync_record, record, sync_run, fingerprint) ? 0 : 1
+          chunk_records = generate_chunks(sync_run, record.data[:markdown])
+          chunk_records.each do |chunk_record|
+            new_record = build_record(chunk_record, record.data[:metadata])
+            fingerprint = generate_fingerprint(new_record.data)
+            sync_record = process_record(new_record, sync_run, model)
+            total_query_rows += 1
+            skipped_rows += update_or_create_sync_record(sync_record, new_record, sync_run, fingerprint) ? 0 : 1
+          end
         end
         sync_run.update(
           current_offset: 0,
@@ -54,6 +89,20 @@ def process_result(result, sync_run)
           skipped_rows:
         )
       end
+
+      def build_record(message, metadata)
+        record_data = message.with_indifferent_access
+        # Reshape a chunk into the protocol record that process_record expects
+        Multiwoven::Integrations::Protocol::RecordMessage.new(
+          data: {
+            markdown: record_data["text"],
+            markdown_hash: record_data["element_id"],
+            metadata:,
+            url: JSON.parse(metadata)["url"]
+          },
+          emitted_at: Time.zone.now.to_i
+        )
+      end
     end
   end
 end
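A worked example of the selection rule in generate_chunk_config above: across
all vector mappings, the supported embedding model with the smallest token
limit wins (mapping hashes are illustrative; the limits come from
EMBEDDING_MODEL_TOKEN_LIMITS further down in this patch):

    mappings = [
      { "mapping_type" => "vector",
        "embedding_config" => { "model" => "text-embedding-ada-002", "mode" => "open_ai" } },
      { "mapping_type" => "vector",
        "embedding_config" => { "model" => "all-MiniLM-L6-v2", "mode" => "hugging_face" } },
      { "mapping_type" => "standard", "from" => "markdown", "to" => "markdown" }
    ]
    # all-MiniLM-L6-v2 (256 tokens) < text-embedding-ada-002 (8191 tokens), so:
    #   generate_chunk_config(sync_run)
    #   # => { model: "all-MiniLM-L6-v2", provider: "hugging_face", chunk_size: 256 }
    # With no vector mappings it falls back to { chunk_size: 1000, chunk_overlap: 200 }.
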
diff --git a/server/lib/reverse_etl/processors/text/chunk_processor.rb b/server/lib/reverse_etl/processors/text/chunk_processor.rb
new file mode 100644
index 000000000..c4acd6afb
--- /dev/null
+++ b/server/lib/reverse_etl/processors/text/chunk_processor.rb
@@ -0,0 +1,60 @@
+# frozen_string_literal: true
+
+module ReverseEtl
+  module Processors
+    module Text
+      class ChunkProcessor
+        DEFAULT_CHUNK_SIZE = 1000
+        DEFAULT_CHUNK_OVERLAP = 200
+
+        def chunk_processor(chunk_config)
+          if legacy?(chunk_config)
+            processor_type = ENV["CHUNK_PROCESSOR"] || "langchain_rb"
+            processor_class_name = "ReverseEtl::Processors::Text::#{processor_type.camelize}"
+            processor_class_name.constantize.new
+          else
+            ReverseEtl::Processors::Text::TokenChunker.new
+          end
+        end
+
+        # Process content into chunks and tag each chunk with file metadata
+        def process(chunk_config, content, metadata = {})
+          chunk_config[:chunk_size] ||= DEFAULT_CHUNK_SIZE
+          chunk_config[:chunk_overlap] ||= DEFAULT_CHUNK_OVERLAP
+          content = cleanup(content)
+
+          chunks = chunk_processor(chunk_config).process(chunk_config, content)
+          format_chunks(chunks, metadata)
+        end
+
+        private
+
+        def format_chunks(chunks, metadata)
+          chunks.map do |chunk|
+            {
+              element_id: Digest::MD5.hexdigest(chunk),
+              text: chunk,
+              created_date: metadata[:file_created_date],
+              modified_date: metadata[:file_modified_date],
+              filename: metadata[:file_name],
+              filetype: metadata[:file_type],
+              created_at: Time.current
+            }
+          end
+        end
+
+        def cleanup(text)
+          text
+            .gsub(/<[^>]+>/, " ")  # Remove HTML tags
+            .gsub(/\\\[\\\n/, " ") # Drop escaped-bracket/newline artifacts before whitespace is collapsed
+            .gsub(/\s+/, " ")      # Collapse runs of whitespace and newlines
+            .strip                 # Trim leading/trailing spaces
+        end
+
+        def legacy?(chunk_config)
+          chunk_config[:model].blank? && chunk_config[:provider].blank?
+        end
+      end
+    end
+  end
+end
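The processor selection in ChunkProcessor#chunk_processor routes purely on the
shape of the config; a short sketch of both paths (return classes as wired up
in this patch):

    processor = ReverseEtl::Processors::Text::ChunkProcessor.new

    # Legacy config (no :model/:provider): class named by ENV["CHUNK_PROCESSOR"],
    # defaulting to the LangchainRb splitter.
    processor.chunk_processor({})
    # => ReverseEtl::Processors::Text::LangchainRb

    # Vector config: the token-aware chunker defined below.
    processor.chunk_processor(model: "all-MiniLM-L6-v2", provider: "hugging_face")
    # => ReverseEtl::Processors::Text::TokenChunker
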
diff --git a/server/lib/reverse_etl/processors/text/token_chunker.rb b/server/lib/reverse_etl/processors/text/token_chunker.rb
new file mode 100644
index 000000000..b07c4a8c4
--- /dev/null
+++ b/server/lib/reverse_etl/processors/text/token_chunker.rb
@@ -0,0 +1,60 @@
+# frozen_string_literal: true
+
+module ReverseEtl
+  module Processors
+    module Text
+      class TokenChunker < BaseDocProcessor
+        include ::Utils::Constants
+        def process(chunk_config, content)
+          @model = chunk_config[:model]
+          @provider = chunk_config[:provider]
+          @chunk_size = chunk_config[:chunk_size]
+          validate_model
+          initialize_tokeniser
+          tokens = get_tokens(content)
+          tokens_to_text_chunks(tokens, @chunk_size)
+        end
+
+        private
+
+        def open_ai?
+          @provider == "open_ai"
+        end
+
+        def tokens_to_text_chunks(arr, chunk_size)
+          arr.each_slice(chunk_size).map do |chunk|
+            if open_ai?
+              @tokeniser.decode(chunk)
+            else
+              @tokeniser.decode(chunk).join(" ")
+            end
+          end
+        end
+
+        def validate_model
+          raise TypeError, "Model is required" if @model.blank?
+          raise TypeError, "Embedding model #{@model} not supported" unless EMBEDDING_MODEL_TOKEN_LIMITS.key?(@model)
+        end
+
+        def initialize_tokeniser
+          raise TypeError, "Provider is required" if @provider.blank?
+          raise TypeError, "Provider #{@provider} not supported" unless SUPPORTED_PROVIDERS.include?(@provider)
+
+          @tokeniser = if open_ai?
+                         Tiktoken.encoding_for_model(@model)
+                       else
+                         Tokenizers::Tokenizer.from_pretrained("sentence-transformers/#{@model}")
+                       end
+        end
+
+        def get_tokens(text)
+          if open_ai?
+            @tokeniser.encode(text)
+          else
+            @tokeniser.encode(text).tokens
+          end
+        end
+      end
+    end
+  end
+end
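A sketch of TokenChunker in isolation (the input variable is illustrative;
with a 256-token limit, anything longer is sliced into successive 256-token
spans and decoded back to text):

    chunker = ReverseEtl::Processors::Text::TokenChunker.new
    chunker.process(
      { model: "all-MiniLM-L6-v2", provider: "hugging_face", chunk_size: 256 },
      scraped_markdown # assumed: a long markdown string from the Firecrawl read
    )
    # => ["first ~256-token span", "next span", ...]
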
diff --git a/server/lib/utils/constants.rb b/server/lib/utils/constants.rb
index b6cbe87d5..05efe3b80 100644
--- a/server/lib/utils/constants.rb
+++ b/server/lib/utils/constants.rb
@@ -1,7 +1,7 @@
 # frozen_string_literal: true
 
 module Utils
-  module Constants
+  module Constants # rubocop:disable Metrics/ModuleLength
     # whenever resource group mappung is updated
     # enterpise role contracts need to be updated also
     RESOURCE_GROUP_MAPPING = {
@@ -45,5 +45,18 @@ module Constants
       description: "Manage and access the AI assistant and its configurations"
     }
   }.freeze
+
+    EMBEDDING_MODEL_TOKEN_LIMITS = {
+      "text-embedding-3-small" => 8191,
+      "text-embedding-3-large" => 8191,
+      "text-embedding-ada-002" => 8191,
+      "paraphrase-MiniLM-L12-v2" => 128,
+      "all-MiniLM-L6-v2" => 256,
+      "multi-qa-MiniLM-L6-cos-v1" => 512,
+      "all-mpnet-base-v2" => 384,
+      "msmarco-MiniLM-L6-cos-v5" => 512
+    }.freeze
+
+    SUPPORTED_PROVIDERS = %w[open_ai hugging_face].freeze
   end
 end
diff --git a/server/spec/enterprise/interactors/reports/data_app_summary_spec.rb b/server/spec/enterprise/interactors/reports/data_app_summary_spec.rb
new file mode 100644
index 000000000..af44840a0
--- /dev/null
+++ b/server/spec/enterprise/interactors/reports/data_app_summary_spec.rb
@@ -0,0 +1,524 @@
+# frozen_string_literal: true
+
+require "rails_helper"
+
+RSpec.describe Reports::DataAppSummary, type: :interactor do
+  let!(:workspace) { create(:workspace) }
+  let!(:data_app1) { create(:data_app, workspace:) }
+  let!(:data_app2) { create(:data_app, workspace:) }
+  let!(:data_app3) { create(:data_app, workspace:) }
+  let!(:data_app4) { create(:data_app, workspace:, rendering_type: "no_code") }
+  let!(:visual_component1) { data_app1.visual_components.first }
+  let!(:visual_component2) { data_app2.visual_components.first }
+  let!(:visual_component3) { create(:visual_component, component_type: "chat_bot", data_app: data_app3, workspace:) }
+  let!(:data_app_session1) { create(:data_app_session, workspace:, data_app: data_app1) }
+  let!(:data_app_session2) { create(:data_app_session, workspace:, data_app: data_app2, created_at: 1.day.ago) }
+  let!(:data_app_session3) { create(:data_app_session, workspace:, data_app: data_app3) }
+  let!(:feedback1) do
+    create(:feedback, workspace:, data_app: data_app1, visual_component: visual_component1)
+  end
+  let!(:feedback2) do
+    create(:feedback, workspace:, data_app: data_app2, visual_component: visual_component2, created_at: 1.day.ago)
+  end
+  let(:context) do
+    {
+      time_period: "one_week", rendering_type: "embed", workspace:
+    }
+  end
+
+  before do
+    data_app3.visual_components.order(:id).first.update!(component_type: "chat_bot")
+    create(:chat_message, visual_component: visual_component3, role: 0, content: "Hi", session: data_app_session3)
+    create(:chat_message, visual_component: visual_component3, role: 1, content: "Hello! How can I help?",
+           session: data_app_session3)
+    create(:chat_message, visual_component: visual_component3, role: 0, content: "1+1", session: data_app_session3)
+    create(:chat_message, visual_component: visual_component3, role: 1, content: "1+1=2", session: data_app_session3)
+    create(:message_feedback, visual_component: visual_component3, workspace:, data_app: data_app3)
+    create(:message_feedback, visual_component: visual_component3, workspace:, data_app: data_app3)
+    create(:message_feedback, visual_component: visual_component3, workspace:, data_app: data_app3)
+  end
+
+  describe "#call" do
+    subject { described_class.call(context) }
+
+    context "with valid type and time_period" do
+      let(:time_period) { "one_week" }
+
+      it "returns activity data for data apps within the specified time period" do
+        result = subject
+        expect(result).to be_a_success
+        expect(result.activity[:data_apps].size).to eq(3)
+
+        data_app_report = result.activity[:data_apps].second
+        expect(data_app_report[:data_app_id]).to eq(data_app2.id)
+        expect(data_app_report[:data_app_name]).to eq(data_app2.name)
+        expect(data_app_report[:is_chat_bot]).to eq(false)
+        time_slice = data_app_report[:slices][5]
+        expect(time_slice[:session_count]).to eq(1)
+        expect(time_slice[:feedback_count]).to eq(1)
+
+        data_app_report = result.activity[:data_apps].third
+        expect(data_app_report[:data_app_id]).to eq(data_app1.id)
+        expect(data_app_report[:data_app_name]).to eq(data_app1.name)
+        expect(data_app_report[:is_chat_bot]).to eq(false)
+        expect(data_app_report[:total_sessions]).to eq(1)
+        expect(data_app_report[:total_feedback_responses]).to eq(1)
+        expect(data_app_report[:slices].size).to be > 0
+        time_slice = data_app_report[:slices].last
+        expect(time_slice[:session_count]).to eq(1)
+        expect(time_slice[:feedback_count]).to eq(1)
+      end
+    end
+
+    context "with chat bot type and time_period" do
+      let(:time_period) { "one_week" }
+
+      it "returns activity data for data apps within the specified time period" do
+        result = subject
+        expect(result).to be_a_success
+        expect(result.activity[:data_apps].size).to eq(3)
+
+        data_app_report = result.activity[:data_apps].first
+        expect(data_app_report[:data_app_id]).to eq(data_app3.id)
+        expect(data_app_report[:data_app_name]).to eq(data_app3.name)
+        expect(data_app_report[:is_chat_bot]).to eq(true)
+        expect(data_app_report[:total_chat_messages]).to eq(2)
+        expect(data_app_report[:total_messages_feedback_responses]).to eq(3)
+        expect(data_app_report[:slices].size).to be > 0
+        time_slice = data_app_report[:slices].last
+        expect(time_slice[:chat_messages_count]).to eq(2)
+        expect(time_slice[:message_feedback_count]).to eq(3)
+      end
+    end
+
+    context "with data app filtering" do
+      it "filters data apps by rendering type" do
+        context[:rendering_type] = "no_code"
+        result = subject
+        expect(result.activity[:data_apps].size).to eq(1)
+        data_app_report = result.activity[:data_apps].first
+        expect(data_app_report[:data_app_id]).to eq(data_app4.id)
+        expect(data_app_report[:data_app_name]).to eq(data_app4.name)
+        expect(data_app_report[:is_chat_bot]).to eq(false)
+      end
+    end
+
+    context "with different time_periods" do
+      it "calculates the correct start_time for 'one_day'" do
+        context[:time_period] = "one_day"
+        result = subject
+        expect(result).to be_a_success
+        expect(result.activity[:time_period]).to eq("one_day")
+      end
+
+      it "calculates the correct start_time for 'thirty_days'" do
+        context[:time_period] = "thirty_days"
+        result = subject
+        expect(result).to
be_a_success + expect(result.activity[:time_period]).to eq("thirty_days") + end + end + + context "with custom date ranges" do + include ActiveSupport::Testing::TimeHelpers + + around do |example| + travel_to Time.zone.parse("2025-09-16 12:00:00 UTC") do + example.run + end + end + + context "with custom start_date only" do + let(:start_date) { Date.new(2024, 1, 15) } + let(:context) do + { + time_period: "custom", + start_date:, + end_date: nil, + rendering_type: "embed", + workspace: + } + end + + it "uses custom start_date and calculates end_date as min(start_date + 30.days, now)" do + result = subject + expect(result).to be_a_success + expect(result.activity[:time_period]).to eq("custom") + + # Verify that the data is filtered correctly for the custom range + expect(result.activity[:data_apps]).to be_present + + # Verify asset summary counts are present and correctly structured + data_app_report = result.activity[:data_apps].first + expect(data_app_report).to have_key(:data_app_id) + expect(data_app_report).to have_key(:data_app_name) + expect(data_app_report).to have_key(:is_chat_bot) + expect(data_app_report).to have_key(:slices) + + # Verify counts are present based on app type + if data_app_report[:is_chat_bot] + expect(data_app_report).to have_key(:total_chat_messages) + expect(data_app_report).to have_key(:total_messages_feedback_responses) + expect(data_app_report[:total_chat_messages]).to be >= 0 + expect(data_app_report[:total_messages_feedback_responses]).to be >= 0 + else + expect(data_app_report).to have_key(:total_sessions) + expect(data_app_report).to have_key(:total_feedback_responses) + expect(data_app_report[:total_sessions]).to be >= 0 + expect(data_app_report[:total_feedback_responses]).to be >= 0 + end + end + end + + context "with custom start_date and end_date" do + let(:start_date) { Date.new(2024, 1, 10) } + let(:end_date) { Date.new(2024, 1, 20) } + let(:context) do + { + time_period: "custom", + start_date:, + end_date:, + rendering_type: "embed", + workspace: + } + end + + it "uses exact custom start_date and end_date" do + result = subject + expect(result).to be_a_success + expect(result.activity[:time_period]).to eq("custom") + + # Verify that the data is filtered correctly for the custom range + expect(result.activity[:data_apps]).to be_present + + # Verify asset summary counts are correctly calculated for custom date range + data_app_report = result.activity[:data_apps].first + expect(data_app_report).to have_key(:slices) + + # Verify slices contain proper count structure + if data_app_report[:slices].any? 
+ slice = data_app_report[:slices].first + expect(slice).to have_key(:time_slice) + + if data_app_report[:is_chat_bot] + expect(slice).to have_key(:chat_messages_count) + expect(slice).to have_key(:message_feedback_count) + expect(slice[:chat_messages_count]).to be >= 0 + expect(slice[:message_feedback_count]).to be >= 0 + else + expect(slice).to have_key(:session_count) + expect(slice).to have_key(:feedback_count) + expect(slice[:session_count]).to be >= 0 + expect(slice[:feedback_count]).to be >= 0 + end + end + end + end + + context "with same start_date and end_date" do + let(:date) { Date.new(2024, 1, 15) } + let(:context) do + { + time_period: "custom", + start_date: date, + end_date: date, + rendering_type: "embed", + workspace: + } + end + + it "handles same start_date and end_date correctly" do + result = subject + expect(result).to be_a_success + expect(result.activity[:time_period]).to eq("custom") + + # Verify that the data is filtered correctly for the single day range + expect(result.activity[:data_apps]).to be_present + + # Verify asset summary counts for single day range + data_app_report = result.activity[:data_apps].first + expect(data_app_report).to have_key(:slices) + + # For single day range, verify counts are properly aggregated + if data_app_report[:is_chat_bot] + expect(data_app_report[:total_chat_messages]).to be >= 0 + expect(data_app_report[:total_messages_feedback_responses]).to be >= 0 + else + expect(data_app_report[:total_sessions]).to be >= 0 + expect(data_app_report[:total_feedback_responses]).to be >= 0 + end + end + end + end + end + + describe "asset summary counts with custom date ranges" do + include ActiveSupport::Testing::TimeHelpers + + around do |example| + travel_to Time.zone.parse("2025-09-16 12:00:00 UTC") do + example.run + end + end + + let!(:workspace) { create(:workspace) } + let!(:data_app_visual) { create(:data_app, workspace:) } + let!(:data_app_chat) { create(:data_app, workspace:, visual_components_count: 0) } + let!(:visual_component) { data_app_visual.visual_components.first } + let!(:chat_component) { create(:visual_component, component_type: "chat_bot", data_app: data_app_chat, workspace:) } + + before do + # Create test data within a specific date range + travel_to(Time.zone.parse("2024-01-15 10:00:00 UTC")) + create(:data_app_session, workspace:, data_app: data_app_visual) + chat_session_in_range = create(:data_app_session, workspace:, data_app: data_app_chat) + create(:feedback, workspace:, data_app: data_app_visual, visual_component:) + create(:chat_message, visual_component: chat_component, role: 0, content: "Test message", + session: chat_session_in_range, workspace:) + create(:chat_message, visual_component: chat_component, role: 1, content: "Test response", + session: chat_session_in_range, workspace:) + create(:message_feedback, visual_component: chat_component, workspace:, data_app: data_app_chat) + + # Create test data outside the date range + travel_to(Time.zone.parse("2024-02-15 10:00:00 UTC")) + create(:data_app_session, workspace:, data_app: data_app_visual) + chat_session_out_of_range = create(:data_app_session, workspace:, data_app: data_app_chat) + create(:feedback, workspace:, data_app: data_app_visual, visual_component:) + create(:chat_message, visual_component: chat_component, role: 0, content: "Outside range message", + session: chat_session_out_of_range, workspace:) + create(:message_feedback, visual_component: chat_component, workspace:, data_app: data_app_chat) + + # Reset to current time for test execution 
+ travel_back + end + + context "with custom date range filtering" do + let(:start_date) { Date.new(2024, 1, 10) } + let(:end_date) { Date.new(2024, 1, 20) } + let(:context) do + { + time_period: "custom", + start_date:, + end_date:, + workspace: + } + end + + # TODO: Fix this test + xit "correctly filters asset counts by custom date range" do + result = described_class.call(context) + expect(result).to be_a_success + + # Find the visual app report + visual_app_report = result.activity[:data_apps].find { |app| app[:data_app_id] == data_app_visual.id } + expect(visual_app_report).to be_present + expect(visual_app_report[:is_chat_bot]).to eq(false) + # Counter cache shows total sessions (both Jan 15 and Feb 15) + expect(visual_app_report[:total_sessions]).to eq(2) + # Counter cache shows total feedbacks (both Jan 15 and Feb 15) + expect(visual_app_report[:total_feedback_responses]).to eq(2) + + # Find the chat app report (it will be detected as a regular visual app since chat component is not first) + chat_app_report = result.activity[:data_apps].find { |app| app[:data_app_id] == data_app_chat.id } + expect(chat_app_report).to be_present + expect(chat_app_report[:is_chat_bot]).to eq(false) # Chat component is not the first visual component + expect(chat_app_report[:total_sessions]).to eq(0) # No sessions created for chat app + expect(chat_app_report[:total_feedback_responses]).to eq(0) # No feedbacks created for chat app + end + + # TODO: Fix this test + xit "correctly filters slice counts by custom date range" do + result = described_class.call(context) + expect(result).to be_a_success + + # Check visual app slices + visual_app_report = result.activity[:data_apps].find { |app| app[:data_app_id] == data_app_visual.id } + expect(visual_app_report[:slices]).to be_present + + # Find the slice for the test date (2024-01-15) + test_date_slice = visual_app_report[:slices].find do |slice| + slice[:time_slice].to_date == Date.new(2024, 1, 15) + end + expect(test_date_slice).to be_present + expect(test_date_slice[:session_count]).to eq(1) + expect(test_date_slice[:feedback_count]).to eq(1) + + # Check chat app slices (it will be detected as a regular visual app) + chat_app_report = result.activity[:data_apps].find { |app| app[:data_app_id] == data_app_chat.id } + expect(chat_app_report[:slices]).to be_present + + # Since no sessions were created for the chat app, all slices should have 0 counts + chat_app_report[:slices].each do |slice| + expect(slice[:session_count]).to eq(0) + expect(slice[:feedback_count]).to eq(0) + end + end + end + + context "with custom start_date only (fallback end_date)" do + let(:start_date) { Date.new(2024, 1, 10) } + let(:context) do + { + time_period: "custom", + start_date:, + end_date: nil, + workspace: + } + end + + it "includes data within the calculated date range" do + result = described_class.call(context) + expect(result).to be_a_success + + # Counter cache shows total counts regardless of date range + visual_app_report = result.activity[:data_apps].find { |app| app[:data_app_id] == data_app_visual.id } + expect(visual_app_report[:total_sessions]).to eq(2) # Counter cache shows total sessions + expect(visual_app_report[:total_feedback_responses]).to eq(2) # Counter cache shows total feedbacks + + chat_app_report = result.activity[:data_apps].find { |app| app[:data_app_id] == data_app_chat.id } + expect(chat_app_report).to be_present + expect(chat_app_report[:is_chat_bot]).to eq(true) + # 3 chat messages total (2 in range + 1 outside); total_chat_messages = count 
/ 2 + expect(chat_app_report[:total_chat_messages]).to eq(1) + # 1 in range + 1 outside + expect(chat_app_report[:total_messages_feedback_responses]).to eq(2) + end + end + end + + describe "inherited BaseActivityReport methods" do + include ActiveSupport::Testing::TimeHelpers + + around do |example| + travel_to Time.zone.parse("2025-09-16 12:00:00 UTC") do + example.run + end + end + + let(:interactor) { described_class.new(context) } + + describe "#filter_params" do + context "with custom start_date only" do + let(:start_date) { Date.new(2024, 1, 15) } + let(:context) do + OpenStruct.new( + time_period: "custom", + start_date:, + end_date: nil, + workspace: + ) + end + + it "returns correct filter params with calculated end_date" do + params = interactor.filter_params + + expect(params[:time_period]).to eq("custom") + expect(params[:start_time]).to eq(start_date.to_time.beginning_of_day.in_time_zone("UTC")) + expected_end = [start_date.to_time.beginning_of_day.in_time_zone("UTC") + 30.days, Time.zone.now].min + expect(params[:end_time]).to eq(expected_end) + expect(params[:created_at]).to eq(params[:start_time]..params[:end_time]) + expect(params[:range]).to eq(params[:start_time]..params[:end_time]) + end + end + + context "with custom start_date and end_date" do + let(:start_date) { Date.new(2024, 1, 10) } + let(:end_date) { Date.new(2024, 1, 20) } + let(:context) do + OpenStruct.new( + time_period: "custom", + start_date:, + end_date:, + workspace: + ) + end + + it "returns correct filter params with exact dates" do + params = interactor.filter_params + + expect(params[:time_period]).to eq("custom") + expect(params[:start_time]).to eq(start_date.to_time.beginning_of_day.in_time_zone("UTC")) + expect(params[:end_time]).to eq(end_date.to_time.end_of_day.in_time_zone("UTC")) + expect(params[:created_at]).to eq(params[:start_time]..params[:end_time]) + expect(params[:range]).to eq(params[:start_time]..params[:end_time]) + end + end + + context "with predefined time period" do + let(:context) do + OpenStruct.new( + time_period: "one_week", + start_date: nil, + end_date: nil, + workspace: + ) + end + + it "returns correct filter params for predefined period" do + params = interactor.filter_params + + expect(params[:time_period]).to eq("one_week") + expect(params[:start_time]).to eq(6.days.ago.beginning_of_day.in_time_zone("UTC")) + expect(params[:end_time]).to eq(Time.zone.now) + expect(params[:created_at]).to eq(params[:start_time]..params[:end_time]) + expect(params[:range]).to eq(params[:start_time]..params[:end_time]) + end + end + end + + describe "#resolve_time_range" do + context "with custom dates" do + let(:start_date) { Date.new(2024, 1, 15) } + let(:end_date) { Date.new(2024, 1, 25) } + let(:context) do + OpenStruct.new( + start_date:, + end_date:, + time_period: "custom", + workspace: + ) + end + + it "returns custom time range" do + start_time, end_time = interactor.resolve_time_range + + expect(start_time).to eq(start_date.to_time.beginning_of_day.in_time_zone("UTC")) + expect(end_time).to eq(end_date.to_time.end_of_day.in_time_zone("UTC")) + end + end + + context "with predefined period" do + let(:context) do + OpenStruct.new( + time_period: "one_day", + start_date: nil, + end_date: nil, + workspace: + ) + end + + it "returns predefined time range" do + start_time, end_time = interactor.resolve_time_range + + expect(start_time).to eq(1.day.ago.beginning_of_day.in_time_zone("UTC")) + expect(end_time).to eq(Time.zone.now) + end + end + end + + describe 
"#calculate_predefined_range" do + let(:context) { OpenStruct.new(workspace:) } + let(:interactor) { described_class.new(context) } + + it "returns correct ranges for all predefined periods" do + periods = { one_day: 1, one_week: 6, thirty_days: 29 } + + periods.each do |period, days_ago| + start_time, end_time = interactor.calculate_predefined_range(period) + expect(start_time).to eq(days_ago.days.ago.beginning_of_day.in_time_zone("UTC")) + expect(end_time).to eq(Time.zone.now) + end + end + end + end +end diff --git a/server/spec/enterprise/interactors/visual_components/create_visual_component_spec.rb b/server/spec/enterprise/interactors/visual_components/create_visual_component_spec.rb new file mode 100644 index 000000000..6e71629c2 --- /dev/null +++ b/server/spec/enterprise/interactors/visual_components/create_visual_component_spec.rb @@ -0,0 +1,100 @@ +# frozen_string_literal: true + +require "rails_helper" + +RSpec.describe VisualComponents::CreateVisualComponent, type: :interactor do + describe ".call" do + let(:workspace) { create(:workspace) } + let(:data_app) { create(:data_app, workspace:, visual_components_count: 0) } + let!(:ai_ml_connector) { create(:connector, workspace:, connector_category: "AI Model") } + let!(:ai_ml_model) do + create(:model, query_type: :ai_ml, connector: ai_ml_connector, configuration: { harvesters: [] }, workspace:) + end + let!(:workflow) { create(:workflow, workspace:) } + let(:valid_visual_components_params) do + [ + { component_type: "doughnut", name: "Sales Pie Chart", configurable_id: ai_ml_model.id.to_s, + configurable_type: "Model", properties: { color: "blue" } } + ] + end + + let(:valid_workflow_visual_components_params) do + [ + { component_type: "chat_bot", name: "Workflow Chat Bot", configurable_id: workflow.id.to_s, + configurable_type: "Agents::Workflow", properties: { theme: "dark" } } + ] + end + + let(:invalid_visual_components_params) do + [ + { component_type: "doughnut", name: "Invalid Component", configurable_id: 0, configurable_type: "Model" } + ] + end + + let(:invalid_workflow_visual_components_params) do + [ + { component_type: "chat_bot", name: "Invalid Workflow Component", configurable_id: "invalid-uuid", + configurable_type: "Agents::Workflow" } + ] + end + + context "when all visual components are created successfully with model configurable" do + it "creates visual components and sets them in the context" do + result = described_class.call( + visual_components_params: valid_visual_components_params, + data_app:, + workspace: + ) + expect(result).to be_a_success + expect(data_app.visual_components.count).to eq(1) + expect(data_app.visual_components.map(&:name)).to include("Sales Pie Chart") + expect(data_app.visual_components.first.model).to eq(ai_ml_model) + expect(data_app.visual_components.first.workflow).to be_nil + end + end + + context "when all visual components are created successfully with workflow configurable" do + it "creates visual components and sets them in the context" do + result = described_class.call( + visual_components_params: valid_workflow_visual_components_params, + data_app:, + workspace: + ) + expect(result).to be_a_success + expect(data_app.visual_components.count).to eq(1) + expect(data_app.visual_components.map(&:name)).to include("Workflow Chat Bot") + expect(data_app.visual_components.first.workflow).to eq(workflow) + expect(data_app.visual_components.first.model).to be_nil + expect(data_app.visual_components.first.component_type).to eq("chat_bot") + end + end + + context "when some visual 
components fail to be created with model configurable" do + it "fails and sets the error message in the context" do + result = described_class.call( + visual_components_params: invalid_visual_components_params, + data_app:, + workspace: + ) + + expect(result).to be_a_failure + expect(result.message).to include("Configurable must exist") + expect(data_app.visual_components.count).to eq(0) + end + end + + context "when some visual components fail to be created with workflow configurable" do + it "fails and sets the error message in the context" do + result = described_class.call( + visual_components_params: invalid_workflow_visual_components_params, + data_app:, + workspace: + ) + + expect(result).to be_a_failure + expect(result.message).to include("Configurable must exist") + expect(data_app.visual_components.count).to eq(0) + end + end + end +end diff --git a/server/spec/lib/reverse_etl/extractors/web_scraping_spec.rb b/server/spec/lib/reverse_etl/extractors/web_scraping_spec.rb index 70d3110fc..ac9ee7a02 100644 --- a/server/spec/lib/reverse_etl/extractors/web_scraping_spec.rb +++ b/server/spec/lib/reverse_etl/extractors/web_scraping_spec.rb @@ -6,7 +6,24 @@ let(:source) { create(:connector, connector_type: "source", connector_name: "Snowflake") } let(:destination) { create(:connector, connector_type: "destination") } let!(:catalog) { create(:catalog, connector: destination) } - let(:sync) { create(:sync, source:, destination:) } + let(:standard_configuration) do + [{ "mapping_type" => "standard", "from" => "markdown", "to" => "markdown" }] + end + let(:vector_configuration) do + [ + { + "mapping_type" => "vector", + "from" => "markdown", + "to" => "embedding", + "embedding_config" => { + "model" => "text-embedding-ada-002", + "mode" => "open_ai", + "provider" => "open_ai" + } + } + ] + end + let(:sync) { create(:sync, source:, destination:, configuration: standard_configuration) } let(:sync_run1) do create(:sync_run, sync:, workspace: sync.workspace, source:, destination:, model: sync.model, status: "started") end @@ -109,5 +126,166 @@ expect(sync_run_pending).to have_state(:pending) end end + + context "when vector mapping is present" do + let(:sync) { create(:sync, source:, destination:, configuration: vector_configuration) } + let(:sync_run_vector) do + create(:sync_run, sync:, workspace: sync.workspace, source:, destination:, model: sync.model, status: "started") + end + + before do + sync.model.update(primary_key: "markdown_hash", query: "SELECT * FROM web_scraping_data") + end + + it "uses token chunking and creates sync records" do + expect(chunk_processor).to receive(:process) + .with({ model: "text-embedding-ada-002", provider: "open_ai", chunk_size: 8191 }, "Some content") + .and_return(chunked_records) + expect { subject.read(sync_run_vector.id, activity) }.to change { sync_run_vector.sync_records.count }.by(1) + sync_run_vector.reload + expect(sync_run_vector.total_query_rows).to eq(1) + expect(sync_run_vector.skipped_rows).to eq(0) + end + end + end + + describe "#generate_chunks" do + subject { described_class.new } + + context "when no vector mapping is present" do + it "calls process on chunk_processor with default chunk config" do + expect(chunk_processor).to receive(:process) + .with({ chunk_size: 1000, chunk_overlap: 200 }, "Some content") + .and_return(chunked_records) + result = subject.send(:generate_chunks, sync_run1, "Some content") + expect(result).to eq(chunked_records) + end + end + + context "when a vector mapping is present" do + let(:sync) { create(:sync, 
source:, destination:, configuration: vector_configuration) } + let(:sync_run_vector) do + create(:sync_run, sync:, workspace: sync.workspace, source:, destination:, model: sync.model, status: "started") + end + + before do + sync.model.update(primary_key: "markdown_hash", query: "SELECT * FROM web_scraping_data") + end + + it "calls ChunkProcessor#process with model, provider, and chunk_size from embedding_config" do + expect(chunk_processor).to receive(:process) + .with({ model: "text-embedding-ada-002", provider: "open_ai", chunk_size: 8191 }, "Some content") + .and_return(chunked_records) + result = subject.send(:generate_chunks, sync_run_vector, "Some content") + expect(result).to eq(chunked_records) + end + end + + context "when multiple vector mappings are present" do + let(:multi_vector_configuration) do + [ + { + "mapping_type" => "vector", + "from" => "markdown", + "to" => "embedding_1", + "embedding_config" => { "model" => "text-embedding-ada-002", "mode" => "open_ai" } + }, + { + "mapping_type" => "vector", + "from" => "markdown", + "to" => "embedding_2", + "embedding_config" => { "model" => "all-MiniLM-L6-v2", "mode" => "hugging_face" } + } + ] + end + let(:sync) { create(:sync, source:, destination:, configuration: multi_vector_configuration) } + let(:sync_run_multi) do + create(:sync_run, sync:, workspace: sync.workspace, source:, destination:, model: sync.model, status: "started") + end + + before do + sync.model.update(primary_key: "markdown_hash", query: "SELECT * FROM web_scraping_data") + end + + it "uses the model with the smallest token limit" do + expect(chunk_processor).to receive(:process) + .with({ model: "all-MiniLM-L6-v2", provider: "hugging_face", chunk_size: 256 }, "Some content") + .and_return(chunked_records) + result = subject.send(:generate_chunks, sync_run_multi, "Some content") + expect(result).to eq(chunked_records) + end + end + + context "when chunk processing raises a StandardError" do + before do + allow(chunk_processor).to receive(:process).and_raise(StandardError, "processing failed") + end + + it "raises ChunkProcessingError" do + expect { subject.send(:generate_chunks, sync_run1, "Some content") } + .to raise_error(ReverseEtl::Extractors::ChunkProcessingError, /processing failed/) + end + end + end + + describe "#fetch_records" do + subject { described_class.new } + + context "when source returns nil" do + before do + allow(client).to receive(:read).and_return(nil) + end + + it "raises a RuntimeError" do + expect { subject.send(:fetch_records, sync_run1) } + .to raise_error(RuntimeError, /Expected record in the result/) + end + end + + context "when source returns a non-array" do + before do + allow(client).to receive(:read).and_return("unexpected string") + end + + it "raises a RuntimeError" do + expect { subject.send(:fetch_records, sync_run1) } + .to raise_error(RuntimeError, /Expected record in the result/) + end + end + + context "when source returns a valid array" do + it "returns the result" do + result = subject.send(:fetch_records, sync_run1) + expect(result).to eq([record1]) + end + end + end + + describe "#build_record" do + subject { described_class.new } + + let(:message) { { "text" => "Hello world", "element_id" => "abc123" } } + let(:metadata) { "{\"url\": \"https://example.com\", \"source\": \"test\"}" } + + it "maps text to markdown and element_id to markdown_hash" do + record = subject.send(:build_record, message, metadata) + expect(record.data[:markdown]).to eq("Hello world") + expect(record.data[:markdown_hash]).to 
eq("abc123") + end + + it "parses the url from metadata JSON" do + record = subject.send(:build_record, message, metadata) + expect(record.data[:url]).to eq("https://example.com") + end + + it "stores the raw metadata string" do + record = subject.send(:build_record, message, metadata) + expect(record.data[:metadata]).to eq(metadata) + end + + it "returns a RecordMessage" do + record = subject.send(:build_record, message, metadata) + expect(record).to be_a(Multiwoven::Integrations::Protocol::RecordMessage) + end end end diff --git a/server/spec/lib/reverse_etl/processors/text/chunk_processor_spec.rb b/server/spec/lib/reverse_etl/processors/text/chunk_processor_spec.rb new file mode 100644 index 000000000..85fa0cada --- /dev/null +++ b/server/spec/lib/reverse_etl/processors/text/chunk_processor_spec.rb @@ -0,0 +1,172 @@ +# frozen_string_literal: true + +require "rails_helper" + +RSpec.describe ReverseEtl::Processors::Text::ChunkProcessor do + let(:processor) { described_class.new } + let(:chunk_config) do + { + chunk_size: 1000, + chunk_overlap: 200 + } + end + let(:content) { "This is a test content" } + let(:metadata) do + { + file_name: "test.pdf", + file_path: "/tmp/test.pdf", + file_type: "PDF", + size: 123, + file_created_date: Time.current.iso8601, + file_modified_date: Time.current.iso8601 + } + end + let(:chunks) { %w[chunk1 chunk2] } + let(:mock_chunk_processor) { instance_double(ReverseEtl::Processors::Text::LangchainRb) } + + before do + allow(ReverseEtl::Processors::Text::LangchainRb).to receive(:new).and_return(mock_chunk_processor) + allow(mock_chunk_processor).to receive(:process).and_return(chunks) + end + + describe "#process" do + context "processor selection (legacy config — no model/provider)" do + context "when CHUNK_PROCESSOR is not set" do + before { ENV["CHUNK_PROCESSOR"] = nil } + + it "uses LangchainRb as default processor" do + expect(ReverseEtl::Processors::Text::LangchainRb).to receive(:new).and_return(mock_chunk_processor) + processor.process(chunk_config, content, metadata) + end + end + + context "when CHUNK_PROCESSOR is set to langchain_rb" do + before { ENV["CHUNK_PROCESSOR"] = "langchain_rb" } + after { ENV["CHUNK_PROCESSOR"] = nil } + + it "uses the specified processor" do + expect(ReverseEtl::Processors::Text::LangchainRb).to receive(:new).and_return(mock_chunk_processor) + processor.process(chunk_config, content, metadata) + end + end + + context "when CHUNK_PROCESSOR refers to a non-existent class" do + before { ENV["CHUNK_PROCESSOR"] = "non_existent_processor" } + after { ENV["CHUNK_PROCESSOR"] = nil } + + it "raises NameError" do + expect { processor.process(chunk_config, content, metadata) }.to raise_error(NameError) + end + end + end + + context "processor selection (non-legacy config — has model/provider/chunk_size)" do + let(:mock_token_chunker) { instance_double(ReverseEtl::Processors::Text::TokenChunker) } + let(:non_legacy_config) { { model: "text-embedding-ada-002", provider: "open_ai", chunk_size: 8191 } } + + before do + allow(ReverseEtl::Processors::Text::TokenChunker).to receive(:new).and_return(mock_token_chunker) + allow(mock_token_chunker).to receive(:process).and_return(chunks) + end + + it "uses TokenChunker" do + expect(ReverseEtl::Processors::Text::TokenChunker).to receive(:new).and_return(mock_token_chunker) + processor.process(non_legacy_config, content, metadata) + end + end + + it "processes content into chunks with metadata" do + result = processor.process(chunk_config, content, metadata) + expect(result).to be_an(Array) + 
      expect(result.length).to eq(2)
+      expect(result.first).to include(
+        element_id: Digest::MD5.hexdigest("chunk1"),
+        text: "chunk1",
+        created_date: metadata[:file_created_date],
+        modified_date: metadata[:file_modified_date],
+        filename: metadata[:file_name],
+        filetype: metadata[:file_type],
+        created_at: kind_of(Time)
+      )
+    end
+
+    it "uses default chunk size and overlap if not provided" do
+      processor.process({}, content, metadata)
+      expect(mock_chunk_processor).to have_received(:process).with(
+        { chunk_size: 1000, chunk_overlap: 200 },
+        content
+      )
+    end
+
+    context "when processing raises an error" do
+      before do
+        allow(mock_chunk_processor).to receive(:process).and_raise(StandardError, "Processing failed")
+      end
+
+      it "propagates the error" do
+        expect do
+          processor.process(chunk_config, content, metadata)
+        end.to raise_error(StandardError,
+                           "Processing failed")
+      end
+    end
+  end
+
+  describe "#legacy?" do
+    it "returns true when model and provider are both blank" do
+      expect(processor.send(:legacy?, {})).to be true
+    end
+
+    it "returns true when only chunk_size is present (no model/provider)" do
+      expect(processor.send(:legacy?, { chunk_size: 8191 })).to be true
+    end
+
+    it "returns false when model is present" do
+      expect(processor.send(:legacy?, { model: "text-embedding-ada-002" })).to be false
+    end
+
+    it "returns false when provider is present" do
+      expect(processor.send(:legacy?, { provider: "open_ai" })).to be false
+    end
+
+    it "returns false when both model and provider are present" do
+      expect(processor.send(:legacy?,
+                            { model: "text-embedding-ada-002", provider: "open_ai", chunk_size: 8191 })).to be false
+    end
+  end
+
+  describe "#format_chunks" do
+    it "formats chunks with metadata" do
+      result = processor.send(:format_chunks, chunks, metadata)
+      expect(result).to be_an(Array)
+      expect(result.length).to eq(2)
+      expect(result.first).to include(
+        element_id: Digest::MD5.hexdigest("chunk1"),
+        text: "chunk1",
+        created_date: metadata[:file_created_date],
+        modified_date: metadata[:file_modified_date],
+        filename: metadata[:file_name],
+        filetype: metadata[:file_type],
+        created_at: kind_of(Time)
+      )
+    end
+
+    it "generates unique element_id for each chunk" do
+      result = processor.send(:format_chunks, chunks, metadata)
+      expect(result.first[:element_id]).not_to eq(result.last[:element_id])
+    end
+
+    it "handles missing metadata gracefully" do
+      result = processor.send(:format_chunks, chunks, {})
+      expect(result.first).to include(
+        element_id: Digest::MD5.hexdigest("chunk1"),
+        text: "chunk1",
+        created_at: kind_of(Time)
+      )
+      expect(result.first[:created_date]).to be_nil
+      expect(result.first[:modified_date]).to be_nil
+      expect(result.first[:filename]).to be_nil
+      expect(result.first[:filetype]).to be_nil
+    end
+  end
+end
diff --git a/server/spec/lib/reverse_etl/processors/text/token_chunker_spec.rb b/server/spec/lib/reverse_etl/processors/text/token_chunker_spec.rb
new file mode 100644
index 000000000..b565375ae
--- /dev/null
+++ b/server/spec/lib/reverse_etl/processors/text/token_chunker_spec.rb
@@ -0,0 +1,176 @@
+# frozen_string_literal: true
+
+require "rails_helper"
+
+RSpec.describe ReverseEtl::Processors::Text::TokenChunker do
+  let(:processor) { described_class.new }
+  let(:content) { "This is a test content for token chunking" }
+  let(:metadata) do
+    {
+      file_name: "test.pdf",
+      file_type: "PDF",
+      file_created_date: Time.current.iso8601,
+      file_modified_date: Time.current.iso8601
+    }
+  end
+  let(:mock_tokeniser) { double("tokeniser") }
+
describe "#process" do + context "with invalid config" do + it "raises TypeError when model is blank" do + expect { processor.process({ model: nil, provider: "open_ai" }, content) } + .to raise_error(TypeError, "Model is required") + end + + it "raises TypeError when model is unsupported" do + expect { processor.process({ model: "unknown-model", provider: "open_ai" }, content) } + .to raise_error(TypeError, /not supported/) + end + + it "raises TypeError when provider is blank" do + expect { processor.process({ model: "text-embedding-ada-002", provider: nil }, content) } + .to raise_error(TypeError, "Provider is required") + end + + it "raises TypeError when provider is unsupported" do + allow(Tiktoken).to receive(:encoding_for_model).and_return(mock_tokeniser) + expect { processor.process({ model: "text-embedding-ada-002", provider: "unknown_provider" }, content) } + .to raise_error(TypeError, /not supported/) + end + end + + context "with open_ai provider" do + let(:model) { "text-embedding-ada-002" } + let(:chunk_config) { { model:, provider: "open_ai", chunk_size: 8191 } } + + before do + allow(Tiktoken).to receive(:encoding_for_model).with(model).and_return(mock_tokeniser) + end + + context "when text is within the token limit" do + before do + allow(mock_tokeniser).to receive(:encode).and_return(Array.new(100, 1)) + allow(mock_tokeniser).to receive(:decode).and_return("decoded text") + end + + it "returns a single chunk as a string" do + result = processor.process(chunk_config, content) + expect(result).to be_an(Array) + expect(result.length).to eq(1) + expect(result.first).to be_a(String) + end + end + + context "when text exceeds the token limit" do + before do + allow(mock_tokeniser).to receive(:encode).and_return(Array.new(9000, 1)) + allow(mock_tokeniser).to receive(:decode).and_return("decoded chunk text") + end + + it "splits text into multiple string chunks" do + result = processor.process(chunk_config, content) + expect(result).to be_an(Array) + expect(result.length).to be > 1 + expect(result.first).to be_a(String) + end + end + end + + context "with hugging_face provider" do + let(:model) { "all-MiniLM-L6-v2" } + let(:chunk_config) { { model:, provider: "hugging_face", chunk_size: 256 } } + let(:mock_encode_result) { double("encode_result", tokens: Array.new(100, "token")) } + + before do + allow(Tokenizers::Tokenizer).to receive(:from_pretrained) + .with("sentence-transformers/#{model}") + .and_return(mock_tokeniser) + allow(mock_tokeniser).to receive(:encode).and_return(mock_encode_result) + allow(mock_tokeniser).to receive(:decode).and_return(%w[decoded chunk]) + end + + context "when text is within the token limit" do + it "returns a single string chunk" do + result = processor.process(chunk_config, content) + expect(result).to be_an(Array) + expect(result.length).to eq(1) + expect(result.first).to be_a(String) + end + end + + context "when text exceeds the token limit" do + let(:mock_encode_result) { double("encode_result", tokens: Array.new(300, "token")) } + + it "splits text into multiple string chunks" do + result = processor.process(chunk_config, content) + expect(result).to be_an(Array) + expect(result.length).to be > 1 + expect(result.first).to be_a(String) + end + end + end + end + + describe "#get_tokens" do + context "with open_ai provider" do + before do + allow(Tiktoken).to receive(:encoding_for_model).with("text-embedding-ada-002").and_return(mock_tokeniser) + allow(mock_tokeniser).to receive(:encode).and_return([1, 2, 3]) + 
processor.instance_variable_set(:@model, "text-embedding-ada-002") + processor.instance_variable_set(:@provider, "open_ai") + processor.instance_variable_set(:@tokeniser, mock_tokeniser) + end + + it "calls encode and returns the token array directly" do + expect(processor.send(:get_tokens, "hello")).to eq([1, 2, 3]) + end + end + + context "with hugging_face provider" do + let(:encode_result) { double("encode_result", tokens: %w[tok1 tok2]) } + + before do + allow(mock_tokeniser).to receive(:encode).and_return(encode_result) + processor.instance_variable_set(:@model, "all-MiniLM-L6-v2") + processor.instance_variable_set(:@provider, "hugging_face") + processor.instance_variable_set(:@tokeniser, mock_tokeniser) + end + + it "calls encode and returns the .tokens array" do + expect(processor.send(:get_tokens, "hello")).to eq(%w[tok1 tok2]) + end + end + end + + describe "#tokens_to_text_chunks" do + context "with open_ai provider" do + before do + allow(mock_tokeniser).to receive(:decode).and_return("decoded text") + processor.instance_variable_set(:@provider, "open_ai") + processor.instance_variable_set(:@tokeniser, mock_tokeniser) + end + + it "decodes each token slice and returns strings" do + tokens = Array.new(20, 1) + result = processor.send(:tokens_to_text_chunks, tokens, 10) + expect(result.length).to eq(2) + expect(result.first).to eq("decoded text") + end + end + + context "with hugging_face provider" do + before do + allow(mock_tokeniser).to receive(:decode).and_return(%w[word1 word2]) + processor.instance_variable_set(:@provider, "hugging_face") + processor.instance_variable_set(:@tokeniser, mock_tokeniser) + end + + it "decodes and joins tokens with a space" do + tokens = Array.new(20, "tok") + result = processor.send(:tokens_to_text_chunks, tokens, 10) + expect(result.length).to eq(2) + expect(result.first).to eq("word1 word2") + end + end + end +end
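
A minimal end-to-end sketch of the pipeline this patch assembles (variable
names are illustrative; the metadata keys mirror the ones format_chunks
handles):

    processor = ReverseEtl::Processors::Text::ChunkProcessor.new

    # Vector sync configuration: routed to TokenChunker, sized to the model's token limit.
    chunks = processor.process(
      { model: "text-embedding-ada-002", provider: "open_ai", chunk_size: 8191 },
      scraped_markdown,                          # assumed: markdown returned by the source read
      { file_name: "page.md", file_type: "md" }
    )
    chunks.first[:element_id]  # => MD5 digest of the chunk text
    chunks.first[:text]        # => a span that fits within the 8191-token limit

    # Legacy configuration (no model/provider): falls back to the class named by
    # ENV["CHUNK_PROCESSOR"], LangchainRb by default, with chunk_size 1000 and
    # chunk_overlap 200.
    processor.process({}, scraped_markdown)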