diff --git a/ROADMAP.md b/ROADMAP.md index 7cb0745..917a1af 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -190,6 +190,7 @@ In Progress - [x] Add broader fixture-corpus pressure for near-miss all-term ranking and longer-body snippet selection across the default in-memory path and the macOS SearchKit-backed path. - [x] Refine the default in-memory all-term ranker so tighter evidence beats scattered term mentions instead of falling through to document ID tie-breaking. - [x] Add a second checked-in text source for corpus-based tests so fixture coverage is not only Gutenberg-derived. +- [x] Add a Hugging Face-derived audit micro-corpus that combines short stories, markdown reference records, and line-oriented literary text, and exercise it across the default in-memory and macOS SearchKit-backed paths. - [ ] Audit larger app-like corpus result quality now that field-aware ranking, compact all-term evidence, phrase weighting, truncation cues, multi-term snippets, and field-evidence metadata are in place. - [ ] Keep the persistent `FetchKitLibrary` construction and search API surface under review as real callers exercise the current design. - [ ] Explore an opt-in extended snippet surface that can use idle time to precompute short document summaries for larger records, with Apple's [`FoundationModels`](https://developer.apple.com/documentation/foundationmodels) or another local summarization path as the first candidate instead of making foreground full-text search wait on summarization. 
diff --git a/Tests/FetchKitTests/FixtureCorpusQualityTests.swift b/Tests/FetchKitTests/FixtureCorpusQualityTests.swift index 3fb83e5..b855d42 100644 --- a/Tests/FetchKitTests/FixtureCorpusQualityTests.swift +++ b/Tests/FetchKitTests/FixtureCorpusQualityTests.swift @@ -17,6 +17,11 @@ struct FixtureCorpusQualityTests { #expect(TinyStoriesMiniCorpus.source.split == "train") #expect(TinyStoriesMiniCorpus.records.allSatisfy { $0.sourceURI == TinyStoriesMiniCorpus.source.url }) #expect(TinyStoriesMiniCorpus.records.allSatisfy { $0.metadata["fixture.dataset"] == TinyStoriesMiniCorpus.source.datasetID }) + #expect(HuggingFaceAuditCorpus.records.count == 10) + #expect(HuggingFaceAuditCorpus.tinyStoriesRecords.allSatisfy { $0.sourceURI == HuggingFaceAuditCorpus.tinyStoriesSource.url }) + #expect(HuggingFaceAuditCorpus.simpleWikipediaRecords.allSatisfy { $0.sourceURI == HuggingFaceAuditCorpus.simpleWikipediaSource.url }) + #expect(HuggingFaceAuditCorpus.gutenbergPoetryRecords.allSatisfy { $0.sourceURI == HuggingFaceAuditCorpus.gutenbergPoetrySource.url }) + #expect(HuggingFaceAuditCorpus.records.allSatisfy { $0.metadata["fixture.dataset"] != nil }) } @Test("Fixture corpus retrieves a body-driven chapter hit") @@ -146,9 +151,83 @@ struct FixtureCorpusQualityTests { #expect(fuelResults.first?.matchedFields.contains(.body) == true) } + @Test("Hugging Face audit corpus retrieves distinct content families") + func huggingFaceAuditCorpusRetrievesDistinctContentFamilies() async throws { + let library = try await indexedAuditLibrary() + + let calendarResults = try await library.search( + "april leap year flowers", + kind: .allTerms, + fields: [.title, .body], + limit: 4 + ) + let rhetoricResults = try await library.search( + "against person weak argument", + kind: .allTerms, + fields: [.title, .body], + limit: 4 + ) + let storyResults = try await library.search( + "triangle puddle troubled toy", + kind: .allTerms, + fields: [.title, .body], + limit: 4 + ) + let poetryResults = try 
await library.search( + "great lakes northland ojibways", + kind: .allTerms, + fields: [.title, .body], + limit: 4 + ) + + #expect(calendarResults.first?.document.id == "hf-simplewiki-april") + #expect(calendarResults.first?.snippet?.text.localizedCaseInsensitiveContains("leap years") == true) + #expect(rhetoricResults.first?.document.id == "hf-simplewiki-ad-hominem") + #expect(rhetoricResults.first?.snippet?.text.localizedCaseInsensitiveContains("weak") == true) + #expect(rhetoricResults.first?.snippet?.text.localizedCaseInsensitiveContains("argument") == true) + #expect(storyResults.first?.document.id == "hf-tinystories-row-6-triangle") + #expect(storyResults.first?.snippet?.text.localizedCaseInsensitiveContains("puddle") == true) + #expect(poetryResults.first?.document.id == "hf-poetry-hiawatha-northland") + #expect(poetryResults.first?.snippet?.text.localizedCaseInsensitiveContains("great lakes") == true) + } + + @Test("Hugging Face audit corpus exposes useful snippet fields") + func huggingFaceAuditCorpusExposesUsefulSnippetFields() async throws { + let library = try await indexedAuditLibrary() + + let titleResults = try await library.search( + "angel", + kind: .allTerms, + fields: [.title, .body], + limit: 4 + ) + let bodyResults = try await library.search( + "cobweb spider castle", + kind: .allTerms, + fields: [.title, .body], + limit: 4 + ) + + #expect(titleResults.first?.document.id == "hf-simplewiki-angels") + #expect(titleResults.first?.matchedFields.contains(.title) == true) + #expect(bodyResults.first?.document.id == "hf-tinystories-row-4-cobweb") + #expect(bodyResults.first?.matchedFields.contains(.body) == true) + #expect(bodyResults.first?.snippetField == .body) + } + private func indexedFixtureLibrary() async throws -> FetchKitLibrary { let library = FetchKitLibrary() try await library.addDocuments(GutenbergMiniCorpus.records + TinyStoriesMiniCorpus.records) return library } + + private func indexedAuditLibrary() async throws -> FetchKitLibrary { + 
let library = FetchKitLibrary() + try await library.addDocuments( + GutenbergMiniCorpus.records + + TinyStoriesMiniCorpus.records + + HuggingFaceAuditCorpus.records + ) + return library + } } diff --git a/Tests/FetchKitTests/SearchKitFetchIndexTests.swift b/Tests/FetchKitTests/SearchKitFetchIndexTests.swift index 92b5957..ecd2156 100644 --- a/Tests/FetchKitTests/SearchKitFetchIndexTests.swift +++ b/Tests/FetchKitTests/SearchKitFetchIndexTests.swift @@ -7,7 +7,7 @@ import XCTest final class SearchKitFetchIndexTests: XCTestCase { private var fixtureRecords: [FetchDocumentRecord] { - GutenbergMiniCorpus.records + TinyStoriesMiniCorpus.records + GutenbergMiniCorpus.records + TinyStoriesMiniCorpus.records + HuggingFaceAuditCorpus.records } func testSearchKitFetchIndexIndexesAndSearchesText() async throws { @@ -302,6 +302,35 @@ final class SearchKitFetchIndexTests: XCTestCase { XCTAssertEqual(needleResult.snippet?.text.localizedCaseInsensitiveContains("needle"), true) } + func testSearchKitFetchIndexMatchesHuggingFaceAuditCorpusSources() async throws { + let index = try SearchKitFetchIndex( + configuration: .init( + storage: .inMemory, + indexNamePrefix: "SearchKitFetchIndexTests-\(UUID().uuidString)" + ) + ) + + try await index.apply( + FetchIndexingChangeset( + fixtureRecords.map { .upsert($0.indexDocument) } + ) + ) + + let rhetoricResults = try await index.search( + FetchSearchQuery("against person weak argument", kind: .allTerms, fields: [.title, .body], limit: 4) + ) + let storyResults = try await index.search( + FetchSearchQuery("triangle puddle troubled toy", kind: .allTerms, fields: [.title, .body], limit: 4) + ) + let poetryResults = try await index.search( + FetchSearchQuery("great lakes northland ojibways", kind: .allTerms, fields: [.title, .body], limit: 4) + ) + + XCTAssertEqual(rhetoricResults.first?.document.id, "hf-simplewiki-ad-hominem") + XCTAssertEqual(storyResults.first?.document.id, "hf-tinystories-row-6-triangle") + 
XCTAssertEqual(poetryResults.first?.document.id, "hf-poetry-hiawatha-northland") + } + func testFetchKitLibraryBuildsPersistentPair() async throws { let temporaryDirectory = URL(fileURLWithPath: NSTemporaryDirectory(), isDirectory: true) .appendingPathComponent(UUID().uuidString, isDirectory: true) diff --git a/Tests/SwiftlyFetchTestFixtures/HuggingFaceAuditCorpus.swift b/Tests/SwiftlyFetchTestFixtures/HuggingFaceAuditCorpus.swift new file mode 100644 index 0000000..fda6024 --- /dev/null +++ b/Tests/SwiftlyFetchTestFixtures/HuggingFaceAuditCorpus.swift @@ -0,0 +1,206 @@ +import FetchCore + +public enum HuggingFaceAuditCorpus { + public static let tinyStoriesSource = GutenbergMiniCorpus.Source( + datasetID: "roneneldan/TinyStories", + config: "default", + split: "train", + license: "CDLA-Sharing-1.0; synthetic short stories generated by GPT-3.5 and GPT-4", + url: "https://huggingface.co/datasets/roneneldan/TinyStories" + ) + + public static let simpleWikipediaSource = GutenbergMiniCorpus.Source( + datasetID: "juno-labs/simple_wikipedia", + config: "default", + split: "train", + license: "CC-BY-SA-3.0", + url: "https://huggingface.co/datasets/juno-labs/simple_wikipedia" + ) + + public static let gutenbergPoetrySource = GutenbergMiniCorpus.Source( + datasetID: "biglam/gutenberg-poetry-corpus", + config: "default", + split: "train", + license: "CC0-1.0", + url: "https://huggingface.co/datasets/biglam/gutenberg-poetry-corpus" + ) + + public static let records: [FetchDocumentRecord] = tinyStoriesRecords + + simpleWikipediaRecords + + gutenbergPoetryRecords + + public static let tinyStoriesRecords: [FetchDocumentRecord] = [ + FetchDocumentRecord( + id: "hf-tinystories-row-4-cobweb", + title: "Lily Cleans a Castle Cobweb", + body: """ + Lily liked to pretend she was a popular princess in a big castle with her cat and dog. While playing, she found a cobweb that blocked her game. + + Lily was scared of the spider living there, so she asked her friends to help. 
Together, the cat, the dog, and Lily cleaned the cobweb, and the spider found a new home outside. + """, + kind: .article, + language: "en", + sourceURI: tinyStoriesSource.url, + metadata: sourceMetadata(tinyStoriesSource, row: "4", topic: "cobweb") + ), + FetchDocumentRecord( + id: "hf-tinystories-row-5-kayak", + title: "The Brown Kayak Rolls in the Lake", + body: """ + In a big lake, a brown kayak liked to roll in the water all day. A little boy named Tim came to play, and they rolled together in the warm lake. + + When Tim went home, the kayak was sad but kept rolling in the water while waiting for another fun day. + """, + kind: .article, + language: "en", + sourceURI: tinyStoriesSource.url, + metadata: sourceMetadata(tinyStoriesSource, row: "5", topic: "kayak") + ), + FetchDocumentRecord( + id: "hf-tinystories-row-6-triangle", + title: "Lily Finds Her Triangle", + body: """ + Lily was troubled because she lost her favorite toy, a triangle. She searched her house, then went to the park and looked in a puddle. + + Lily reached into the water and found the triangle at the bottom. After that, puddles reminded her of finding the toy she loved. + """, + kind: .article, + language: "en", + sourceURI: tinyStoriesSource.url, + metadata: sourceMetadata(tinyStoriesSource, row: "6", topic: "toy") + ), + FetchDocumentRecord( + id: "hf-tinystories-row-7-race", + title: "Tim and Sarah Run a Race", + body: """ + Tim loved to run and play outside. He saw a race in the park and asked Sarah to start the race with him. + + They waited for the word go, then ran with the wind in their hair. Tim reached the finish line first, Sarah came second, and both celebrated with friends. 
+ """, + kind: .article, + language: "en", + sourceURI: tinyStoriesSource.url, + metadata: sourceMetadata(tinyStoriesSource, row: "7", topic: "race") + ), + ] + + public static let simpleWikipediaRecords: [FetchDocumentRecord] = [ + FetchDocumentRecord( + id: "hf-simplewiki-april", + title: "April", + body: """ + **April** is the fourth month of the year in the Julian and Gregorian calendars. It comes between March and May, has 30 days, and is a spring month in the Northern Hemisphere. + + ## The Month + + April begins on the same day of the week as July every year and on the same day as January in leap years. It ends on the same day of the week as December. + + ## April in poetry + + Poets use April to mean the end of winter, as in the saying that April showers bring May flowers. + """, + contentType: .markdown, + kind: .reference, + language: "en", + sourceURI: simpleWikipediaSource.url, + metadata: sourceMetadata(simpleWikipediaSource, row: "0", topic: "calendar") + ), + FetchDocumentRecord( + id: "hf-simplewiki-ad-hominem", + title: "Ad hominem", + body: """ + **Ad hominem** is a Latin term for a type of argument. It is often used in rhetoric, the science of speaking well and convincing other people. + + Translated to English, ad hominem means against the person. It happens when someone attacks the person they are arguing against instead of answering what the person is saying. + + It is usually considered a weak and poor argument, especially in courts and diplomacy. + """, + contentType: .markdown, + kind: .reference, + language: "en", + sourceURI: simpleWikipediaSource.url, + metadata: sourceMetadata(simpleWikipediaSource, row: "18", topic: "rhetoric") + ), + FetchDocumentRecord( + id: "hf-simplewiki-angels", + title: "Angel", + body: """ + **Angels** are described in many religious traditions as spirits or messengers. Some stories describe angels delivering messages, protecting people, or appearing in visions. 
+ + ## Types of Angels + + Cherubs are described as winged creatures. Seraphs are associated with singing and praising God. Archangels such as Gabriel and Michael are described as important messengers. + + ## In art + + Angels are often shown with wings and a halo. The wings represent speed, and the halo represents holiness. + """, + contentType: .markdown, + kind: .reference, + language: "en", + sourceURI: simpleWikipediaSource.url, + metadata: sourceMetadata(simpleWikipediaSource, row: "17", topic: "religion") + ), + ] + + public static let gutenbergPoetryRecords: [FetchDocumentRecord] = [ + FetchDocumentRecord( + id: "hf-poetry-hiawatha-source", + title: "The Song of Hiawatha: Source Traditions", + body: """ + The Song of Hiawatha is based on legends and stories of many North American Indian tribes, especially the Ojibway Indians of northern Michigan, Wisconsin, and Minnesota. + + The material was collected by Henry Rowe Schoolcraft and later used by Longfellow as a basis for the poem. + """, + kind: .article, + language: "en", + sourceURI: gutenbergPoetrySource.url, + metadata: sourceMetadata(gutenbergPoetrySource, row: "0-18", topic: "hiawatha") + ), + FetchDocumentRecord( + id: "hf-poetry-hiawatha-northland", + title: "The Song of Hiawatha: Northland Images", + body: """ + From the forests and the prairies, + From the great lakes of the Northland, + From the land of the Ojibways, + From the land of the Dacotahs, + From the mountains, moors, and fen-lands, + Where the heron feeds among the reeds and rushes. 
+ """, + kind: .article, + language: "en", + sourceURI: gutenbergPoetrySource.url, + metadata: sourceMetadata(gutenbergPoetrySource, row: "40-46", topic: "northland") + ), + FetchDocumentRecord( + id: "hf-poetry-hiawatha-firefly", + title: "The Song of Hiawatha: Firefly", + body: """ + Wah-wah-taysee, little fire-fly, + Little, flitting, white-fire insect, + Little, dancing, white-fire creature, + Light me with your little candle, + Ere upon my bed I lay me. + """, + kind: .article, + language: "en", + sourceURI: gutenbergPoetrySource.url, + metadata: sourceMetadata(gutenbergPoetrySource, row: "24-29", topic: "firefly") + ), + ] + + private static func sourceMetadata( + _ source: GutenbergMiniCorpus.Source, + row: String, + topic: String + ) -> [String: String] { + [ + "fixture.dataset": source.datasetID, + "fixture.config": source.config, + "fixture.split": source.split, + "fixture.row": row, + "fixture.topic": topic, + ] + } +} diff --git a/docs/maintainers/fixture-corpus.md b/docs/maintainers/fixture-corpus.md index cd9062a..1d76780 100644 --- a/docs/maintainers/fixture-corpus.md +++ b/docs/maintainers/fixture-corpus.md @@ -42,6 +42,14 @@ The second fixture pass added a compact-evidence ranking expectation for the def The third fixture pass moved source-derived records into a shared test fixture target and added TinyStories micro-records. Corpus-based tests now have at least two attributed text sources available: one public-domain Gutenberg-derived source and one synthetic story source. 
+The fourth fixture pass added a small Hugging Face-derived audit corpus that combines: + +- [`roneneldan/TinyStories`](https://huggingface.co/datasets/roneneldan/TinyStories) rows for compact app-note-like narrative records +- [`juno-labs/simple_wikipedia`](https://huggingface.co/datasets/juno-labs/simple_wikipedia) rows for markdown reference records with headings and title/body overlap +- [`biglam/gutenberg-poetry-corpus`](https://huggingface.co/datasets/biglam/gutenberg-poetry-corpus) rows for dense line-oriented literary text + +This is still a checked-in micro-corpus rather than a live dataset lane. Its job is to make the current in-memory and SearchKit-backed result-quality tests cover more document shapes ahead of any decision on whether a larger local benchmark or opt-in dataset download is worth maintaining. + ## Hugging Face Dependency Boundary Do not add a Hugging Face Swift dependency for the default fixture lane yet. The current checked-in fixture keeps CI deterministic and avoids adding a network, token, cache, or package-resolution requirement to ordinary tests. 
@@ -60,6 +68,8 @@ curl -s 'https://datasets-server.huggingface.co/rows?dataset=zkeown/gutenberg-co curl -s 'https://datasets-server.huggingface.co/rows?dataset=zkeown/gutenberg-corpus&config=chapters&split=train&offset=1&length=3' curl -s 'https://datasets-server.huggingface.co/splits?dataset=roneneldan/TinyStories' curl -s 'https://datasets-server.huggingface.co/first-rows?dataset=roneneldan/TinyStories&config=default&split=train' +curl -s 'https://datasets-server.huggingface.co/first-rows?dataset=juno-labs/simple_wikipedia&config=default&split=train' +curl -s 'https://datasets-server.huggingface.co/first-rows?dataset=biglam/gutenberg-poetry-corpus&config=default&split=train' ``` Hugging Face documents dataset parquet discovery through the Dataset Viewer service in the [`huggingface_hub` CLI guide](https://huggingface.co/docs/huggingface_hub/guides/cli) and the Dataset Viewer [Parquet conversion guide](https://huggingface.co/docs/dataset-viewer/parquet).