-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathinit.sql
More file actions
60 lines (47 loc) · 2.39 KB
/
init.sql
File metadata and controls
60 lines (47 loc) · 2.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
CREATE TABLE platforms (
platform_id TEXT PRIMARY KEY, -- e.g. 'bayt', 'wuzzuf', 'linkedin'
name TEXT NOT NULL,
base_url TEXT NOT NULL,
crawler_type TEXT NOT NULL -- 'scrapy' | 'playwright' | 'hybrid'
);
-- -------------------------------------------------------------
-- 2. CRAWL RUNS
-- One row per spider execution. Audit trail.
-- -------------------------------------------------------------
CREATE TABLE crawl_runs (
run_id SERIAL PRIMARY KEY,
platform_id TEXT NOT NULL REFERENCES platforms(platform_id),
started_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
finished_at TIMESTAMPTZ,
status TEXT NOT NULL DEFAULT 'running', -- 'running' | 'done' | 'failed'
jobs_found INT DEFAULT 0
);
CREATE TABLE raw_jobs (
raw_id SERIAL PRIMARY KEY,
platform_id TEXT NOT NULL REFERENCES platforms(platform_id),
run_id INT NOT NULL REFERENCES crawl_runs(run_id),
url TEXT NOT NULL UNIQUE, -- deduplication guard
scraped_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
payload JSONB NOT NULL -- raw HTML + metadata, untouched
);
CREATE TABLE extracted_jobs (
extracted_id SERIAL PRIMARY KEY,
raw_id INT NOT NULL UNIQUE REFERENCES raw_jobs(raw_id),
title TEXT,
company TEXT,
location TEXT,
job_type TEXT, -- 'full-time' | 'part-time' | 'internship' | 'remote'
language TEXT, -- 'en' | 'ar' | 'bilingual'
description TEXT,
skills TEXT[], -- e.g. ARRAY['Python', 'FastAPI', 'Docker']
posted_at DATE,
extra_fields JSONB DEFAULT '{}', -- platform-specific fields
extracted_at TIMESTAMPTZ DEFAULT NOW(),
extractor_model TEXT -- e.g. 'llama-3.3-70b-versatile'
);
INSERT INTO platforms (platform_id, name, base_url, crawler_type) VALUES
('bayt', 'Bayt.com', 'https://www.bayt.com', 'playwright'),
('wuzzuf', 'Wuzzuf', 'https://wuzzuf.net', 'playwright'),
('linkedin', 'LinkedIn', 'https://www.linkedin.com', 'playwright'),
('indeed', 'Indeed Egypt', 'https://eg.indeed.com', 'scrapy'),
('forasna', 'Forasna', 'https://www.forasna.com', 'hybrid');