From f7efbdecf15f814d98272bad10ee0741e193e38b Mon Sep 17 00:00:00 2001
From: michael
Date: Wed, 27 Nov 2024 09:59:50 +0100
Subject: [PATCH] added scripts for snowflake

---
 Snowflake/README.md                        |  10 ++
 Snowflake/create_database.sql              | 176 +++++++++++++++++++++
 Snowflake/refresh_parties.sql              |  15 ++
 Snowflake/refresh_recording_alt_titles.sql |  14 ++
 Snowflake/refresh_recording_ids.sql        |  14 ++
 Snowflake/refresh_recordings.sql           |  14 ++
 Snowflake/refresh_release_ids.sql          |  14 ++
 Snowflake/refresh_releases.sql             |  15 ++
 Snowflake/refresh_unclaimed.sql            |  14 ++
 Snowflake/refresh_work_alt_titles.sql      |  14 ++
 Snowflake/refresh_work_ids.sql             |  14 ++
 Snowflake/refresh_work_recordings.sql      |  14 ++
 Snowflake/refresh_work_shares.sql          |  14 ++
 Snowflake/refresh_works.sql                |  14 ++
 Snowflake/update_snapshot.sql              |  10 ++
 15 files changed, 366 insertions(+)
 create mode 100644 Snowflake/README.md
 create mode 100644 Snowflake/create_database.sql
 create mode 100644 Snowflake/refresh_parties.sql
 create mode 100644 Snowflake/refresh_recording_alt_titles.sql
 create mode 100644 Snowflake/refresh_recording_ids.sql
 create mode 100644 Snowflake/refresh_recordings.sql
 create mode 100644 Snowflake/refresh_release_ids.sql
 create mode 100644 Snowflake/refresh_releases.sql
 create mode 100644 Snowflake/refresh_unclaimed.sql
 create mode 100644 Snowflake/refresh_work_alt_titles.sql
 create mode 100644 Snowflake/refresh_work_ids.sql
 create mode 100644 Snowflake/refresh_work_recordings.sql
 create mode 100644 Snowflake/refresh_work_shares.sql
 create mode 100644 Snowflake/refresh_works.sql
 create mode 100644 Snowflake/update_snapshot.sql

diff --git a/Snowflake/README.md b/Snowflake/README.md
new file mode 100644
index 0000000..b1d7e0f
--- /dev/null
+++ b/Snowflake/README.md
@@ -0,0 +1,10 @@
+# Snowflake
+Load Scripts to Snowflake from Snowflake stage (external or internal).
+
+we use S3 folder connected to snowflake stage (called @IMPORTSTAGE)
+see [Snowflake user guide](https://docs.snowflake.com/en/user-guide/data-load-s3-create-stage)
+
+all .tsv files are stored in gzipped format in S3 folder
+using plain .tsv files should work but you have to change the filenames
+
+_Michael Ettl - Sonoton Music_
\ No newline at end of file
diff --git a/Snowflake/create_database.sql b/Snowflake/create_database.sql
new file mode 100644
index 0000000..65c9aae
--- /dev/null
+++ b/Snowflake/create_database.sql
@@ -0,0 +1,176 @@
+-- Michael Ettl - Sonoton Music
+--
+-- Create BWARM table structure in Snowflake
+-- Attention - this will drop all your data,
+-- if you want to use different snapshots don't run this if tables already exist
+--
+-- select your schema
+USE DB.SCHEMA;
+
+-- create TSV import format settings
+CREATE OR REPLACE FILE FORMAT TSV_FILE_FORMAT TYPE = 'CSV' COMPRESSION = 'AUTO'
+FIELD_DELIMITER = '\t' RECORD_DELIMITER = '\n' SKIP_HEADER = 0
+FIELD_OPTIONALLY_ENCLOSED_BY = 'NONE' TRIM_SPACE = FALSE
+ERROR_ON_COLUMN_COUNT_MISMATCH = TRUE ESCAPE = 'NONE'
+ESCAPE_UNENCLOSED_FIELD = '\134' DATE_FORMAT = 'AUTO'
+TIMESTAMP_FORMAT = 'AUTO' NULL_IF = ('\\N');
+
+CREATE OR REPLACE TABLE MLC_SNAPSHOTS (
+    snapshotid INT NOT NULL AUTOINCREMENT,
+    created_date DATETIME,
+    PRIMARY KEY (snapshotid)
+);
+
+CREATE OR REPLACE TABLE MLC_WORKS (
+    FeedProvidersWorkId VARCHAR(3000),
+    ISWC VARCHAR(11),
+    WorkTitle VARCHAR,
+    OpusNumber VARCHAR(3000),
+    ComposerCatalogNumber VARCHAR(3000),
+    NominalDuration VARCHAR(3000),
+    HasRightsInDispute BOOLEAN,
+    TerritoryOfPublicDomain VARCHAR,
+    IsArrangementOfTraditionalWork BOOLEAN,
+    AlternativeWorkForUsStatutoryReversion VARCHAR(3000),
+    UsStatutoryReversionDate VARCHAR(100),
+    snapshotid INT,
+    PRIMARY KEY (FeedProvidersWorkId)
+);
+
+CREATE OR REPLACE TABLE MLC_ALTERNATIVE_WORK_TITLES (
+    FeedProvidersWorkAlternativeTitleId VARCHAR(3000),
+    FeedProvidersWorkId VARCHAR(3000),
+    AlternativeTitle VARCHAR,
+    LanguageAndScriptCode VARCHAR(3000),
+    TitleType VARCHAR(3000),
+    snapshotid INT,
+    PRIMARY KEY (FeedProvidersWorkAlternativeTitleId)
+);
+
+CREATE OR REPLACE TABLE MLC_WORK_IDENTIFIERS (
+    FeedProvidersWorkProprietaryIdentifierId VARCHAR(3000),
+    FeedProvidersWorkId VARCHAR(3000),
+    Identifier VARCHAR(3000),
+    FeedProvidersAllocatingPartyId VARCHAR(3000),
+    snapshotid INT,
+    PRIMARY KEY (FeedProvidersWorkProprietaryIdentifierId)
+);
+
+CREATE OR REPLACE TABLE MLC_PARTIES (
+    FeedProvidersPartyId VARCHAR(3000),
+    ISNI VARCHAR(16), -- same width as the DisplayArtistISNI columns; 15 made the loader skip rows with a longer ISNI
+    IpiNameNumber INTEGER,
+    CisacSocietyId VARCHAR(3),
+    DPID VARCHAR(100),
+    FullName VARCHAR,
+    NamesBeforeKeyName VARCHAR,
+    KeyName VARCHAR,
+    NamesAfterKeyName VARCHAR,
+    snapshotid INT,
+    PRIMARY KEY (FeedProvidersPartyId)
+);
+
+CREATE OR REPLACE TABLE MLC_WORK_RIGHT_SHARES (
+    FeedProvidersWorkRightShareId VARCHAR(3000),
+    FeedProvidersWorkId VARCHAR(3000),
+    FeedProvidersPartyId VARCHAR(3000),
+    PartyRole VARCHAR(100),
+    RightSharePercentage FLOAT,
+    RightShareType VARCHAR(100),
+    RightsType VARCHAR(100),
+    ValidityStartDate DATE,
+    ValidityEndDate DATE,
+    FeedProvidersParentWorkRightShareId VARCHAR(3000),
+    TerritoryCode VARCHAR(3000),
+    UseType VARCHAR(3000),
+    snapshotid INT,
+    PRIMARY KEY (FeedProvidersWorkRightShareId)
+);
+
+CREATE OR REPLACE TABLE MLC_RECORDINGS (
+    FeedProvidersRecordingId VARCHAR(3000),
+    ISRC VARCHAR(12),
+    RecordingTitle VARCHAR,
+    RecordingSubTitle VARCHAR,
+    DisplayArtistName VARCHAR,
+    DisplayArtistISNI VARCHAR(16),
+    PLine VARCHAR(3000),
+    Duration VARCHAR(100),
+    FeedProvidersReleaseId VARCHAR(3000),
+    StudioProducerName VARCHAR,
+    StudioProducerId VARCHAR(3000),
+    OriginalDataProviderName VARCHAR(3000),
+    OriginalDataProviderDPID VARCHAR(3000),
+    IsDataProvidedAsReceived BOOLEAN,
+    snapshotid INT,
+    PRIMARY KEY (FeedProvidersRecordingId)
+);
+
+CREATE OR REPLACE TABLE MLC_ALTERNATIVE_RECORDING_TITLES (
+    FeedProvidersRecordingAlternativeTitleId VARCHAR(3000),
+    FeedProvidersRecordingId VARCHAR(3000),
+    AlternativeTitle VARCHAR,
+    LanguageAndScriptCode VARCHAR(3000),
+    TitleType VARCHAR(3000),
+    snapshotid INT,
+    PRIMARY KEY (FeedProvidersRecordingAlternativeTitleId)
+);
+
+CREATE OR REPLACE TABLE MLC_RECORDING_IDENTIFIERS (
+    FeedProvidersRecordingProprietaryIdentifierId VARCHAR(3000),
+    FeedProvidersRecordingId VARCHAR(3000),
+    Identifier VARCHAR(3000),
+    FeedProvidersAllocatingPartyId VARCHAR(3000),
+    snapshotid INT,
+    PRIMARY KEY (FeedProvidersRecordingProprietaryIdentifierId)
+);
+
+CREATE OR REPLACE TABLE MLC_RELEASES (
+    FeedProvidersReleaseId VARCHAR(3000),
+    ICPN VARCHAR(15),
+    ReleaseTitle VARCHAR,
+    ReleaseSubTitle VARCHAR,
+    DisplayArtistName VARCHAR,
+    DisplayArtistISNI VARCHAR(16),
+    LabelName VARCHAR(3000),
+    ReleaseDate DATE,
+    OriginalDataProviderName VARCHAR(3000),
+    OriginalDataProviderDPID VARCHAR(3000),
+    IsDataProvidedAsReceived BOOLEAN,
+    snapshotid INT,
+    PRIMARY KEY (FeedProvidersReleaseId)
+);
+
+CREATE OR REPLACE TABLE MLC_RELEASE_IDENTIFIERS (
+    FeedProvidersReleaseProprietaryIdentifierId VARCHAR(3000),
+    FeedProvidersReleaseId VARCHAR(3000),
+    Identifier VARCHAR(3000),
+    FeedProvidersAllocatingPartyId VARCHAR(3000),
+    snapshotid INT,
+    PRIMARY KEY (FeedProvidersReleaseProprietaryIdentifierId)
+);
+
+CREATE OR REPLACE TABLE MLC_WORK_RECORDINGS (
+    FeedProvidersLinkId VARCHAR(3000),
+    FeedProvidersWorkId VARCHAR(3000),
+    FeedProvidersRecordingId VARCHAR(3000),
+    snapshotid INT,
+    PRIMARY KEY (FeedProvidersLinkId)
+);
+
+CREATE OR REPLACE TABLE MLC_UNCLAIMED_WORKS (
+    FeedProvidersRightShareId VARCHAR(3000),
+    FeedProvidersRecordingId VARCHAR(3000),
+    FeedProvidersWorkId VARCHAR(3000),
+    ISRC VARCHAR(12),
+    DspRecordingId VARCHAR(3000),
+    RecordingTitle VARCHAR,
+    RecordingSubTitle VARCHAR,
+    AlternativeRecordingTitle VARCHAR,
+    DisplayArtistName VARCHAR,
+    DisplayArtistISNI VARCHAR(16),
+    Duration VARCHAR(100),
+    UnclaimedPercentage FLOAT,
+    PercentileForPrioritisation FLOAT,
+    snapshotid INT
+);
\ No newline at end of file
diff --git a/Snowflake/refresh_parties.sql b/Snowflake/refresh_parties.sql
new file mode 100644
index 0000000..251594a
--- /dev/null
+++ b/Snowflake/refresh_parties.sql
@@ -0,0 +1,15 @@
+-- Reload MLC_PARTIES from /mlc/parties.tsv.gz; rows are tagged with the newest snapshotid (0 if none exists).
+SET snapshotid=ifnull((SELECT max(snapshotid) from MLC_SNAPSHOTS),0);
+
+TRUNCATE MLC_PARTIES;
+
+COPY INTO MLC_PARTIES
+FROM (
+    SELECT $1, $2, $3, $4, $5, $6, $7, $8, $9, $snapshotid
+    FROM @IMPORTSTAGE
+)
+FILES = ('/mlc/parties.tsv.gz')
+FILE_FORMAT = TSV_FILE_FORMAT
+ON_ERROR = CONTINUE
+;
+
diff --git a/Snowflake/refresh_recording_alt_titles.sql b/Snowflake/refresh_recording_alt_titles.sql
new file mode 100644
index 0000000..988ddc6
--- /dev/null
+++ b/Snowflake/refresh_recording_alt_titles.sql
@@ -0,0 +1,14 @@
+-- Reload MLC_ALTERNATIVE_RECORDING_TITLES from its staged file; rows are tagged with the newest snapshotid (0 if none exists).
+SET snapshotid=ifnull((SELECT max(snapshotid) from MLC_SNAPSHOTS),0);
+
+TRUNCATE MLC_ALTERNATIVE_RECORDING_TITLES;
+
+COPY INTO MLC_ALTERNATIVE_RECORDING_TITLES
+FROM (
+    SELECT $1, $2, $3, $4, $5, $snapshotid
+    FROM @IMPORTSTAGE
+)
+FILES = ('/mlc/recordingalternativetitles.tsv.gz')
+FILE_FORMAT = TSV_FILE_FORMAT
+ON_ERROR = CONTINUE
+;
\ No newline at end of file
diff --git a/Snowflake/refresh_recording_ids.sql b/Snowflake/refresh_recording_ids.sql
new file mode 100644
index 0000000..a345145
--- /dev/null
+++ b/Snowflake/refresh_recording_ids.sql
@@ -0,0 +1,14 @@
+-- Reload MLC_RECORDING_IDENTIFIERS from its staged file; rows are tagged with the newest snapshotid (0 if none exists).
+SET snapshotid=ifnull((SELECT max(snapshotid) from MLC_SNAPSHOTS),0);
+
+TRUNCATE MLC_RECORDING_IDENTIFIERS;
+
+COPY INTO MLC_RECORDING_IDENTIFIERS
+FROM (
+    SELECT $1, $2, $3, $4, $snapshotid
+    FROM @IMPORTSTAGE
+)
+FILES = ('/mlc/recordingidentifiers.tsv.gz')
+FILE_FORMAT = TSV_FILE_FORMAT
+ON_ERROR = CONTINUE
+;
\ No newline at end of file
diff --git a/Snowflake/refresh_recordings.sql b/Snowflake/refresh_recordings.sql
new file mode 100644
index 0000000..f32cbe1
--- /dev/null
+++ b/Snowflake/refresh_recordings.sql
@@ -0,0 +1,14 @@
+-- Reload MLC_RECORDINGS from /mlc/recordings.tsv.gz; rows are tagged with the newest snapshotid (0 if none exists).
+SET snapshotid=ifnull((SELECT max(snapshotid) from MLC_SNAPSHOTS),0);
+
+TRUNCATE MLC_RECORDINGS;
+
+COPY INTO MLC_RECORDINGS
+FROM (
+    SELECT $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $snapshotid
+    FROM @IMPORTSTAGE
+)
+FILES = ('/mlc/recordings.tsv.gz')
+FILE_FORMAT = TSV_FILE_FORMAT
+ON_ERROR = CONTINUE
+;
\ No newline at end of file
diff --git a/Snowflake/refresh_release_ids.sql b/Snowflake/refresh_release_ids.sql
new file mode 100644
index 0000000..a030034
--- /dev/null
+++ b/Snowflake/refresh_release_ids.sql
@@ -0,0 +1,14 @@
+-- Reload MLC_RELEASE_IDENTIFIERS from its staged file; rows are tagged with the newest snapshotid (0 if none exists).
+SET snapshotid=ifnull((SELECT max(snapshotid) from MLC_SNAPSHOTS),0);
+
+TRUNCATE MLC_RELEASE_IDENTIFIERS;
+
+COPY INTO MLC_RELEASE_IDENTIFIERS
+FROM (
+    SELECT $1, $2, $3, $4, $snapshotid
+    FROM @IMPORTSTAGE
+)
+FILES = ('/mlc/releaseidentifiers.tsv.gz')
+FILE_FORMAT = TSV_FILE_FORMAT
+ON_ERROR = CONTINUE
+;
\ No newline at end of file
diff --git a/Snowflake/refresh_releases.sql b/Snowflake/refresh_releases.sql
new file mode 100644
index 0000000..b81a8de
--- /dev/null
+++ b/Snowflake/refresh_releases.sql
@@ -0,0 +1,15 @@
+-- Reload MLC_RELEASES from /mlc/releases.tsv.gz; rows are tagged with the newest snapshotid (0 if none exists).
+SET snapshotid=ifnull((SELECT max(snapshotid) from MLC_SNAPSHOTS),0);
+
+TRUNCATE MLC_RELEASES;
+
+COPY INTO MLC_RELEASES
+FROM (
+    SELECT $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $snapshotid
+    FROM @IMPORTSTAGE
+)
+FILES = ('/mlc/releases.tsv.gz')
+FILE_FORMAT = TSV_FILE_FORMAT
+ON_ERROR = CONTINUE
+;
+
diff --git a/Snowflake/refresh_unclaimed.sql b/Snowflake/refresh_unclaimed.sql
new file mode 100644
index 0000000..e7d5aa4
--- /dev/null
+++ b/Snowflake/refresh_unclaimed.sql
@@ -0,0 +1,14 @@
+-- Reload MLC_UNCLAIMED_WORKS from its staged file; rows are tagged with the newest snapshotid (0 if none exists).
+SET snapshotid=ifnull((SELECT max(snapshotid) from MLC_SNAPSHOTS),0);
+
+TRUNCATE MLC_UNCLAIMED_WORKS;
+
+COPY INTO MLC_UNCLAIMED_WORKS
+FROM (
+    SELECT $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $snapshotid
+    FROM @IMPORTSTAGE
+)
+FILES = ('/mlc/unclaimedworkrightshares.tsv.gz')
+FILE_FORMAT = TSV_FILE_FORMAT
+ON_ERROR = CONTINUE
+;
diff --git a/Snowflake/refresh_work_alt_titles.sql b/Snowflake/refresh_work_alt_titles.sql
new file mode 100644
index 0000000..eda42ca
--- /dev/null
+++ b/Snowflake/refresh_work_alt_titles.sql
@@ -0,0 +1,14 @@
+-- Reload MLC_ALTERNATIVE_WORK_TITLES from its staged file; rows are tagged with the newest snapshotid (0 if none exists).
+SET snapshotid=ifnull((SELECT max(snapshotid) from MLC_SNAPSHOTS),0);
+
+TRUNCATE MLC_ALTERNATIVE_WORK_TITLES;
+
+COPY INTO MLC_ALTERNATIVE_WORK_TITLES
+FROM (
+    SELECT $1, $2, $3, $4, $5, $snapshotid
+    FROM @IMPORTSTAGE
+)
+FILES = ('/mlc/workalternativetitles.tsv.gz')
+FILE_FORMAT = TSV_FILE_FORMAT
+ON_ERROR = CONTINUE
+;
diff --git a/Snowflake/refresh_work_ids.sql b/Snowflake/refresh_work_ids.sql
new file mode 100644
index 0000000..8d169b8
--- /dev/null
+++ b/Snowflake/refresh_work_ids.sql
@@ -0,0 +1,14 @@
+-- Reload MLC_WORK_IDENTIFIERS (4 feed columns + snapshotid). NOTE(review): file name assumed from the sibling *identifiers feeds - verify.
+SET snapshotid=ifnull((SELECT max(snapshotid) from MLC_SNAPSHOTS),0);
+
+TRUNCATE MLC_WORK_IDENTIFIERS;
+
+COPY INTO MLC_WORK_IDENTIFIERS
+FROM (
+    SELECT $1, $2, $3, $4, $snapshotid
+    FROM @IMPORTSTAGE
+)
+FILES = ('/mlc/workidentifiers.tsv.gz')
+FILE_FORMAT = TSV_FILE_FORMAT
+ON_ERROR = CONTINUE
+;
diff --git a/Snowflake/refresh_work_recordings.sql b/Snowflake/refresh_work_recordings.sql
new file mode 100644
index 0000000..7969caa
--- /dev/null
+++ b/Snowflake/refresh_work_recordings.sql
@@ -0,0 +1,14 @@
+-- Reload MLC_WORK_RECORDINGS from /mlc/worksrecordings.tsv.gz; rows are tagged with the newest snapshotid (0 if none exists).
+SET snapshotid=ifnull((SELECT max(snapshotid) from MLC_SNAPSHOTS),0);
+
+TRUNCATE MLC_WORK_RECORDINGS;
+
+COPY INTO MLC_WORK_RECORDINGS
+FROM (
+    SELECT $1, $2, $3, $snapshotid
+    FROM @IMPORTSTAGE
+)
+FILES = ('/mlc/worksrecordings.tsv.gz')
+FILE_FORMAT = TSV_FILE_FORMAT
+ON_ERROR = CONTINUE
+;
\ No newline at end of file
diff --git a/Snowflake/refresh_work_shares.sql b/Snowflake/refresh_work_shares.sql
new file mode 100644
index 0000000..46a4085
--- /dev/null
+++ b/Snowflake/refresh_work_shares.sql
@@ -0,0 +1,14 @@
+-- Reload MLC_WORK_RIGHT_SHARES from /mlc/workrightshares.tsv.gz; rows are tagged with the newest snapshotid (0 if none exists).
+SET snapshotid=ifnull((SELECT max(snapshotid) from MLC_SNAPSHOTS),0);
+
+TRUNCATE MLC_WORK_RIGHT_SHARES;
+
+COPY INTO MLC_WORK_RIGHT_SHARES
+FROM (
+    SELECT $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $snapshotid
+    FROM @IMPORTSTAGE
+)
+FILES = ('/mlc/workrightshares.tsv.gz')
+FILE_FORMAT = TSV_FILE_FORMAT
+ON_ERROR = CONTINUE
+;
diff --git a/Snowflake/refresh_works.sql b/Snowflake/refresh_works.sql
new file mode 100644
index 0000000..83af1b7
--- /dev/null
+++ b/Snowflake/refresh_works.sql
@@ -0,0 +1,14 @@
+-- Reload MLC_WORKS from /mlc/works.tsv.gz; rows are tagged with the newest snapshotid (0 if none exists).
+SET snapshotid=ifnull((SELECT max(snapshotid) from MLC_SNAPSHOTS),0);
+
+TRUNCATE MLC_WORKS;
+
+COPY INTO MLC_WORKS
+FROM (
+    SELECT $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $snapshotid
+    FROM @IMPORTSTAGE
+)
+FILES = ('/mlc/works.tsv.gz')
+FILE_FORMAT = TSV_FILE_FORMAT
+ON_ERROR = CONTINUE
+;
\ No newline at end of file
diff --git a/Snowflake/update_snapshot.sql b/Snowflake/update_snapshot.sql
new file mode 100644
index 0000000..4557a16
--- /dev/null
+++ b/Snowflake/update_snapshot.sql
@@ -0,0 +1,10 @@
+-- Michael Ettl - Sonoton Music
+--
+-- use snowflake to load BWARM data from snowflake stage into snowflake tables
+-- we use S3 folder connected to snowflake stage - see https://docs.snowflake.com/en/user-guide/data-load-s3-create-stage
+-- all .tsv files are stored in gzipped format in S3 folder
+-- if you can use plain .tsv files this should work but don't forget to change the filenames
+--
+
+-- create snapshotid with timestamp
+INSERT INTO MLC_SNAPSHOTS (created_date) VALUES (CURRENT_TIMESTAMP);