From 321a54c20cd3fd6a947d59ac312ec682e4c24256 Mon Sep 17 00:00:00 2001 From: Hedgehog-Jr <48445528+Hedgehog-Jr@users.noreply.github.com> Date: Tue, 1 Oct 2024 14:15:04 +0000 Subject: [PATCH] Pending changes exported from your codespace --- dict_vectorizer.bin | Bin 0 -> 13007 bytes docker-compose.yml | 10 ++ mlflow.dockerfile | 12 ++ mlops/metadata.yaml | 1 + mlops/module_3/.gitignore | 14 ++ mlops/module_3/__init__.py | 0 mlops/module_3/charts/__init__.py | 0 mlops/module_3/custom/__init__.py | 0 mlops/module_3/data_exporters/__init__.py | 0 .../data_exporters/export_titanic_clean.py | 16 +++ mlops/module_3/data_exporters/save.py | 35 +++++ mlops/module_3/data_loaders/__init__.py | 0 mlops/module_3/data_loaders/load.py | 28 ++++ mlops/module_3/data_loaders/load_titanic.py | 27 ++++ mlops/module_3/data_s/data_loaders/load.py | 29 ++++ mlops/module_3/dbt/profiles.yml | 9 ++ mlops/module_3/extensions/__init__.py | 0 mlops/module_3/interactions/__init__.py | 0 mlops/module_3/io_config.yaml | 134 ++++++++++++++++++ mlops/module_3/metadata.yaml | 55 +++++++ mlops/module_3/pipelines/__init__.py | 0 .../pipelines/example_pipeline/__init__.py | 0 .../pipelines/example_pipeline/metadata.yaml | 30 ++++ mlops/module_3/pipelines/module_3/__init__.py | 0 .../pipelines/module_3/interactions.yaml | 2 + .../module_3/pipelines/module_3/metadata.yaml | 99 +++++++++++++ mlops/module_3/requirements.txt | 0 mlops/module_3/s/transformers/transform.py | 45 ++++++ mlops/module_3/scratchpads/__init__.py | 0 mlops/module_3/transformers/__init__.py | 0 .../transformers/fill_in_missing_values.py | 45 ++++++ mlops/module_3/transformers/train.py | 49 +++++++ mlops/module_3/transformers/transform.py | 45 ++++++ mlops/module_3/utils/__init__.py | 0 mlops/settings.yaml | 2 + .../pipelines/module_3/__init__.py | 0 .../pipelines/module_3/metadata.yaml | 6 + 37 files changed, 693 insertions(+) create mode 100644 dict_vectorizer.bin create mode 100644 mlflow.dockerfile create mode 100755 mlops/module_3/.gitignore create mode 100755 mlops/module_3/__init__.py create mode 100755 mlops/module_3/charts/__init__.py create mode 100755 mlops/module_3/custom/__init__.py create mode 100755 mlops/module_3/data_exporters/__init__.py create mode 100755 mlops/module_3/data_exporters/export_titanic_clean.py create mode 100644 mlops/module_3/data_exporters/save.py create mode 100755 mlops/module_3/data_loaders/__init__.py create mode 100644 mlops/module_3/data_loaders/load.py create mode 100755 mlops/module_3/data_loaders/load_titanic.py create mode 100644 mlops/module_3/data_s/data_loaders/load.py create mode 100755 mlops/module_3/dbt/profiles.yml create mode 100755 mlops/module_3/extensions/__init__.py create mode 100755 mlops/module_3/interactions/__init__.py create mode 100755 mlops/module_3/io_config.yaml create mode 100755 mlops/module_3/metadata.yaml create mode 100755 mlops/module_3/pipelines/__init__.py create mode 100755 mlops/module_3/pipelines/example_pipeline/__init__.py create mode 100755 mlops/module_3/pipelines/example_pipeline/metadata.yaml create mode 100755 mlops/module_3/pipelines/module_3/__init__.py create mode 100644 mlops/module_3/pipelines/module_3/interactions.yaml create mode 100755 mlops/module_3/pipelines/module_3/metadata.yaml create mode 100755 mlops/module_3/requirements.txt create mode 100644 mlops/module_3/s/transformers/transform.py create mode 100755 mlops/module_3/scratchpads/__init__.py create mode 100755 mlops/module_3/transformers/__init__.py create mode 100755 mlops/module_3/transformers/fill_in_missing_values.py create mode 100644 mlops/module_3/transformers/train.py create mode 100644 mlops/module_3/transformers/transform.py create mode 100755 mlops/module_3/utils/__init__.py create mode 100755 mlops/unit_3_observability/pipelines/module_3/__init__.py create mode 100755 mlops/unit_3_observability/pipelines/module_3/metadata.yaml diff --git a/dict_vectorizer.bin b/dict_vectorizer.bin new file mode 100644 index 0000000000000000000000000000000000000000..738219ef611d33a2959cc2ede19fb4e270c2cc1e GIT binary patch literal 13007 zcmZu&Wtdh~)MXDPqO{!&qT|e&RxvSfgJ6J(;{b!3O$<&TpeUHAWAb8mcXtPNcXxMp zeRrMr`=0N)XTAPe`|P{Uxo6#d&wZEsVdi&Ryd-pq|8)*+n>wwzsdYyAnN$qoH zHWxa(&*+#wb55bN=j3TEP3^VSg++zVUTw`Yn_8Q)MGBoVtkBt`EgQBq7Z!GQYint3 z&#=$`-F!yV^yaq4!uV{(KEp?kY?;`Ejf@yRtg_JAH=n2|boNU!B~sbElodEt%c(j} z)pM$Ww#7HBERo_pR94aImDQZ8;S_Jak~d$;o3G@}^R@y%)4+N{iHWQyu%5tr0_zE^ zC$OHtdUy|2tfz|gRI#3_5^J%ZD%MlQda77Y73-;DJ$%Dz)>F-Ts##Ap>!~iWJnN}u zJ=LtIn)Ot(9$vkM_0+JQ8rD<8dTLltO^G|Qo*LFu!+L614=+{AdTLouE$gXeJ=xbZ z{g~CVp4t+dU_G_0rF@V>RC@c>!~lXRo0Vbl3%w5*3-ax8dy&Q>uF#;4Xmeu^)#@ahD=ZX7d_iT z{uiBpYC;962^FL!RFIlbL25z;sR<&UD>#1Zt><$9EgTU^9f8;}nl9fd7RHIlPr`vnG9&`WA>oh+9EOBLB5)KE4vD}) z$iGgs430s58JvP7Ln3en5)O&r699Xh%pwsm#|ei-&?GqH{3ICTBts%#ixUosfGJKm zBm$N=;gARz;(R5D0y~^v25vaXkO+9;ghL|WgcA;lfDcYMyh9R^CGwNtfs+i0fCElA zBm(|7;gATp--JUV;C&MgM2s1m^tbYfOhmsVQzDfKOIew+G?i^KO=Y`GQ`t7t6rOE< z6MVzyU=tGWAv)NEl|A3&hS9+$EZ%%{unCJdA02F-#oLMwHX*UZ=wK68_PQjSj|MDZ zWiL$1Vm;BpCKFjtbg+3A?;$$agv5HHgH2c^*5dj^2b)Y}J<-7?EY=epY@Wq8j1D#- zv7YE)6Bg@<4mM$NeWHU+SbSKbgH2d`Sa|j5VDs~$gH2NWqDBXsu=sIB2b-|CKGDG@ zEUr&nDxIR2D_P6r$Vt*@1aeZQc zD`9bcVt*@P@k5RMt%Sw(iT$lSi%GG+m5{hTvA>nDxIVGJm9V%zvA>nD_&LY^R>I=? zWSQja8T(uLd9lBhq_{q@zm>4KKC!=*u(&?4zm>4~(Klqf&;Q70yUPEX@~=;^<&|5z z*z!tJB(T`>N?25#Vt*@PQFV&_t%OC@iJhK2DfYLL6uoW5{#L@Gx2@RUN?5EX_O}ui z>xuoXgvEOJnU<(e3DxWlLhLQ&Z^-VT*xyQ6^tKiITM3Kawqk!P&*Epw?x5J;O6Ku{ zi2bdEMW-jQJ1F+Il8IcO*xyQ6Tpzw6yMtnXE58)GgJORxVU<{(-#2y##r{^Z7S|{C zxAH7ro!vpPzm?46`mj4F_P3IWCGN=Y8@q#Ie=Avw>%&X2J1F+I@=LKhDE7A!7S|{K z-+i8yy<5pAHY-Jv$#$OqE@uZNPl^4l+`?!BC!DMbNruP&qmMwmso3^f=er}`PV8bW_#23y6x&iO z{2|0Qi><25`i+P$5_?CVwOw=@yFgFzm5^In?0fyds}Zj!cB!7`Ya;G1c8;Fl7a%@G z>~_7vUet%}NBzj|(yOhXie0VeSbr_^Wy&(nKVmQG5*w+*{BO+pM(lpQ!EZr)so37C zei`D^#Ts-&`(3~ApD_Md>}kE%yJxZf>dPV3Lu@nM$S*{Es@PcF-hRfLscF7iK-`%3IL{oKyelWcR{*tbHiQfyNl zYO8Bs+fHk33*E#H)Cs-|u5BYWPIvG}5Z@;Dpx)%aBK}N#YhVy-(p~KmUF^NHvHtDd z0KeGAy1M1aZMIJF>yf)a>_R=&KGc`&1%1di(KYQd zJ>7fZ;ruCfnjYh?fcL2QNx=DH$Lk^9C5!b3J4xsJLZs)2?XKI|je3!vjd7>=(xCk1 z1F=%KGP_RCx8Yi2i}Xl4Sr=HFPO>Al!$#<)_KeGM{ngY7$g-zs%&Ytj9!Lf5iW^l{tEF z2O-y8{88ZT;>%{J|Jo!S>s_(L&tk1Q(e}_CJ%F-~Sh=p@-y{A&d=cktnX zUn)cW!{=b=VjVi!4@Nvq>|wpdj?i{nQG3}VdYifK}UPNPeyKm*i(9s_eB2pY~#Rx#gD+XcJWn#%Zi^1JW>2J;OpXN0go3y z0eGnReZcF*Yk?bx_r*G0%KQwBj}`wA_>%aEz{A8p2EHP8k6vp#YJ-o&c$nDP+UdU| z{zCjV;N{|{0*??M1>8bxg6`%oBYs4DJaA|6=YbE3ZwA~*Y(E|ChIlXWYk}v9zXV(& zHbA@jk%&9Qe*}Ic{w(l*@mGP5iH+8+d;;R##18=ODSkfiWbuE2--*8md|s?d*Y%qa zUo19MSM}2oFBCgm+k8Ia1H~PkJ?-suwc!T&pz>(tH1M9?p27V&G z0&pqu9=OkM;TA?!0F=41AB@;1H3Q$dSe|a^Yt-a zLA(t(Nql4A>f-%@W#Uc1UB%|=R9jD%x5xEPTSo_3vySsdTpJ@^16*HhEgj%5B7Ru> zIN-tJ*8tBI{}%Y3_-?@A;_Cnhi7y5oEq)L1TCwG|r?(=WD84IjTd~*m3A;%z_6IP& zQS3kc-k#9A{1}X9i{A{qB>OBw(_i9uV|SH!d z$J>vHsR?|Lqn9j7%^$q_A`ESKe)g!zM=KLUbjh^f0Ab*0`V|s@#K-?;Jt6t{Q5RVt1 z3fx!xZQwKFI{@p&2VsrwWqvQl*NIOA?k+wHc!1a^`l{Wmm)K|ey6vyK_yF*`mU%m_ zH;eBK+*15G-~-}U1J4n^6L^)_9Ce$bd;27e$BN$$yh3~v;F{uh0Iw8Z3phY*1zpPb zN4$&pWZ)j+rvZ-ge&RO(FBI>Nt^O*$8peIZzW}}| zJ`XrUd;qYk_+!93#E$~b5`PMKkNB6sx5U>4E+@VUa2e_7?AQ3ef2NZCHA-t+_9r3P QUz${w*OXTlIwq9=59F(oT>t<8 literal 0 HcmV?d00001 diff --git a/docker-compose.yml b/docker-compose.yml index 9decd7127..a827bc14d 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -38,6 +38,16 @@ services: restart: always networks: - app-network + mlflow: + build: + context: . + dockerfile: mlflow.dockerfile + ports: + - "5000:5000" + volumes: + - "${PWD}/mlflow_data:/home/mlflow_data/" + networks: + - app-network networks: app-network: driver: bridge diff --git a/mlflow.dockerfile b/mlflow.dockerfile new file mode 100644 index 000000000..70da0d850 --- /dev/null +++ b/mlflow.dockerfile @@ -0,0 +1,12 @@ +FROM python:3.10-slim + +RUN pip install mlflow==2.12.1 + +EXPOSE 5000 + +CMD [ \ + "mlflow", "server", \ + "--backend-store-uri", "sqlite:///home/mlflow_data/mlflow.db", \ + "--host", "0.0.0.0", \ + "--port", "5000" \ +] \ No newline at end of file diff --git a/mlops/metadata.yaml b/mlops/metadata.yaml index 4eb3b7742..30a37385e 100644 --- a/mlops/metadata.yaml +++ b/mlops/metadata.yaml @@ -15,3 +15,4 @@ features: operation_history: true polars: true help_improve_mage: true +project_uuid: 86daa7e2889a43988632768a1e669081 diff --git a/mlops/module_3/.gitignore b/mlops/module_3/.gitignore new file mode 100755 index 000000000..8b3e82f61 --- /dev/null +++ b/mlops/module_3/.gitignore @@ -0,0 +1,14 @@ +.DS_Store +.file_versions +.gitkeep +.log +.logs/ +.mage_temp_profiles +.preferences.yaml +.variables/ +__pycache__/ +docker-compose.override.yml +logs/ +mage-ai.db +mage_data/ +secrets/ diff --git a/mlops/module_3/__init__.py b/mlops/module_3/__init__.py new file mode 100755 index 000000000..e69de29bb diff --git a/mlops/module_3/charts/__init__.py b/mlops/module_3/charts/__init__.py new file mode 100755 index 000000000..e69de29bb diff --git a/mlops/module_3/custom/__init__.py b/mlops/module_3/custom/__init__.py new file mode 100755 index 000000000..e69de29bb diff --git a/mlops/module_3/data_exporters/__init__.py b/mlops/module_3/data_exporters/__init__.py new file mode 100755 index 000000000..e69de29bb diff --git a/mlops/module_3/data_exporters/export_titanic_clean.py b/mlops/module_3/data_exporters/export_titanic_clean.py new file mode 100755 index 000000000..cb7aa63aa --- /dev/null +++ b/mlops/module_3/data_exporters/export_titanic_clean.py @@ -0,0 +1,16 @@ +from mage_ai.io.file import FileIO +from pandas import DataFrame + +if 'data_exporter' not in globals(): + from mage_ai.data_preparation.decorators import data_exporter + + +@data_exporter +def export_data_to_file(df: DataFrame, **kwargs) -> None: + """ + Template for exporting data to filesystem. + + Docs: https://docs.mage.ai/design/data-loading#example-loading-data-from-a-file + """ + filepath = 'titanic_clean.csv' + FileIO().export(df, filepath) diff --git a/mlops/module_3/data_exporters/save.py b/mlops/module_3/data_exporters/save.py new file mode 100644 index 000000000..b13bae340 --- /dev/null +++ b/mlops/module_3/data_exporters/save.py @@ -0,0 +1,35 @@ +import mlflow +import pickle + + +mlflow.set_tracking_uri("http://mlflow:5000") +mlflow.set_experiment("nec-taxi-experiment") + +if 'data_exporter' not in globals(): + from mage_ai.data_preparation.decorators import data_exporter + + +@data_exporter +def export_data(data, *args, **kwargs): + """ + Exports data to some source. + + Args: + data: The output from the upstream parent block + args: The output from any additional upstream blocks (if applicable) + + Output (optional): + Optionally return any object and it'll be logged and + displayed when inspecting the block run. + """ + # Specify your data exporting logic here + dv, lr = data + + with mlflow.start_run(): + with open('dict_vectorizer.bin', 'wb') as f_out: + pickle.dump(dv, f_out) + mlflow.log_artifact('dict_vectorizer.bin') + + mlflow.sklearn.log_model(lr, 'model') + + print('DONE') diff --git a/mlops/module_3/data_loaders/__init__.py b/mlops/module_3/data_loaders/__init__.py new file mode 100755 index 000000000..e69de29bb diff --git a/mlops/module_3/data_loaders/load.py b/mlops/module_3/data_loaders/load.py new file mode 100644 index 000000000..8c6c9e54f --- /dev/null +++ b/mlops/module_3/data_loaders/load.py @@ -0,0 +1,28 @@ +import pandas as pd + + +if 'data_loader' not in globals(): + from mage_ai.data_preparation.decorators import data_loader +if 'test' not in globals(): + from mage_ai.data_preparation.decorators import test + + +@data_loader +def load_data(*args, **kwargs): + """ + Template code for loading data from any source. + + Returns: + Anything (e.g. data frame, dictionary, array, int, str, etc.) + """ + # Specify your data loading logic here + df = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet') + return df + + +@test +def test_output(output, *args) -> None: + """ + Template code for testing the output of the block. + """ + assert output is not None, 'The output is undefined' \ No newline at end of file diff --git a/mlops/module_3/data_loaders/load_titanic.py b/mlops/module_3/data_loaders/load_titanic.py new file mode 100755 index 000000000..c664e0f2d --- /dev/null +++ b/mlops/module_3/data_loaders/load_titanic.py @@ -0,0 +1,27 @@ +import io +import pandas as pd +import requests +from pandas import DataFrame + +if 'data_loader' not in globals(): + from mage_ai.data_preparation.decorators import data_loader +if 'test' not in globals(): + from mage_ai.data_preparation.decorators import test + + +@data_loader +def load_data_from_api(**kwargs) -> DataFrame: + """ + Template for loading data from API + """ + url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv?raw=True' + + return pd.read_csv(url) + + +@test +def test_output(df) -> None: + """ + Template code for testing the output of the block. + """ + assert df is not None, 'The output is undefined' diff --git a/mlops/module_3/data_s/data_loaders/load.py b/mlops/module_3/data_s/data_loaders/load.py new file mode 100644 index 000000000..ca314435e --- /dev/null +++ b/mlops/module_3/data_s/data_loaders/load.py @@ -0,0 +1,29 @@ +import pandas as pd + + +if 'data_loader' not in globals(): + from mage_ai.data_preparation.decorators import data_loader +if 'test' not in globals(): + from mage_ai.data_preparation.decorators import test + + +@data_loader +def load_data(*args, **kwargs): + """ + Template code for loading data from any source. + + Returns: + Anything (e.g. data frame, dictionary, array, int, str, etc.) + """ + # Specify your data loading logic here + df = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet') + + return df + + +@test +def test_output(output, *args) -> None: + """ + Template code for testing the output of the block. + """ + assert output is not None, 'The output is undefined' \ No newline at end of file diff --git a/mlops/module_3/dbt/profiles.yml b/mlops/module_3/dbt/profiles.yml new file mode 100755 index 000000000..90599f894 --- /dev/null +++ b/mlops/module_3/dbt/profiles.yml @@ -0,0 +1,9 @@ +# https://docs.getdbt.com/docs/core/connect-data-platform/profiles.yml + +base: + outputs: + + dev: + type: duckdb + + target: dev diff --git a/mlops/module_3/extensions/__init__.py b/mlops/module_3/extensions/__init__.py new file mode 100755 index 000000000..e69de29bb diff --git a/mlops/module_3/interactions/__init__.py b/mlops/module_3/interactions/__init__.py new file mode 100755 index 000000000..e69de29bb diff --git a/mlops/module_3/io_config.yaml b/mlops/module_3/io_config.yaml new file mode 100755 index 000000000..80b4d9cef --- /dev/null +++ b/mlops/module_3/io_config.yaml @@ -0,0 +1,134 @@ +version: 0.1.1 +default: + # Default profile created for data IO access. + # Add your credentials for the source you use, and delete the rest. + # AWS + AWS_ACCESS_KEY_ID: "{{ env_var('AWS_ACCESS_KEY_ID') }}" + AWS_SECRET_ACCESS_KEY: "{{ env_var('AWS_SECRET_ACCESS_KEY') }}" + AWS_SESSION_TOKEN: session_token (Used to generate Redshift credentials) + AWS_REGION: region + # Algolia + ALGOLIA_APP_ID: app_id + ALGOLIA_API_KEY: api_key + ALGOLIA_INDEX_NAME: index_name + # Azure + AZURE_CLIENT_ID: "{{ env_var('AZURE_CLIENT_ID') }}" + AZURE_CLIENT_SECRET: "{{ env_var('AZURE_CLIENT_SECRET') }}" + AZURE_STORAGE_ACCOUNT_NAME: "{{ env_var('AZURE_STORAGE_ACCOUNT_NAME') }}" + AZURE_TENANT_ID: "{{ env_var('AZURE_TENANT_ID') }}" + # Chroma + CHROMA_COLLECTION: collection_name + CHROMA_PATH: path + # Clickhouse + CLICKHOUSE_DATABASE: default + CLICKHOUSE_HOST: host.docker.internal + CLICKHOUSE_INTERFACE: http + CLICKHOUSE_PASSWORD: null + CLICKHOUSE_PORT: 8123 + CLICKHOUSE_USERNAME: null + # Druid + DRUID_HOST: hostname + DRUID_PASSWORD: password + DRUID_PATH: /druid/v2/sql/ + DRUID_PORT: 8082 + DRUID_SCHEME: http + DRUID_USER: user + # DuckDB + DUCKDB_DATABASE: database + DUCKDB_SCHEMA: main + # Google + GOOGLE_SERVICE_ACC_KEY: + type: service_account + project_id: project-id + private_key_id: key-id + private_key: "-----BEGIN PRIVATE KEY-----\nyour_private_key\n-----END_PRIVATE_KEY" + client_email: your_service_account_email + auth_uri: "https://accounts.google.com/o/oauth2/auth" + token_uri: "https://accounts.google.com/o/oauth2/token" + auth_provider_x509_cert_url: "https://www.googleapis.com/oauth2/v1/certs" + client_x509_cert_url: "https://www.googleapis.com/robot/v1/metadata/x509/your_service_account_email" + GOOGLE_SERVICE_ACC_KEY_FILEPATH: "/path/to/your/service/account/key.json" + GOOGLE_LOCATION: US # Optional + # MongoDB + # Specify either the connection string or the (host, password, user, port) to connect to MongoDB. + MONGODB_CONNECTION_STRING: "mongodb://{username}:{password}@{host}:{port}/" + MONGODB_HOST: host + MONGODB_PORT: 27017 + MONGODB_USER: user + MONGODB_PASSWORD: password + MONGODB_DATABASE: database + MONGODB_COLLECTION: collection + # MSSQL + MSSQL_DATABASE: database + MSSQL_SCHEMA: schema + MSSQL_DRIVER: "ODBC Driver 18 for SQL Server" + MSSQL_HOST: host + MSSQL_PASSWORD: password + MSSQL_PORT: 1433 + MSSQL_USER: SA + # MySQL + MYSQL_DATABASE: database + MYSQL_HOST: host + MYSQL_PASSWORD: password + MYSQL_PORT: 3306 + MYSQL_USER: root + # Pinot + PINOT_HOST: hostname + PINOT_PASSWORD: password + PINOT_PATH: /query/sql + PINOT_PORT: 8000 + PINOT_SCHEME: http + PINOT_USER: user + # PostgresSQL + POSTGRES_CONNECT_TIMEOUT: 10 + POSTGRES_DBNAME: postgres + POSTGRES_SCHEMA: public # Optional + POSTGRES_USER: username + POSTGRES_PASSWORD: password + POSTGRES_HOST: hostname + POSTGRES_PORT: 5432 + # Qdrant + QDRANT_COLLECTION: collection + QDRANT_PATH: path + # Redshift + REDSHIFT_SCHEMA: public # Optional + REDSHIFT_DBNAME: redshift_db_name + REDSHIFT_HOST: redshift_cluster_id.identifier.region.redshift.amazonaws.com + REDSHIFT_PORT: 5439 + REDSHIFT_TEMP_CRED_USER: temp_username + REDSHIFT_TEMP_CRED_PASSWORD: temp_password + REDSHIFT_DBUSER: redshift_db_user + REDSHIFT_CLUSTER_ID: redshift_cluster_id + REDSHIFT_IAM_PROFILE: default + # Snowflake + SNOWFLAKE_USER: username + SNOWFLAKE_PASSWORD: password + SNOWFLAKE_ACCOUNT: account_id.region + SNOWFLAKE_DEFAULT_WH: null # Optional default warehouse + SNOWFLAKE_DEFAULT_DB: null # Optional default database + SNOWFLAKE_DEFAULT_SCHEMA: null # Optional default schema + SNOWFLAKE_PRIVATE_KEY_PASSPHRASE: null # Optional private key passphrase + SNOWFLAKE_PRIVATE_KEY_PATH: null # Optional private key path + SNOWFLAKE_ROLE: null # Optional role name + SNOWFLAKE_TIMEOUT: null # Optional timeout in seconds + # Trino + trino: + catalog: postgresql # Change this to the catalog of your choice + host: 127.0.0.1 + http_headers: + X-Something: 'mage=power' + http_scheme: http + password: mage1337 # Optional + port: 8080 + schema: core_data + session_properties: # Optional + acc01.optimize_locality_enabled: false + optimize_hash_generation: true + source: trino-cli # Optional + user: admin + verify: /path/to/your/ca.crt # Optional + # Weaviate + WEAVIATE_ENDPOINT: https://some-endpoint.weaviate.network + WEAVIATE_INSTANCE_API_KEY: YOUR-WEAVIATE-API-KEY + WEAVIATE_INFERENCE_API_KEY: YOUR-OPENAI-API-KEY + WEAVIATE_COLLECTION: collectionn_name diff --git a/mlops/module_3/metadata.yaml b/mlops/module_3/metadata.yaml new file mode 100755 index 000000000..f208a035a --- /dev/null +++ b/mlops/module_3/metadata.yaml @@ -0,0 +1,55 @@ +project_type: standalone + +variables_dir: ~/.mage_data +# remote_variables_dir: s3://bucket/path_prefix + +variables_retention_period: '90d' + +emr_config: + # You can customize the EMR cluster instance size with the two parameters + master_instance_type: 'r5.4xlarge' + slave_instance_type: 'r5.4xlarge' + + # Configure security groups for EMR cluster instances. + # The default managed security groups are ElasticMapReduce-master and ElasticMapReduce-slave + # master_security_group: 'sg-xxxxxxxxxxxx' + # slave_security_group: 'sg-yyyyyyyyyyyy' + + # If you want to ssh tunnel into EMR cluster, ec2_key_name must be configured. + # You can create a key pair in page https://console.aws.amazon.com/ec2#KeyPairs and download the key file. + # ec2_key_name: '[ec2_key_pair_name]' + +spark_config: + # Application name + app_name: 'my spark app' + # Master URL to connect to + # e.g., spark_master: 'spark://host:port', or spark_master: 'yarn' + spark_master: 'local' + # Executor environment variables + # e.g., executor_env: {'PYTHONPATH': '/home/path'} + executor_env: {} + # Jar files to be uploaded to the cluster and added to the classpath + # e.g., spark_jars: ['/home/path/example1.jar'] + spark_jars: [] + # Path where Spark is installed on worker nodes + # e.g. spark_home: '/usr/lib/spark' + spark_home: + # List of key-value pairs to be set in SparkConf + # e.g., others: {'spark.executor.memory': '4g', 'spark.executor.cores': '2'} + others: {} + # Whether to create custom SparkSession via code and set in kwargs['context'] + use_custom_session: false + # The variable name to set in kwargs['context'], + # e.g. kwargs['context']['spark'] = spark_session + custom_session_var_name: 'spark' + +help_improve_mage: true +notification_config: + alert_on: + - trigger_failure + - trigger_passed_sla + slack_config: + webhook_url: "{{ env_var('MAGE_SLACK_WEBHOOK_URL') }}" + teams_config: + webhook_url: "{{ env_var('MAGE_TEAMS_WEBHOOK_URL') }}" +project_uuid: module_3 diff --git a/mlops/module_3/pipelines/__init__.py b/mlops/module_3/pipelines/__init__.py new file mode 100755 index 000000000..e69de29bb diff --git a/mlops/module_3/pipelines/example_pipeline/__init__.py b/mlops/module_3/pipelines/example_pipeline/__init__.py new file mode 100755 index 000000000..e69de29bb diff --git a/mlops/module_3/pipelines/example_pipeline/metadata.yaml b/mlops/module_3/pipelines/example_pipeline/metadata.yaml new file mode 100755 index 000000000..c04cfbcf8 --- /dev/null +++ b/mlops/module_3/pipelines/example_pipeline/metadata.yaml @@ -0,0 +1,30 @@ +blocks: +- all_upstream_blocks_executed: true + downstream_blocks: + - fill_in_missing_values + name: load_titanic + status: not_executed + type: data_loader + upstream_blocks: [] + uuid: load_titanic +- all_upstream_blocks_executed: true + downstream_blocks: + - export_titanic_clean + name: fill_in_missing_values + status: not_executed + type: transformer + upstream_blocks: + - load_titanic + uuid: fill_in_missing_values +- all_upstream_blocks_executed: true + downstream_blocks: [] + name: export_titanic_clean + status: not_executed + type: data_exporter + upstream_blocks: + - fill_in_missing_values + uuid: export_titanic_clean +name: example_pipeline +type: python +uuid: example_pipeline +widgets: [] diff --git a/mlops/module_3/pipelines/module_3/__init__.py b/mlops/module_3/pipelines/module_3/__init__.py new file mode 100755 index 000000000..e69de29bb diff --git a/mlops/module_3/pipelines/module_3/interactions.yaml b/mlops/module_3/pipelines/module_3/interactions.yaml new file mode 100644 index 000000000..a1d40f831 --- /dev/null +++ b/mlops/module_3/pipelines/module_3/interactions.yaml @@ -0,0 +1,2 @@ +blocks: {} +layout: [] diff --git a/mlops/module_3/pipelines/module_3/metadata.yaml b/mlops/module_3/pipelines/module_3/metadata.yaml new file mode 100755 index 000000000..dd955ba30 --- /dev/null +++ b/mlops/module_3/pipelines/module_3/metadata.yaml @@ -0,0 +1,99 @@ +blocks: +- all_upstream_blocks_executed: true + color: null + configuration: + file_source: + path: module_3/data_loaders/load.py + downstream_blocks: + - transform + executor_config: null + executor_type: local_python + has_callback: false + language: python + name: load + retry_config: null + status: executed + timeout: null + type: data_loader + upstream_blocks: [] + uuid: load +- all_upstream_blocks_executed: true + color: null + configuration: + file_source: + path: module_3/transformers/transform.py + downstream_blocks: + - train + executor_config: null + executor_type: local_python + has_callback: false + language: python + name: transform + retry_config: null + status: executed + timeout: null + type: transformer + upstream_blocks: + - load + uuid: transform +- all_upstream_blocks_executed: true + color: null + configuration: + file_source: + path: module_3/transformers/train.py + downstream_blocks: + - save + executor_config: null + executor_type: local_python + has_callback: false + language: python + name: train + retry_config: null + status: executed + timeout: null + type: transformer + upstream_blocks: + - transform + uuid: train +- all_upstream_blocks_executed: true + color: null + configuration: + file_source: + path: module_3/data_exporters/save.py + downstream_blocks: [] + executor_config: null + executor_type: local_python + has_callback: false + language: python + name: save + retry_config: null + status: executed + timeout: null + type: data_exporter + upstream_blocks: + - train + uuid: save +cache_block_output_in_memory: false +callbacks: [] +concurrency_config: {} +conditionals: [] +created_at: '2024-10-01 13:17:54.330994+00:00' +data_integration: null +description: null +executor_config: {} +executor_count: 1 +executor_type: null +extensions: {} +name: module_3 +notification_config: {} +remote_variables_dir: null +retry_config: {} +run_pipeline_in_one_process: false +settings: + triggers: null +spark_config: {} +tags: [] +type: python +uuid: module_3 +variables_dir: /home/src/mage_data/module_3 +widgets: [] diff --git a/mlops/module_3/requirements.txt b/mlops/module_3/requirements.txt new file mode 100755 index 000000000..e69de29bb diff --git a/mlops/module_3/s/transformers/transform.py b/mlops/module_3/s/transformers/transform.py new file mode 100644 index 000000000..42f147036 --- /dev/null +++ b/mlops/module_3/s/transformers/transform.py @@ -0,0 +1,45 @@ +import pandas as pd + + +if 'transformer' not in globals(): + from mage_ai.data_preparation.decorators import transformer +if 'test' not in globals(): + from mage_ai.data_preparation.decorators import test + + +@transformer +def transform(df, *args, **kwargs): + """ + Template code for a transformer block. + + Add more parameters to this function if this block has multiple parent blocks. + There should be one parameter for each output variable from each parent block. + + Args: + data: The output from the upstream parent block + args: The output from any additional upstream blocks (if applicable) + + Returns: + Anything (e.g. data frame, dictionary, array, int, str, etc.) + """ + # Specify your transformation logic here + df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime) + df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime) + + df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime + df.duration = df.duration.apply(lambda td: td.total_seconds() / 60) + + df = df[(df.duration >= 1) & (df.duration <= 60)] + + categorical = ['PULocationID', 'DOLocationID'] + df[categorical] = df[categorical].astype(str) + + return df + + +@test +def test_output(output, *args) -> None: + """ + Template code for testing the output of the block. + """ + assert output is not None, 'The output is undefined' \ No newline at end of file diff --git a/mlops/module_3/scratchpads/__init__.py b/mlops/module_3/scratchpads/__init__.py new file mode 100755 index 000000000..e69de29bb diff --git a/mlops/module_3/transformers/__init__.py b/mlops/module_3/transformers/__init__.py new file mode 100755 index 000000000..e69de29bb diff --git a/mlops/module_3/transformers/fill_in_missing_values.py b/mlops/module_3/transformers/fill_in_missing_values.py new file mode 100755 index 000000000..b9761c3e2 --- /dev/null +++ b/mlops/module_3/transformers/fill_in_missing_values.py @@ -0,0 +1,45 @@ +from pandas import DataFrame +import math + +if 'transformer' not in globals(): + from mage_ai.data_preparation.decorators import transformer +if 'test' not in globals(): + from mage_ai.data_preparation.decorators import test + +def select_number_columns(df: DataFrame) -> DataFrame: + return df[['Age', 'Fare', 'Parch', 'Pclass', 'SibSp', 'Survived']] + + +def fill_missing_values_with_median(df: DataFrame) -> DataFrame: + for col in df.columns: + values = sorted(df[col].dropna().tolist()) + median_value = values[math.floor(len(values) / 2)] + df[[col]] = df[[col]].fillna(median_value) + return df + + +@transformer +def transform_df(df: DataFrame, *args, **kwargs) -> DataFrame: + """ + Template code for a transformer block. + + Add more parameters to this function if this block has multiple parent blocks. + There should be one parameter for each output variable from each parent block. + + Args: + df (DataFrame): Data frame from parent block. + + Returns: + DataFrame: Transformed data frame + """ + # Specify your transformation logic here + + return fill_missing_values_with_median(select_number_columns(df)) + + +@test +def test_output(df) -> None: + """ + Template code for testing the output of the block. + """ + assert df is not None, 'The output is undefined' diff --git a/mlops/module_3/transformers/train.py b/mlops/module_3/transformers/train.py new file mode 100644 index 000000000..0b720d35a --- /dev/null +++ b/mlops/module_3/transformers/train.py @@ -0,0 +1,49 @@ +from sklearn.feature_extraction import DictVectorizer +from sklearn.linear_model import LinearRegression + + +if 'transformer' not in globals(): + from mage_ai.data_preparation.decorators import transformer +if 'test' not in globals(): + from mage_ai.data_preparation.decorators import test + + +@transformer +def transform(df, *args, **kwargs): + """ + Template code for a transformer block. + + Add more parameters to this function if this block has multiple parent blocks. + There should be one parameter for each output variable from each parent block. + + Args: + df: The output from the upstream parent block + args: The output from any additional upstream blocks (if applicable) + + Returns: + Anything (e.g. data frame, dictionary, array, int, str, etc.) + """ + # Specify your transformation logic here + categorical = ['PULocationID', 'DOLocationID'] + train_dicts = df[categorical].to_dict(orient='records') + + dv = DictVectorizer() + X_train = dv.fit_transform(train_dicts) + + target = 'duration' + y_train = df[target].values + + lr = LinearRegression() + lr.fit(X_train, y_train) + + print(lr.intercept_) + + return dv, lr + + +@test +def test_output(output, *args) -> None: + """ + Template code for testing the output of the block. + """ + assert output is not None, 'The output is undefined' \ No newline at end of file diff --git a/mlops/module_3/transformers/transform.py b/mlops/module_3/transformers/transform.py new file mode 100644 index 000000000..70bc8957f --- /dev/null +++ b/mlops/module_3/transformers/transform.py @@ -0,0 +1,45 @@ +import pandas as pd + + +if 'transformer' not in globals(): + from mage_ai.data_preparation.decorators import transformer +if 'test' not in globals(): + from mage_ai.data_preparation.decorators import test + + +@transformer +def transform(df, *args, **kwargs): + """ + Template code for a transformer block. + + Add more parameters to this function if this block has multiple parent blocks. + There should be one parameter for each output variable from each parent block. + + Args: + df: The output from the upstream parent block + args: The output from any additional upstream blocks (if applicable) + + Returns: + Anything (e.g. data frame, dictionary, array, int, str, etc.) + """ + # Specify your transformation logic here + df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime) + df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime) + + df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime + df.duration = df.duration.dt.total_seconds() / 60 + + df = df[(df.duration >= 1) & (df.duration <= 60)] + + categorical = ['PULocationID', 'DOLocationID'] + df[categorical] = df[categorical].astype(str) + + return df + + +@test +def test_output(output, *args) -> None: + """ + Template code for testing the output of the block. + """ + assert output is not None, 'The output is undefined' \ No newline at end of file diff --git a/mlops/module_3/utils/__init__.py b/mlops/module_3/utils/__init__.py new file mode 100755 index 000000000..e69de29bb diff --git a/mlops/settings.yaml b/mlops/settings.yaml index 48b62b147..235012d96 100644 --- a/mlops/settings.yaml +++ b/mlops/settings.yaml @@ -1,4 +1,6 @@ projects: + module_3: + path: module_3 unit_0_setup: {} unit_1_data_preparation: {} unit_2_training: {} diff --git a/mlops/unit_3_observability/pipelines/module_3/__init__.py b/mlops/unit_3_observability/pipelines/module_3/__init__.py new file mode 100755 index 000000000..e69de29bb diff --git a/mlops/unit_3_observability/pipelines/module_3/metadata.yaml b/mlops/unit_3_observability/pipelines/module_3/metadata.yaml new file mode 100755 index 000000000..52a722f00 --- /dev/null +++ b/mlops/unit_3_observability/pipelines/module_3/metadata.yaml @@ -0,0 +1,6 @@ +created_at: '2024-10-01 13:12:07.987453+00:00' +description: module_3 +name: module_3 +tags: [] +type: python +uuid: module_3