From 62509183b96cda45d395b14728b1f67a893af34a Mon Sep 17 00:00:00 2001
From: Osareniho Gabriel Oni <65869596+osareniho-oni@users.noreply.github.com>
Date: Mon, 10 Jun 2024 15:23:21 +0000
Subject: [PATCH] Pending changes exported from your codespace

---
 .gitconfig                                         |   0
 docker-compose.yml                                 |  10 ++
 finalized_model.lib                                | Bin 0 -> 4500 bytes
 mlflow.dockerfile                                  |  12 ++
 mlflow_artifacts/dv_artifact.pkl                   | Bin 0 -> 13007 bytes
 mlops/homework_03/.gitignore                       |  14 ++
 mlops/homework_03/__init__.py                      |   0
 mlops/homework_03/charts/__init__.py               |   0
 mlops/homework_03/custom/__init__.py               |   0
 mlops/homework_03/data_exporters/__init__.py       |   0
 mlops/homework_03/data_exporters/build.py          |  67 +++++++++
 .../data_exporters/export_titanic_clean.py         |  16 +++
 mlops/homework_03/data_loaders/__init__.py         |   0
 mlops/homework_03/data_loaders/ingest.py           |  28 ++++
 .../homework_03/data_loaders/load_titanic.py       |  27 ++++
 mlops/homework_03/dbt/profiles.yml                 |   9 ++
 mlops/homework_03/extensions/__init__.py           |   0
 mlops/homework_03/interactions/__init__.py         |   0
 mlops/homework_03/io_config.yaml                   | 134 ++++++++++++++++++
 mlops/homework_03/metadata.yaml                    |  55 +++++++
 mlops/homework_03/pipelines/__init__.py            |   0
 .../pipelines/data__preparation/__init__.py        |   0
 .../data__preparation/interactions.yaml            |   2 +
 .../pipelines/data__preparation/metadata.yaml      |  80 +++++++++++
 .../pipelines/example_pipeline/__init__.py         |   0
 .../pipelines/example_pipeline/metadata.yaml       |  30 ++++
 .../pipelines/sklearn_training/__init__.py         |   0
 .../pipelines/sklearn_training/metadata.yaml       |   6 +
 mlops/homework_03/requirements.txt                 |   0
 mlops/homework_03/scratchpads/__init__.py          |   0
 mlops/homework_03/transformers/__init__.py         |   0
 .../transformers/fill_in_missing_values.py         |  45 ++++++
 mlops/homework_03/transformers/prepare.py          |  38 +++++
 mlops/homework_03/utils/__init__.py                |   0
 mlops/metadata.yaml                                |   1 +
 mlops/requirements.txt                             |   1 +
 mlops/settings.yaml                                |   2 +
 37 files changed, 577 insertions(+)
 create mode 100644 .gitconfig
 create mode 100644 finalized_model.lib
 create mode 100644 mlflow.dockerfile
 create mode 100644 mlflow_artifacts/dv_artifact.pkl
 create mode 100755 mlops/homework_03/.gitignore
 create mode 100755 mlops/homework_03/__init__.py
 create mode 100755 mlops/homework_03/charts/__init__.py
 create mode 100755 mlops/homework_03/custom/__init__.py
 create mode 100755 mlops/homework_03/data_exporters/__init__.py
 create mode 100644 mlops/homework_03/data_exporters/build.py
 create mode 100755 mlops/homework_03/data_exporters/export_titanic_clean.py
 create mode 100755 mlops/homework_03/data_loaders/__init__.py
 create mode 100644 mlops/homework_03/data_loaders/ingest.py
 create mode 100755 mlops/homework_03/data_loaders/load_titanic.py
 create mode 100755 mlops/homework_03/dbt/profiles.yml
 create mode 100755 mlops/homework_03/extensions/__init__.py
 create mode 100755 mlops/homework_03/interactions/__init__.py
 create mode 100755 mlops/homework_03/io_config.yaml
 create mode 100755 mlops/homework_03/metadata.yaml
 create mode 100755 mlops/homework_03/pipelines/__init__.py
 create mode 100755 mlops/homework_03/pipelines/data__preparation/__init__.py
 create mode 100644 mlops/homework_03/pipelines/data__preparation/interactions.yaml
 create mode 100755 mlops/homework_03/pipelines/data__preparation/metadata.yaml
 create mode 100755 mlops/homework_03/pipelines/example_pipeline/__init__.py
 create mode 100755 mlops/homework_03/pipelines/example_pipeline/metadata.yaml
 create mode 100755 mlops/homework_03/pipelines/sklearn_training/__init__.py
 create mode 100755 mlops/homework_03/pipelines/sklearn_training/metadata.yaml
 create mode 100755 mlops/homework_03/requirements.txt
 create mode 100755 mlops/homework_03/scratchpads/__init__.py
 create mode 100755 mlops/homework_03/transformers/__init__.py
 create mode 100755 mlops/homework_03/transformers/fill_in_missing_values.py
 create mode 100644 mlops/homework_03/transformers/prepare.py
 create mode 100755 mlops/homework_03/utils/__init__.py

diff --git a/.gitconfig b/.gitconfig
new file mode 100644
index 000000000..e69de29bb
diff --git a/docker-compose.yml b/docker-compose.yml
index 9decd7127..ed4caff8a 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -38,6 +38,16 @@ services:
     restart: always
     networks:
       - app-network
+  mlflow:
+    build:
+      context: .
+      dockerfile: mlflow.dockerfile
+    ports:
+      - "5000:5000"
+    volumes:
+      - "${PWD}/mlflow:/home/mlflow/"
+    networks:
+      - app-network
 networks:
   app-network:
     driver: bridge
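With the mlflow service joined to app-network, the tracking server is reachable as http://mlflow:5000 from sibling containers and http://localhost:5000 from the host. A quick liveness check, as a minimal sketch (assumes the requests package and the port mapping above):

    import requests

    # The MLflow tracking server exposes a /health endpoint that returns "OK".
    resp = requests.get("http://localhost:5000/health", timeout=5)
    print(resp.status_code, resp.text)  # expect 200 and "OK"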
diff --git a/finalized_model.lib b/finalized_model.lib
new file mode 100644
index 0000000000000000000000000000000000000000..1173672fc32258e8c18eaeb49a296bb554175233
GIT binary patch
literal 4500
[4500 bytes of base85-encoded binary data omitted]
literal 0
HcmV?d00001
diff --git a/mlflow.dockerfile b/mlflow.dockerfile
new file mode 100644
index 000000000..627691e95
--- /dev/null
+++ b/mlflow.dockerfile
@@ -0,0 +1,12 @@
+FROM python:3.10-slim
+
+RUN pip install mlflow==2.12.1
+
+EXPOSE 5000
+
+CMD [ \
+    "mlflow", "server", \
+    "--backend-store-uri", "sqlite:///home/mlflow/mlflow.db", \
+    "--host", "0.0.0.0", \
+    "--port", "5000" \
+]
\ No newline at end of file
diff --git a/mlflow_artifacts/dv_artifact.pkl b/mlflow_artifacts/dv_artifact.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..2dacf43da6fbdd7fa2d24de14f48ad9af00ad1c1
GIT binary patch
literal 13007
[13007 bytes of base85-encoded binary data omitted]
literal 0
HcmV?d00001
diff --git a/mlops/homework_03/.gitignore b/mlops/homework_03/.gitignore
new file mode 100755
index 000000000..8b3e82f61
--- /dev/null
+++ b/mlops/homework_03/.gitignore
@@ -0,0 +1,14 @@
+.DS_Store
+.file_versions
+.gitkeep
+.log
+.logs/
+.mage_temp_profiles
+.preferences.yaml
+.variables/
+__pycache__/
+docker-compose.override.yml
+logs/
+mage-ai.db
+mage_data/
+secrets/
diff --git a/mlops/homework_03/__init__.py b/mlops/homework_03/__init__.py
new file mode 100755
index 000000000..e69de29bb
diff --git a/mlops/homework_03/charts/__init__.py b/mlops/homework_03/charts/__init__.py
new file mode 100755
index 000000000..e69de29bb
diff --git a/mlops/homework_03/custom/__init__.py b/mlops/homework_03/custom/__init__.py
new file mode 100755
index 000000000..e69de29bb
diff --git a/mlops/homework_03/data_exporters/__init__.py b/mlops/homework_03/data_exporters/__init__.py
new file mode 100755
index 000000000..e69de29bb
diff --git a/mlops/homework_03/data_exporters/build.py b/mlops/homework_03/data_exporters/build.py
new file mode 100644
index 000000000..c5537bf7d
--- /dev/null
+++ b/mlops/homework_03/data_exporters/build.py
@@ -0,0 +1,67 @@
+import os
+import mlflow
+import mlflow.sklearn
+from sklearn.feature_extraction import DictVectorizer
+from sklearn.linear_model import LinearRegression
+import joblib
+
+# Set the experiment name
+mlflow.set_experiment('mage_lr_experiment')
+
+# Set the tracking URI (left commented out, so runs log to a local ./mlruns)
+# mlflow.set_tracking_uri('http://localhost:5000')
+
+if 'data_exporter' not in globals():
+    from mage_ai.data_preparation.decorators import data_exporter
+
+
+@data_exporter
+def export_data(data, *args, **kwargs):
+    """
+    Train a LinearRegression model on the prepared trip data and log it,
+    together with the fitted DictVectorizer, to MLflow.
+
+    Args:
+        data: The output from the upstream parent block
+        args: The output from any additional upstream blocks (if applicable)
+
+    Returns:
+        model: The fitted LinearRegression model.
+        dv: The fitted DictVectorizer.
+    """
+    df_train = data
+
+    # One-hot encode the pickup/dropoff location IDs
+    dv = DictVectorizer()
+    train_dicts = df_train[['PULocationID', 'DOLocationID']].to_dict(orient='records')
+
+    # Feature matrix
+    X_train = dv.fit_transform(train_dicts)
+
+    # Target vector
+    target = 'duration'
+    y_train = df_train[target].values
+
+    # Build the model
+    model = LinearRegression()
+    model.fit(X_train, y_train)
+
+    # Specify the artifact directory
+    artifact_directory = 'mlflow_artifacts'
+
+    # Create the directory if it doesn't exist
+    os.makedirs(artifact_directory, exist_ok=True)
+
+    # Save the fitted DictVectorizer so it can be logged as a run artifact
+    artifact_path = os.path.join(artifact_directory, "dv_artifact.pkl")
+
+    with open(artifact_path, 'wb') as f:
+        joblib.dump(dv, f)
+
+    # Log the model, its intercept and the vectorizer with MLflow
+    with mlflow.start_run():
+        mlflow.sklearn.log_model(model, "linear_regression_model")
+        mlflow.log_param("intercept", model.intercept_)
+        mlflow.log_artifact(artifact_path)
+
+    return model, dv
\ No newline at end of file
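Since the model goes to MLflow while the vectorizer is joblib-dumped and attached as a run artifact, inference needs both pieces. A minimal loading sketch; the run ID is a hypothetical placeholder to copy from the MLflow UI:

    import joblib
    import mlflow.sklearn

    RUN_ID = "<run_id_from_mlflow_ui>"  # hypothetical placeholder

    # The model logged via mlflow.sklearn.log_model(...)
    model = mlflow.sklearn.load_model(f"runs:/{RUN_ID}/linear_regression_model")

    # The DictVectorizer saved with joblib above
    dv = joblib.load("mlflow_artifacts/dv_artifact.pkl")

    # Score one ride; the keys match the training dicts
    X = dv.transform([{"PULocationID": "43", "DOLocationID": "151"}])
    print(model.predict(X))  # predicted duration in minutes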
diff --git a/mlops/homework_03/data_exporters/export_titanic_clean.py b/mlops/homework_03/data_exporters/export_titanic_clean.py
new file mode 100755
index 000000000..cb7aa63aa
--- /dev/null
+++ b/mlops/homework_03/data_exporters/export_titanic_clean.py
@@ -0,0 +1,16 @@
+from mage_ai.io.file import FileIO
+from pandas import DataFrame
+
+if 'data_exporter' not in globals():
+    from mage_ai.data_preparation.decorators import data_exporter
+
+
+@data_exporter
+def export_data_to_file(df: DataFrame, **kwargs) -> None:
+    """
+    Export the cleaned Titanic data to a CSV on the local filesystem.
+
+    Docs: https://docs.mage.ai/design/data-loading#example-loading-data-from-a-file
+    """
+    filepath = 'titanic_clean.csv'
+    FileIO().export(df, filepath)
diff --git a/mlops/homework_03/data_loaders/__init__.py b/mlops/homework_03/data_loaders/__init__.py
new file mode 100755
index 000000000..e69de29bb
diff --git a/mlops/homework_03/data_loaders/ingest.py b/mlops/homework_03/data_loaders/ingest.py
new file mode 100644
index 000000000..5787b568a
--- /dev/null
+++ b/mlops/homework_03/data_loaders/ingest.py
@@ -0,0 +1,28 @@
+import requests
+from io import BytesIO
+
+import pandas as pd
+
+
+if 'data_loader' not in globals():
+    from mage_ai.data_preparation.decorators import data_loader
+
+
+@data_loader
+def load_data(*args, **kwargs):
+    """
+    Download the March 2023 Yellow Taxi trip data and return it as a DataFrame.
+    """
+    response = requests.get(
+        "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet"
+    )
+
+    if response.status_code != 200:
+        raise Exception(response.text)
+
+    df = pd.read_parquet(BytesIO(response.content))
+
+    return df
\ No newline at end of file
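One possible simplification: recent pandas can read the parquet file straight from the URL, dropping the requests/BytesIO plumbing, assuming fsspec (or an HTTP-capable pyarrow) is available in the image:

    import pandas as pd

    df = pd.read_parquet(
        "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet"
    )

The explicit requests.get does keep the status-code handling visible, so either form is defensible.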
+ """ + # Specify your data loading logic here + response = requests.get("https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet") + + if response.status_code != 200: + raise Exception(response.text) + + df = pd.read_parquet(BytesIO(response.content)) + + return df \ No newline at end of file diff --git a/mlops/homework_03/data_loaders/load_titanic.py b/mlops/homework_03/data_loaders/load_titanic.py new file mode 100755 index 000000000..c664e0f2d --- /dev/null +++ b/mlops/homework_03/data_loaders/load_titanic.py @@ -0,0 +1,27 @@ +import io +import pandas as pd +import requests +from pandas import DataFrame + +if 'data_loader' not in globals(): + from mage_ai.data_preparation.decorators import data_loader +if 'test' not in globals(): + from mage_ai.data_preparation.decorators import test + + +@data_loader +def load_data_from_api(**kwargs) -> DataFrame: + """ + Template for loading data from API + """ + url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv?raw=True' + + return pd.read_csv(url) + + +@test +def test_output(df) -> None: + """ + Template code for testing the output of the block. + """ + assert df is not None, 'The output is undefined' diff --git a/mlops/homework_03/dbt/profiles.yml b/mlops/homework_03/dbt/profiles.yml new file mode 100755 index 000000000..90599f894 --- /dev/null +++ b/mlops/homework_03/dbt/profiles.yml @@ -0,0 +1,9 @@ +# https://docs.getdbt.com/docs/core/connect-data-platform/profiles.yml + +base: + outputs: + + dev: + type: duckdb + + target: dev diff --git a/mlops/homework_03/extensions/__init__.py b/mlops/homework_03/extensions/__init__.py new file mode 100755 index 000000000..e69de29bb diff --git a/mlops/homework_03/interactions/__init__.py b/mlops/homework_03/interactions/__init__.py new file mode 100755 index 000000000..e69de29bb diff --git a/mlops/homework_03/io_config.yaml b/mlops/homework_03/io_config.yaml new file mode 100755 index 000000000..80b4d9cef --- /dev/null +++ b/mlops/homework_03/io_config.yaml @@ -0,0 +1,134 @@ +version: 0.1.1 +default: + # Default profile created for data IO access. + # Add your credentials for the source you use, and delete the rest. 
diff --git a/mlops/homework_03/io_config.yaml b/mlops/homework_03/io_config.yaml
new file mode 100755
index 000000000..80b4d9cef
--- /dev/null
+++ b/mlops/homework_03/io_config.yaml
@@ -0,0 +1,134 @@
+version: 0.1.1
+default:
+  # Default profile created for data IO access.
+  # Add your credentials for the source you use, and delete the rest.
+  # AWS
+  AWS_ACCESS_KEY_ID: "{{ env_var('AWS_ACCESS_KEY_ID') }}"
+  AWS_SECRET_ACCESS_KEY: "{{ env_var('AWS_SECRET_ACCESS_KEY') }}"
+  AWS_SESSION_TOKEN: session_token (Used to generate Redshift credentials)
+  AWS_REGION: region
+  # Algolia
+  ALGOLIA_APP_ID: app_id
+  ALGOLIA_API_KEY: api_key
+  ALGOLIA_INDEX_NAME: index_name
+  # Azure
+  AZURE_CLIENT_ID: "{{ env_var('AZURE_CLIENT_ID') }}"
+  AZURE_CLIENT_SECRET: "{{ env_var('AZURE_CLIENT_SECRET') }}"
+  AZURE_STORAGE_ACCOUNT_NAME: "{{ env_var('AZURE_STORAGE_ACCOUNT_NAME') }}"
+  AZURE_TENANT_ID: "{{ env_var('AZURE_TENANT_ID') }}"
+  # Chroma
+  CHROMA_COLLECTION: collection_name
+  CHROMA_PATH: path
+  # Clickhouse
+  CLICKHOUSE_DATABASE: default
+  CLICKHOUSE_HOST: host.docker.internal
+  CLICKHOUSE_INTERFACE: http
+  CLICKHOUSE_PASSWORD: null
+  CLICKHOUSE_PORT: 8123
+  CLICKHOUSE_USERNAME: null
+  # Druid
+  DRUID_HOST: hostname
+  DRUID_PASSWORD: password
+  DRUID_PATH: /druid/v2/sql/
+  DRUID_PORT: 8082
+  DRUID_SCHEME: http
+  DRUID_USER: user
+  # DuckDB
+  DUCKDB_DATABASE: database
+  DUCKDB_SCHEMA: main
+  # Google
+  GOOGLE_SERVICE_ACC_KEY:
+    type: service_account
+    project_id: project-id
+    private_key_id: key-id
+    private_key: "-----BEGIN PRIVATE KEY-----\nyour_private_key\n-----END_PRIVATE_KEY"
+    client_email: your_service_account_email
+    auth_uri: "https://accounts.google.com/o/oauth2/auth"
+    token_uri: "https://accounts.google.com/o/oauth2/token"
+    auth_provider_x509_cert_url: "https://www.googleapis.com/oauth2/v1/certs"
+    client_x509_cert_url: "https://www.googleapis.com/robot/v1/metadata/x509/your_service_account_email"
+  GOOGLE_SERVICE_ACC_KEY_FILEPATH: "/path/to/your/service/account/key.json"
+  GOOGLE_LOCATION: US # Optional
+  # MongoDB
+  # Specify either the connection string or the (host, password, user, port) to connect to MongoDB.
+  MONGODB_CONNECTION_STRING: "mongodb://{username}:{password}@{host}:{port}/"
+  MONGODB_HOST: host
+  MONGODB_PORT: 27017
+  MONGODB_USER: user
+  MONGODB_PASSWORD: password
+  MONGODB_DATABASE: database
+  MONGODB_COLLECTION: collection
+  # MSSQL
+  MSSQL_DATABASE: database
+  MSSQL_SCHEMA: schema
+  MSSQL_DRIVER: "ODBC Driver 18 for SQL Server"
+  MSSQL_HOST: host
+  MSSQL_PASSWORD: password
+  MSSQL_PORT: 1433
+  MSSQL_USER: SA
+  # MySQL
+  MYSQL_DATABASE: database
+  MYSQL_HOST: host
+  MYSQL_PASSWORD: password
+  MYSQL_PORT: 3306
+  MYSQL_USER: root
+  # Pinot
+  PINOT_HOST: hostname
+  PINOT_PASSWORD: password
+  PINOT_PATH: /query/sql
+  PINOT_PORT: 8000
+  PINOT_SCHEME: http
+  PINOT_USER: user
+  # PostgreSQL
+  POSTGRES_CONNECT_TIMEOUT: 10
+  POSTGRES_DBNAME: postgres
+  POSTGRES_SCHEMA: public # Optional
+  POSTGRES_USER: username
+  POSTGRES_PASSWORD: password
+  POSTGRES_HOST: hostname
+  POSTGRES_PORT: 5432
+  # Qdrant
+  QDRANT_COLLECTION: collection
+  QDRANT_PATH: path
+  # Redshift
+  REDSHIFT_SCHEMA: public # Optional
+  REDSHIFT_DBNAME: redshift_db_name
+  REDSHIFT_HOST: redshift_cluster_id.identifier.region.redshift.amazonaws.com
+  REDSHIFT_PORT: 5439
+  REDSHIFT_TEMP_CRED_USER: temp_username
+  REDSHIFT_TEMP_CRED_PASSWORD: temp_password
+  REDSHIFT_DBUSER: redshift_db_user
+  REDSHIFT_CLUSTER_ID: redshift_cluster_id
+  REDSHIFT_IAM_PROFILE: default
+  # Snowflake
+  SNOWFLAKE_USER: username
+  SNOWFLAKE_PASSWORD: password
+  SNOWFLAKE_ACCOUNT: account_id.region
+  SNOWFLAKE_DEFAULT_WH: null # Optional default warehouse
+  SNOWFLAKE_DEFAULT_DB: null # Optional default database
+  SNOWFLAKE_DEFAULT_SCHEMA: null # Optional default schema
+  SNOWFLAKE_PRIVATE_KEY_PASSPHRASE: null # Optional private key passphrase
+  SNOWFLAKE_PRIVATE_KEY_PATH: null # Optional private key path
+  SNOWFLAKE_ROLE: null # Optional role name
+  SNOWFLAKE_TIMEOUT: null # Optional timeout in seconds
+  # Trino
+  trino:
+    catalog: postgresql # Change this to the catalog of your choice
+    host: 127.0.0.1
+    http_headers:
+      X-Something: 'mage=power'
+    http_scheme: http
+    password: mage1337 # Optional
+    port: 8080
+    schema: core_data
+    session_properties: # Optional
+      acc01.optimize_locality_enabled: false
+      optimize_hash_generation: true
+    source: trino-cli # Optional
+    user: admin
+    verify: /path/to/your/ca.crt # Optional
+  # Weaviate
+  WEAVIATE_ENDPOINT: https://some-endpoint.weaviate.network
+  WEAVIATE_INSTANCE_API_KEY: YOUR-WEAVIATE-API-KEY
+  WEAVIATE_INFERENCE_API_KEY: YOUR-OPENAI-API-KEY
+  WEAVIATE_COLLECTION: collection_name
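These profiles are what Mage's IO clients consume. A sketch of how a block would use the Postgres entries, mirroring Mage's own loader templates (assumes a real database is configured under the default profile):

    from os import path

    from mage_ai.io.config import ConfigFileLoader
    from mage_ai.io.postgres import Postgres
    from mage_ai.settings.repo import get_repo_path

    # 'default' selects the profile defined in io_config.yaml above
    config_path = path.join(get_repo_path(), 'io_config.yaml')
    config_profile = 'default'

    with Postgres.with_config(ConfigFileLoader(config_path, config_profile)) as loader:
        df = loader.load('SELECT 1')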
diff --git a/mlops/homework_03/metadata.yaml b/mlops/homework_03/metadata.yaml
new file mode 100755
index 000000000..bf30d6f5d
--- /dev/null
+++ b/mlops/homework_03/metadata.yaml
@@ -0,0 +1,55 @@
+project_type: standalone
+
+variables_dir: ~/.mage_data
+# remote_variables_dir: s3://bucket/path_prefix
+
+variables_retention_period: '90d'
+
+emr_config:
+  # You can customize the EMR cluster instance size with these two parameters
+  master_instance_type: 'r5.4xlarge'
+  slave_instance_type: 'r5.4xlarge'
+
+  # Configure security groups for EMR cluster instances.
+  # The default managed security groups are ElasticMapReduce-master and ElasticMapReduce-slave.
+  # master_security_group: 'sg-xxxxxxxxxxxx'
+  # slave_security_group: 'sg-yyyyyyyyyyyy'
+
+  # If you want to SSH-tunnel into the EMR cluster, ec2_key_name must be configured.
+  # You can create a key pair at https://console.aws.amazon.com/ec2#KeyPairs and download the key file.
+  # ec2_key_name: '[ec2_key_pair_name]'
+
+spark_config:
+  # Application name
+  app_name: 'my spark app'
+  # Master URL to connect to
+  # e.g., spark_master: 'spark://host:port', or spark_master: 'yarn'
+  spark_master: 'local'
+  # Executor environment variables
+  # e.g., executor_env: {'PYTHONPATH': '/home/path'}
+  executor_env: {}
+  # Jar files to be uploaded to the cluster and added to the classpath
+  # e.g., spark_jars: ['/home/path/example1.jar']
+  spark_jars: []
+  # Path where Spark is installed on worker nodes
+  # e.g., spark_home: '/usr/lib/spark'
+  spark_home:
+  # List of key-value pairs to be set in SparkConf
+  # e.g., others: {'spark.executor.memory': '4g', 'spark.executor.cores': '2'}
+  others: {}
+  # Whether to create a custom SparkSession via code and set it in kwargs['context']
+  use_custom_session: false
+  # The variable name to set in kwargs['context'],
+  # e.g., kwargs['context']['spark'] = spark_session
+  custom_session_var_name: 'spark'
+
+help_improve_mage: true
+notification_config:
+  alert_on:
+    - trigger_failure
+    - trigger_passed_sla
+  slack_config:
+    webhook_url: "{{ env_var('MAGE_SLACK_WEBHOOK_URL') }}"
+  teams_config:
+    webhook_url: "{{ env_var('MAGE_TEAMS_WEBHOOK_URL') }}"
+project_uuid: homework_03
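The use_custom_session and custom_session_var_name settings mean a block can pick the SparkSession out of its context once use_custom_session is flipped to true. A sketch of what that looks like inside a block, under that assumption:

    # Inside any block function, with use_custom_session: true
    def transform(df, *args, **kwargs):
        # Mage sets kwargs['context']['spark']; the key matches
        # custom_session_var_name above.
        spark = kwargs['context']['spark']
        return spark.createDataFrame(df)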
diff --git a/mlops/homework_03/pipelines/__init__.py b/mlops/homework_03/pipelines/__init__.py
new file mode 100755
index 000000000..e69de29bb
diff --git a/mlops/homework_03/pipelines/data__preparation/__init__.py b/mlops/homework_03/pipelines/data__preparation/__init__.py
new file mode 100755
index 000000000..e69de29bb
diff --git a/mlops/homework_03/pipelines/data__preparation/interactions.yaml b/mlops/homework_03/pipelines/data__preparation/interactions.yaml
new file mode 100644
index 000000000..a1d40f831
--- /dev/null
+++ b/mlops/homework_03/pipelines/data__preparation/interactions.yaml
@@ -0,0 +1,2 @@
+blocks: {}
+layout: []
diff --git a/mlops/homework_03/pipelines/data__preparation/metadata.yaml b/mlops/homework_03/pipelines/data__preparation/metadata.yaml
new file mode 100755
index 000000000..6f012911e
--- /dev/null
+++ b/mlops/homework_03/pipelines/data__preparation/metadata.yaml
@@ -0,0 +1,80 @@
+blocks:
+- all_upstream_blocks_executed: true
+  color: null
+  configuration:
+    file_source:
+      path: homework_03/data_loaders/ingest.py
+  downstream_blocks:
+  - prepare
+  executor_config: null
+  executor_type: local_python
+  has_callback: false
+  language: python
+  name: Ingest
+  retry_config: null
+  status: executed
+  timeout: null
+  type: data_loader
+  upstream_blocks: []
+  uuid: ingest
+- all_upstream_blocks_executed: true
+  color: null
+  configuration:
+    file_source:
+      path: homework_03/transformers/prepare.py
+  downstream_blocks:
+  - build
+  executor_config: null
+  executor_type: local_python
+  has_callback: false
+  language: python
+  name: Prepare
+  retry_config: null
+  status: executed
+  timeout: null
+  type: transformer
+  upstream_blocks:
+  - ingest
+  uuid: prepare
+- all_upstream_blocks_executed: true
+  color: null
+  configuration:
+    file_source:
+      path: homework_03/data_exporters/build.py
+  downstream_blocks: []
+  executor_config: null
+  executor_type: local_python
+  has_callback: false
+  language: python
+  name: Build
+  retry_config: null
+  status: updated
+  timeout: null
+  type: data_exporter
+  upstream_blocks:
+  - prepare
+  uuid: build
+cache_block_output_in_memory: false
+callbacks: []
+concurrency_config: {}
+conditionals: []
+created_at: '2024-06-08 08:12:33.660390+00:00'
+data_integration: null
+description: Load data and perform feature engineering
+executor_config: {}
+executor_count: 1
+executor_type: null
+extensions: {}
+name: Data preparation
+notification_config: {}
+remote_variables_dir: null
+retry_config: {}
+run_pipeline_in_one_process: false
+settings:
+  triggers: null
+spark_config: {}
+tags: []
+type: python
+uuid: data__preparation
+variables_dir: /home/src/mage_data/homework_03
+widgets: []
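This pipeline wires ingest into prepare into build, each block's return value feeding the next. Outside of Mage, the same DAG is plain function composition; a sketch that assumes mage_ai is installed and mlops/ is on PYTHONPATH:

    from homework_03.data_loaders.ingest import load_data
    from homework_03.transformers.prepare import transform
    from homework_03.data_exporters.build import export_data

    df = load_data()              # download the March 2023 parquet
    df = transform(df)            # add duration, filter outliers, cast IDs
    model, dv = export_data(df)   # train and log to MLflow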
diff --git a/mlops/homework_03/pipelines/example_pipeline/__init__.py b/mlops/homework_03/pipelines/example_pipeline/__init__.py
new file mode 100755
index 000000000..e69de29bb
diff --git a/mlops/homework_03/pipelines/example_pipeline/metadata.yaml b/mlops/homework_03/pipelines/example_pipeline/metadata.yaml
new file mode 100755
index 000000000..c04cfbcf8
--- /dev/null
+++ b/mlops/homework_03/pipelines/example_pipeline/metadata.yaml
@@ -0,0 +1,30 @@
+blocks:
+- all_upstream_blocks_executed: true
+  downstream_blocks:
+  - fill_in_missing_values
+  name: load_titanic
+  status: not_executed
+  type: data_loader
+  upstream_blocks: []
+  uuid: load_titanic
+- all_upstream_blocks_executed: true
+  downstream_blocks:
+  - export_titanic_clean
+  name: fill_in_missing_values
+  status: not_executed
+  type: transformer
+  upstream_blocks:
+  - load_titanic
+  uuid: fill_in_missing_values
+- all_upstream_blocks_executed: true
+  downstream_blocks: []
+  name: export_titanic_clean
+  status: not_executed
+  type: data_exporter
+  upstream_blocks:
+  - fill_in_missing_values
+  uuid: export_titanic_clean
+name: example_pipeline
+type: python
+uuid: example_pipeline
+widgets: []
diff --git a/mlops/homework_03/pipelines/sklearn_training/__init__.py b/mlops/homework_03/pipelines/sklearn_training/__init__.py
new file mode 100755
index 000000000..e69de29bb
diff --git a/mlops/homework_03/pipelines/sklearn_training/metadata.yaml b/mlops/homework_03/pipelines/sklearn_training/metadata.yaml
new file mode 100755
index 000000000..a05fec5ca
--- /dev/null
+++ b/mlops/homework_03/pipelines/sklearn_training/metadata.yaml
@@ -0,0 +1,6 @@
+created_at: '2024-06-08 20:49:13.470524+00:00'
+description: Pipeline for training a LinearRegression model
+name: sklearn training
+tags: []
+type: python
+uuid: sklearn_training
diff --git a/mlops/homework_03/requirements.txt b/mlops/homework_03/requirements.txt
new file mode 100755
index 000000000..e69de29bb
diff --git a/mlops/homework_03/scratchpads/__init__.py b/mlops/homework_03/scratchpads/__init__.py
new file mode 100755
index 000000000..e69de29bb
diff --git a/mlops/homework_03/transformers/__init__.py b/mlops/homework_03/transformers/__init__.py
new file mode 100755
index 000000000..e69de29bb
diff --git a/mlops/homework_03/transformers/fill_in_missing_values.py b/mlops/homework_03/transformers/fill_in_missing_values.py
new file mode 100755
index 000000000..b9761c3e2
--- /dev/null
+++ b/mlops/homework_03/transformers/fill_in_missing_values.py
@@ -0,0 +1,45 @@
+from pandas import DataFrame
+import math
+
+if 'transformer' not in globals():
+    from mage_ai.data_preparation.decorators import transformer
+if 'test' not in globals():
+    from mage_ai.data_preparation.decorators import test
+
+
+def select_number_columns(df: DataFrame) -> DataFrame:
+    return df[['Age', 'Fare', 'Parch', 'Pclass', 'SibSp', 'Survived']]
+
+
+def fill_missing_values_with_median(df: DataFrame) -> DataFrame:
+    # Note: for an even number of values this picks the upper of the two
+    # middle elements, so it only approximates the median.
+    for col in df.columns:
+        values = sorted(df[col].dropna().tolist())
+        median_value = values[math.floor(len(values) / 2)]
+        df[[col]] = df[[col]].fillna(median_value)
+    return df
+
+
+@transformer
+def transform_df(df: DataFrame, *args, **kwargs) -> DataFrame:
+    """
+    Keep the numeric Titanic columns and fill missing values with the
+    (approximate) per-column median.
+
+    Args:
+        df (DataFrame): Data frame from parent block.
+
+    Returns:
+        DataFrame: Transformed data frame
+    """
+    return fill_missing_values_with_median(select_number_columns(df))
+
+
+@test
+def test_output(df) -> None:
+    """
+    Test the output of the block.
+    """
+    assert df is not None, 'The output is undefined'
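fill_missing_values_with_median picks the upper-middle element, so for columns with an even number of values it only approximates the median (see the note added in the function). pandas computes the exact median directly; a drop-in sketch:

    from pandas import DataFrame

    def fill_missing_values_with_median(df: DataFrame) -> DataFrame:
        # Series.median() skips NaN and averages the two middle values
        # when the count is even.
        for col in df.columns:
            df[col] = df[col].fillna(df[col].median())
        return df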
""" + Template code for a transformer block. + + Add more parameters to this function if this block has multiple parent blocks. + There should be one parameter for each output variable from each parent block. + + Args: + df (DataFrame): Data frame from parent block. + + Returns: + DataFrame: Transformed data frame + """ + # Specify your transformation logic here + + return fill_missing_values_with_median(select_number_columns(df)) + + +@test +def test_output(df) -> None: + """ + Template code for testing the output of the block. + """ + assert df is not None, 'The output is undefined' diff --git a/mlops/homework_03/transformers/prepare.py b/mlops/homework_03/transformers/prepare.py new file mode 100644 index 000000000..847578579 --- /dev/null +++ b/mlops/homework_03/transformers/prepare.py @@ -0,0 +1,38 @@ +import pandas as pd +# from mlops.homework_03.data_loaders import ingest + +if 'transformer' not in globals(): + from mage_ai.data_preparation.decorators import transformer + + +@transformer +def transform(data, *args, **kwargs): + """ + Template code for a transformer block. + + Add more parameters to this function if this block has multiple parent blocks. + There should be one parameter for each output variable from each parent block. + + Args: + data: The output from the upstream parent block + args: The output from any additional upstream blocks (if applicable) + + Returns: + Anything (e.g. data frame, dictionary, array, int, str, etc.) + """ + # Specify your transformation logic here + # def read_dataframe(filename): + df = data + + df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime) + df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime) + + df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime + df.duration = df.duration.dt.total_seconds() / 60 + + df = df[(df.duration >= 1) & (df.duration <= 60)] + + categorical = ['PULocationID', 'DOLocationID'] + df[categorical] = df[categorical].astype(str) + + return df diff --git a/mlops/homework_03/utils/__init__.py b/mlops/homework_03/utils/__init__.py new file mode 100755 index 000000000..e69de29bb diff --git a/mlops/metadata.yaml b/mlops/metadata.yaml index 4eb3b7742..cc0dcfee9 100644 --- a/mlops/metadata.yaml +++ b/mlops/metadata.yaml @@ -15,3 +15,4 @@ features: operation_history: true polars: true help_improve_mage: true +project_uuid: 8a1d9ffcca6e42fa98d38f4b43c70ece diff --git a/mlops/requirements.txt b/mlops/requirements.txt index 5c611efcf..b9de29417 100755 --- a/mlops/requirements.txt +++ b/mlops/requirements.txt @@ -9,3 +9,4 @@ scikit-learn seaborn shap xgboost +joblib diff --git a/mlops/settings.yaml b/mlops/settings.yaml index 48b62b147..32fb040be 100644 --- a/mlops/settings.yaml +++ b/mlops/settings.yaml @@ -1,4 +1,6 @@ projects: + homework_03: + path: homework_03 unit_0_setup: {} unit_1_data_preparation: {} unit_2_training: {}