diff --git a/pytorch/fraud-detection/Fraud-notebook-PyTorch.ipynb b/pytorch/fraud-detection/Fraud-notebook-PyTorch.ipynb new file mode 100644 index 0000000..38d0a22 --- /dev/null +++ b/pytorch/fraud-detection/Fraud-notebook-PyTorch.ipynb @@ -0,0 +1,575 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "12d3c59f", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "id": "4b2cedbc", + "metadata": {}, + "source": [ + "# Comet + PyTorch: Credit Card Fraud Detection" + ] + }, + { + "cell_type": "markdown", + "id": "62fffd8a", + "metadata": {}, + "source": [ + "[Comet](https://www.comet.com/site/products/ml-experiment-tracking/?utm_campaign=pytorch&utm_medium=colab) is an MLOps platform designed to help data scientists and teams build better models faster. Comet provides tooling to track, explain, manage, and monitor your models in a single place. It works with Jupyter notebooks and scripts.\n", + "\n", + "[PyTorch](https://pytorch.org/) is a popular open source machine learning framework based on the Torch library, used for applications such as computer vision and natural language processing.\n", + "\n", + "PyTorch enables fast, flexible experimentation and efficient production through a user-friendly front-end, distributed training, and an ecosystem of tools and libraries.\n", + "\n", + "Instrument PyTorch with Comet to start managing experiments, create dataset versions, and track hyperparameters for faster and easier reproducibility and collaboration.\n", + "\n", + "[Find more information about our integration with PyTorch](https://www.comet.ml/docs/v2/integrations/ml-frameworks/pytorch/)\n", + "\n", + "Curious about how Comet can help you build better models, faster? 
Find out more about [Comet](https://www.comet.com/site/products/ml-experiment-tracking/?utm_campaign=pytorch&utm_medium=colab) and our [other integrations](https://www.comet.ml/docs/v2/integrations/overview/)\n" + ] + }, + { + "cell_type": "markdown", + "id": "5e7aac45", + "metadata": {}, + "source": [ + "## Importing Required Packages" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d1c2e6c2", + "metadata": {}, + "outputs": [], + "source": [ + "from comet_ml import Experiment\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "import seaborn as sns\n", + "\n", + "# PyTorch imports\n", + "import torch\n", + "import torch.nn as nn\n", + "import torch.optim as optim\n", + "from torch.utils.data import DataLoader, TensorDataset\n", + "\n", + "# Sklearn imports\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve\n", + "from comet_ml.integration.pytorch import log_model, watch\n", + "\n", + "# Seed PyTorch so weight init, dropout and batch shuffling are reproducible, then pick the device\n", + "torch.manual_seed(42)\n", + "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", + "print(f\"PyTorch version: {torch.__version__}\")\n", + "print(f\"Using device: {device}\")" + ] + }, + { + "cell_type": "markdown", + "id": "9c241c90", + "metadata": {}, + "source": [ + "## Load Data\n", + "\n", + "Let's read our data using the pandas library. 
The dataset can be downloaded from Kaggle at https://www.kaggle.com/code/rawaaelghali/credit-card-fraud-detection-using-xgboost/data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "24f0e7e7", + "metadata": {}, + "outputs": [], + "source": [ + "# Load the credit card fraud dataset\n", + "# Unzip creditcard.csv.zip (shipped next to this notebook) or download creditcard.csv from Kaggle first\n", + "df = pd.read_csv('./creditcard.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cccacc21", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"-\" * 50)\n", + "print('Shape of the dataframe:', df.shape)\n", + "print(\"Number of records in dataset:\", df.shape[0])\n", + "print(\"\\nInformation of the dataset:\")\n", + "df.info()\n", + "print(\"-\" * 50)\n", + "print(\"\\nFirst 5 records of the dataset:\")\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "3d2fd82f-header", + "metadata": {}, + "source": [ + "## Initialize Comet Experiment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d2fd82f", + "metadata": {}, + "outputs": [], + "source": [ + "# Instantiate Comet Experiment\n", + "experiment = Experiment(\n", + "    project_name='fraud_detection',\n", + "    # api_key=\"YOUR_API_KEY\",  # Uncomment and add your API key\n", + "    # workspace=\"YOUR_WORKSPACE\"  # Uncomment and add your workspace\n", + ")\n", + "\n", + "experiment.add_tag('pytorch')" + ] + }, + { + "cell_type": "markdown", + "id": "9d4e107b", + "metadata": {}, + "source": [ + "## Log Dataframe Profile" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b7f0c4c", + "metadata": {}, + "outputs": [], + "source": [ + "# Log pandas profiling report to Comet\n", + "experiment.log_dataframe_profile(df, \"pandas_profiling_full\", minimal=True)" + ] + }, + { + "cell_type": "markdown", + "id": "08a6119e", + "metadata": {}, + "source": [ + "## Log Dataset Artifact" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"id": "cb831699", + "metadata": {}, + "outputs": [], + "source": [ + "from comet_ml import Artifact\n", + "\n", + "# Create dataset artifact\n", + "artifact = Artifact(\n", + "    name=\"fraud-dataset-pytorch\",\n", + "    artifact_type=\"dataset\",\n", + "    aliases=[\"raw\"]\n", + ")\n", + "\n", + "artifact.add('./creditcard.csv')\n", + "\n", + "# Log artifact\n", + "experiment.log_artifact(artifact)" + ] + }, + { + "cell_type": "markdown", + "id": "fb78b855", + "metadata": {}, + "source": [ + "## Data Exploration and Visualization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e58aa9be", + "metadata": {}, + "outputs": [], + "source": [ + "# Check class distribution\n", + "print(\"Class distribution:\")\n", + "print(df['Class'].value_counts())\n", + "\n", + "# Log class counts (cast to a built-in int so the values serialize cleanly)\n", + "fraud_count = int(df['Class'].sum())\n", + "non_fraud_count = len(df) - fraud_count\n", + "experiment.log_others({'fraud_count': fraud_count, 'non_fraud_count': non_fraud_count})\n", + "# Visualize class distribution\n", + "plt.figure(figsize=(8, 6))\n", + "sns.countplot(x='Class', data=df)\n", + "plt.title('Class Distribution (0: Non-Fraud, 1: Fraud)')\n", + "experiment.log_figure(\"class_distribution\", plt)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "5542e068-scale", + "metadata": {}, + "source": [ + "## Data Preprocessing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9adf05b7", + "metadata": {}, + "outputs": [], + "source": [ + "# Scale the 'Amount' column (ravel: fit_transform returns an (n, 1) array)\n", + "scaler = StandardScaler()\n", + "df['Amount'] = scaler.fit_transform(df['Amount'].values.reshape(-1, 1)).ravel()\n", + "# NOTE(review): fitting the scaler before the train/test split leaks test-set statistics; fit on training rows only for a strict evaluation\n", + "# Drop the 'Time' column as it's not useful for our model\n", + "df = df.drop(columns=['Time'], errors='ignore')  # errors='ignore' keeps this cell re-runnable\n", + "\n", + "print(df['Amount'].head(10))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "split-features", + "metadata": {}, + "outputs": [], + "source": [ + "# Split features and target\n", + "X = df.drop('Class', axis=1).values.astype(np.float32)  # Convert to float32 for faster training\n", + "y = 
df['Class'].values.astype(np.float32)\n", + "\n", + "print(f\"Features shape: {X.shape}\")\n", + "print(f\"Target shape: {y.shape}\")\n", + "print(f\"Fraud cases: {int(y.sum())} ({y.sum()/len(y)*100:.2f}%)\")" + ] + }, + { + "cell_type": "markdown", + "id": "5542e068", + "metadata": {}, + "source": [ + "## Build PyTorch Model\n", + "Once the model has been defined, use [watch(model)](https://www.comet.com/docs/v2/integrations/ml-frameworks/pytorch/#weightsbiases-and-gradients-logging) to auto-log weights, biases, and gradients to Comet." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "853de1ac", + "metadata": {}, + "outputs": [], + "source": [ + "# Hyperparameters for the data split, optimizer and network; logged to Comet below\n", + "hyper_params = {\n", + "    'test_size': 0.2,\n", + "    'learning_rate': 0.001,\n", + "    'epochs': 20,  # Reduced for faster demo (increase for production)\n", + "    'batch_size': 2048,  # Increased for faster training with large dataset\n", + "    'hidden_units_1': 128,\n", + "    'hidden_units_2': 64,\n", + "    'hidden_units_3': 32,\n", + "    'dropout_rate': 0.3\n", + "}\n", + "\n", + "experiment.log_parameters(hyper_params)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a437a17f", + "metadata": {}, + "outputs": [], + "source": [ + "# Stratified train/test split so both sets keep the same fraud ratio\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + "    X, y,\n", + "    test_size=hyper_params['test_size'],\n", + "    random_state=42,\n", + "    stratify=y\n", + ")\n", + "\n", + "print(f\"Training set size: {X_train.shape[0]}\")\n", + "print(f\"Test set size: {X_test.shape[0]}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "convert-tensors", + "metadata": {}, + "outputs": [], + "source": [ + "# Convert the NumPy arrays to float tensors on the selected device\n", + "X_train_tensor = torch.FloatTensor(X_train).to(device)\n", + "y_train_tensor = torch.FloatTensor(y_train).to(device)\n", + "X_test_tensor = torch.FloatTensor(X_test).to(device)\n", + "y_test_tensor = 
torch.FloatTensor(y_test).to(device)\n", + "\n", + "# Wrap the tensors in datasets and batch them; only the training batches are shuffled\n", + "train_dataset = TensorDataset(X_train_tensor, y_train_tensor)\n", + "train_loader = DataLoader(train_dataset, batch_size=hyper_params['batch_size'], shuffle=True)\n", + "\n", + "test_dataset = TensorDataset(X_test_tensor, y_test_tensor)\n", + "test_loader = DataLoader(test_dataset, batch_size=hyper_params['batch_size'], shuffle=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "686a446c", + "metadata": {}, + "outputs": [], + "source": [ + "class FraudDetectionNet(nn.Module):\n", + "    \"\"\"Feed-forward binary classifier: three Linear-BatchNorm-ReLU-Dropout blocks and a sigmoid output unit.\"\"\"\n", + "\n", + "    def __init__(self, input_dim, params):\n", + "        super().__init__()\n", + "        # Hidden block 1: input_dim -> hidden_units_1\n", + "        self.fc1 = nn.Linear(input_dim, params['hidden_units_1'])\n", + "        self.bn1 = nn.BatchNorm1d(params['hidden_units_1'])\n", + "        self.dropout1 = nn.Dropout(params['dropout_rate'])\n", + "        # Hidden block 2: hidden_units_1 -> hidden_units_2\n", + "        self.fc2 = nn.Linear(params['hidden_units_1'], params['hidden_units_2'])\n", + "        self.bn2 = nn.BatchNorm1d(params['hidden_units_2'])\n", + "        self.dropout2 = nn.Dropout(params['dropout_rate'])\n", + "        # Hidden block 3: hidden_units_2 -> hidden_units_3\n", + "        self.fc3 = nn.Linear(params['hidden_units_2'], params['hidden_units_3'])\n", + "        self.bn3 = nn.BatchNorm1d(params['hidden_units_3'])\n", + "        self.dropout3 = nn.Dropout(params['dropout_rate'])\n", + "        # Output layer: a single unit per input row\n", + "        self.fc4 = nn.Linear(params['hidden_units_3'], 1)\n", + "        # Parameter-free activations reused across layers\n", + "        self.relu = nn.ReLU()\n", + "        self.sigmoid = nn.Sigmoid()\n", + "\n", + "    def forward(self, x):\n", + "        \"\"\"Return one sigmoid output per input row of x.\"\"\"\n", + "        # Block 1: linear -> batch norm -> ReLU -> dropout\n", + "        hidden = self.dropout1(self.relu(self.bn1(self.fc1(x))))\n", + "        # Block 2\n", + "        hidden = self.dropout2(self.relu(self.bn2(self.fc2(hidden))))\n", + "        # Block 3\n", + "        hidden = self.dropout3(self.relu(self.bn3(self.fc3(hidden))))\n", + "\n", + "        # Squash the single output unit into (0, 1)\n", + "        probability = self.sigmoid(self.fc4(hidden))\n", + "        return probability\n", + "\n", + "# Instantiate the network with one input per feature column and move it to the device\n", + "model = FraudDetectionNet(X_train.shape[1], hyper_params).to(device)\n", + "\n", + "# Log weights, 
biases, and gradients to Comet using watch\n", + "watch(model)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "loss-optimizer", + "metadata": {}, + "outputs": [], + "source": [ + "# Define loss function and optimizer\n", + "criterion = nn.BCELoss()\n", + "optimizer = optim.Adam(model.parameters(), lr=hyper_params['learning_rate'])" + ] + }, + { + "cell_type": "markdown", + "id": "f843f8cd", + "metadata": {}, + "source": [ + "## Train the Model\n", + "\n", + "Train the model and log custom metrics to Comet. The loss metric is auto-logged; the epoch-average loss and accuracy are logged explicitly." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b39252a", + "metadata": {}, + "outputs": [], + "source": [ + "# Training loop: keeps a per-epoch history of loss and accuracy\n", + "train_losses = []\n", + "train_accuracies = []\n", + "\n", + "with experiment.train():\n", + "    for epoch in range(hyper_params['epochs']):\n", + "        model.train()\n", + "        running_loss = 0.0\n", + "        correct = 0\n", + "        total = 0\n", + "\n", + "        for batch_X, batch_y in train_loader:\n", + "            # Zero the gradients\n", + "            optimizer.zero_grad()\n", + "\n", + "            # Forward pass; squeeze only the feature axis so a batch of one stays 1-D like batch_y\n", + "            outputs = model(batch_X).squeeze(1)\n", + "            loss = criterion(outputs, batch_y)\n", + "\n", + "            # Backward pass and optimize\n", + "            loss.backward()\n", + "            optimizer.step()\n", + "            # Weight the batch-mean loss by batch size so the epoch loss is a true per-sample mean\n", + "            running_loss += loss.item() * batch_y.size(0)\n", + "\n", + "            # Calculate accuracy\n", + "            predicted = (outputs > 0.5).float()\n", + "            total += batch_y.size(0)\n", + "            correct += (predicted == batch_y).sum().item()\n", + "\n", + "        epoch_loss = running_loss / total\n", + "        epoch_acc = correct / total\n", + "\n", + "        train_losses.append(epoch_loss)\n", + "        train_accuracies.append(epoch_acc)\n", + "\n", + "        # Log epoch-level metrics to Comet (logged inside the experiment.train() context)\n", + "        experiment.log_metric('epoch_loss', epoch_loss, step=epoch)\n", + "        experiment.log_metric('accuracy', epoch_acc, step=epoch)\n", + "        experiment.log_current_epoch(epoch)\n", + "\n", + "        if (epoch + 1) % 10 == 0:\n", + "            print(f'Epoch 
[{epoch+1}/{hyper_params[\"epochs\"]}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.4f}')" + ] + }, + { + "cell_type": "markdown", + "id": "752ff259", + "metadata": {}, + "source": [ + "## Log Model\n", + "Log the model to Comet using Comet's [PyTorch integration](https://www.comet.com/docs/v2/integrations/ml-frameworks/pytorch/#pytorch-model-saving-and-loading). Logging the model will later allow us to register it in Comet's Model Registry. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "save-model", + "metadata": {}, + "outputs": [], + "source": [ + "# Save and log the model to Comet\n", + "# log_model from comet_ml.integration.pytorch is used here instead of the manual\n", + "# torch.save(model.state_dict(), ...) + experiment.log_model(...) workflow\n", + "\n", + "log_model(experiment, model, \"pytorch-fraud-model\")" + ] + }, + { + "cell_type": "markdown", + "id": "08bdd6cd", + "metadata": {}, + "source": [ + "## Evaluate the Model\n", + "Use `log_metric` to log any evaluation metrics to Comet. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a25bccd1", + "metadata": {}, + "outputs": [], + "source": [ + "# Evaluation on the held-out test set\n", + "with experiment.test():\n", + "    model.eval()\n", + "    with torch.no_grad():\n", + "        y_pred_proba = model(X_test_tensor).cpu().numpy().flatten()\n", + "        y_pred = (y_pred_proba > 0.5).astype(int)\n", + "\n", + "    # Calculate metrics; precision, recall and F1 all describe the positive (fraud) class\n", + "    accuracy = accuracy_score(y_test, y_pred)\n", + "    precision = precision_score(y_test, y_pred)\n", + "    recall = recall_score(y_test, y_pred)\n", + "    f1 = f1_score(y_test, y_pred)  # binary F1; macro averaging would be inflated by the majority class\n", + "\n", + "    print(f\"Accuracy: {accuracy * 100:.2f}%\")\n", + "    print(f\"Precision: {precision * 100:.2f}%\")\n", + "    print(f\"Recall: {recall * 100:.2f}%\")\n", + "    print(f\"F1 Score: {f1 * 100:.2f}%\")\n", + "\n", + "    # Log metrics to Comet\n", + "    experiment.log_metric('accuracy', accuracy)\n", + "    experiment.log_metric('precision', precision)\n", + "    experiment.log_metric('recall', recall)\n", + "    experiment.log_metric('f1_score', f1)" + ] + }, + { + "cell_type": "markdown", + "id": "e63be58d", + "metadata": {}, + "source": [ + "## Log Curve\n", + "Log the ROC curve to Comet so it can be displayed interactively in Comet's Curves panel. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6063e64", + "metadata": {}, + "outputs": [], + "source": [ + "# Compute ROC points from the predicted probabilities and log them as a Comet curve\n", + "fpr, tpr, _ = roc_curve(y_test, y_pred_proba)\n", + "experiment.log_curve(\"ROC_Curve\", x=fpr, y=tpr)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ec983a66", + "metadata": {}, + "outputs": [], + "source": [ + "# End the experiment\n", + "experiment.end()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/pytorch/fraud-detection/creditcard.csv.zip b/pytorch/fraud-detection/creditcard.csv.zip new file mode 100644 index 0000000..f8ed07c Binary files /dev/null and b/pytorch/fraud-detection/creditcard.csv.zip differ