diff --git a/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/Dockerfile b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/Dockerfile
new file mode 100644
index 000000000..f5c02c562
--- /dev/null
+++ b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/Dockerfile
@@ -0,0 +1,30 @@
+# Use Python 3.12 slim (already has Python and pip).
+FROM python:3.12-slim
+
+# Avoid interactive prompts during apt operations.
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Install CA certificates (needed for HTTPS); skip recommended extras to keep the image small.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    ca-certificates \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install project specific packages (no pip cache is left in the layer).
+RUN mkdir -p /install
+COPY requirements.txt /install/requirements.txt
+RUN pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir jupyterlab jupyterlab_vim jupytext -r /install/requirements.txt
+
+# Config.
+COPY etc_sudoers /install/
+COPY etc_sudoers /etc/sudoers
+COPY bashrc /root/.bashrc
+
+# Report package versions (note: `| tee` masks the script's exit status).
+COPY version.sh /install/
+RUN /install/version.sh 2>&1 | tee version.log
+
+# Jupyter.
+EXPOSE 8888
+
+CMD ["/bin/bash"]
diff --git a/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/Dockerfile.python_slim b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/Dockerfile.python_slim
new file mode 100644
index 000000000..cc8f18f2f
--- /dev/null
+++ b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/Dockerfile.python_slim
@@ -0,0 +1,28 @@
+# Use Python 3.12 slim (already has Python and pip).
+FROM python:3.12-slim
+
+# Avoid interactive prompts during apt operations.
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Install CA certificates (needed for HTTPS).
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    ca-certificates \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install project specific packages (no pip cache is left in the layer).
+RUN mkdir -p /install
+COPY requirements.txt /install/requirements.txt
+RUN pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir jupyterlab jupyterlab_vim jupytext -r /install/requirements.txt
+
+# Config.
+COPY etc_sudoers /install/
+COPY etc_sudoers /etc/sudoers
+COPY bashrc /root/.bashrc
+
+# Report package versions (note: `| tee` masks the script's exit status).
+COPY version.sh /install/
+RUN /install/version.sh 2>&1 | tee version.log
+
+# Jupyter.
+EXPOSE 8888
diff --git a/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/Dockerfile.ubuntu b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/Dockerfile.ubuntu
new file mode 100644
index 000000000..705105d91
--- /dev/null
+++ b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/Dockerfile.ubuntu
@@ -0,0 +1,40 @@
+FROM ubuntu:24.04
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Install system utilities and Python in a single layer.
+RUN apt-get update && \
+    apt-get upgrade -y && \
+    apt-get install -y --no-install-recommends \
+    sudo \
+    curl \
+    git \
+    build-essential \
+    python3 \
+    python3-pip \
+    python3-dev \
+    python3-venv \
+    && rm -rf /var/lib/apt/lists/*
+
+# Create virtual environment.
+RUN python3 -m venv /opt/venv
+
+# Make the venv the default Python.
+ENV PATH="/opt/venv/bin:$PATH"
+
+# Install project specific packages (no pip cache is left in the layer).
+RUN mkdir -p /install
+COPY requirements.txt /install/requirements.txt
+RUN pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir jupyterlab jupyterlab_vim jupytext -r /install/requirements.txt
+
+# Config.
+COPY etc_sudoers /install/
+COPY etc_sudoers /etc/sudoers
+COPY bashrc /root/.bashrc
+
+# Report package versions (note: `| tee` masks the script's exit status).
+COPY version.sh /install/
+RUN /install/version.sh 2>&1 | tee version.log
+
+# Jupyter.
+EXPOSE 8888
diff --git a/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/Dockerfile.uv b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/Dockerfile.uv
new file mode 100644
index 000000000..d3b2a0abc
--- /dev/null
+++ b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/Dockerfile.uv
@@ -0,0 +1,49 @@
+FROM ubuntu:24.04
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Install system utilities and Python in a single layer.
+RUN apt-get update && \
+    apt-get upgrade -y && \
+    apt-get install -y --no-install-recommends \
+    sudo \
+    curl \
+    git \
+    build-essential \
+    python3 \
+    python3-pip \
+    python3-dev \
+    python3-venv \
+    libgomp1 \
+    g++ \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install uv for package management (download first so a failed fetch fails the build instead of piping nothing into sh).
+RUN curl -LsSf https://astral.sh/uv/install.sh -o /tmp/uv-install.sh && sh /tmp/uv-install.sh && rm /tmp/uv-install.sh
+ENV PATH="/root/.local/bin:$PATH"
+
+# Install project specific packages using uv.
+COPY pyproject.toml uv.lock /app/
+WORKDIR /app
+RUN uv sync
+ENV PATH="/app/.venv/bin:$PATH"
+
+# Install Jupyter into /app/.venv with uv (uv-created venvs do not ship pip, so a bare `pip` would hit the system Python).
+RUN uv pip install --no-cache \
+    jupyterlab jupyterlab_vim jupytext
+
+# Copy project files.
+COPY . /app
+
+RUN mkdir -p /install
+
+# Config.
+COPY etc_sudoers /install/
+COPY etc_sudoers /etc/sudoers
+COPY bashrc /root/.bashrc
+
+# Report package versions (note: `| tee` masks the script's exit status).
+COPY version.sh /install/
+RUN /install/version.sh 2>&1 | tee version.log
+
+# Jupyter.
+EXPOSE 8888 diff --git a/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/README.md b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/README.md new file mode 100644 index 000000000..58d90e2d1 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/README.md @@ -0,0 +1,802 @@ +# Summary +This directory contains a Docker-based development environment template with: + +- Utility scripts for Docker operations (build, run, clean, push) +- Configuration files for Dockerfile and environment setup +- Jupyter notebook templates for standardized project development +- Shell utilities and Python helpers for container-based workflows + +This README is also a guide to setting up Docker-based projects from the template, +customizing them for your needs, and maintaining them over time. + +## Description of Files +- `bashrc` + - Bash configuration file enabling `vi` mode for command-line editing + +- `copy_docker_files.py` + - Python script for copying Docker configuration files to destination + directories + +- `docker_build.version.log` + - Log file containing Python, `pip`, Jupyter, and package version information + from Docker build + +- `docker_cmd.sh` + - Shell script for executing arbitrary commands inside Docker containers with + volume mounting + +- `docker_jupyter.sh` + - Shell script for launching Jupyter Lab server inside Docker containers + +- `docker_name.sh` + - Configuration file defining Docker repository and image naming variables + +- `Dockerfile` + - Docker image build configuration based on `python:3.12-slim`, with Jupyter + and project dependencies + +- `etc_sudoers` + - Sudoers configuration file granting passwordless sudo access for postgres + user + +- `README.md` + - Documentation file describing directory contents, files, and executable + scripts + +- `template_utils.py` + - Python utility functions supporting tutorial notebooks 
with data processing + and modeling helpers + +- `template.API.ipynb` + - Jupyter notebook template for API exploration and library usage examples + +- `template.example.ipynb` + - Jupyter notebook template for project examples and demonstrations + +- `utils.sh` + - Bash utility library with reusable functions for Docker operations + - Provides centralized argument parsing (`parse_default_args`) for `-h` and + `-v` flags used by all `docker_*.sh` scripts + - Provides Jupyter configuration logic: vim keybindings, notification + settings, and Docker run option builders + - All `docker_*.sh`, `docker_jupyter.sh`, and `run_jupyter.sh` scripts across + the repo source this file from `class_project/project_template/utils.sh` + +## Workflows +- All commands should be run from inside the project directory + ```bash + > cd tutorials/FilterPy + ``` + +- To build the container for a project + ```bash + > cd $PROJECT + # Build the container. + > docker_build.sh + # Build without cache (pass extra args after -v). + > docker_build.sh --no-cache + # Test the container. 
+ > docker_bash.sh ls + ``` + +- Enable verbose (trace) output with `-v` + ```bash + > docker_build.sh -v + > docker_bash.sh -v + ``` + +- Get help for any docker script + ```bash + > docker_build.sh -h + > docker_jupyter.sh -h + ``` + +- Start Jupyter + ```bash + > docker_jupyter.sh + # Go to localhost:8888 + ``` + +- Start Jupyter on a specific port with vim support + ```bash + > docker_jupyter.sh -p 8890 -u + # Go to localhost:8890 + ``` + +## How to Customize a Project Template +- Copy the template + ```bash + > cp -r class_project/project_template $TARGET + ``` + +## Description of Executables + +### `copy_docker_files.py` +- **What It Does** + - Copies Docker configuration and utility files from project_template to a + destination directory + - Preserves all file permissions and attributes during copying + - Creates destination directory if it doesn't exist + +- Copy all Docker files to a target directory: + ```bash + > ./copy_docker_files.py --dst_dir /path/to/destination + ``` + +- Copy with verbose logging: + ```bash + > ./copy_docker_files.py --dst_dir /path/to/destination -v DEBUG + ``` + +### `docker_bash.sh` +- **What It Does** + - Launches an interactive bash shell inside a Docker container + - Mounts the current working directory as `/data` inside the container + - Exposes port 8888 for potential services running in the container + - Accepts `-h` (help) and `-v` (verbose/trace) flags via `parse_default_args` + +- Launch bash shell in the container: + ```bash + > ./docker_bash.sh + ``` + +- Launch with verbose output (prints each command): + ```bash + > ./docker_bash.sh -v + ``` + +### `docker_build.sh` +- **What It Does** + - Builds Docker container images using Docker BuildKit + - Supports single-architecture builds (default) or multi-architecture builds + (`linux/arm64`, `linux/amd64`) + - Copies project files to temporary build directory and generates build logs + - Accepts `-h` (help) and `-v` (verbose/trace) flags; any extra arguments + after 
flags are forwarded to `docker build` + +- Build container image for current architecture: + ```bash + > ./docker_build.sh + ``` + +- Build without Docker layer cache: + ```bash + > ./docker_build.sh --no-cache + ``` + +- Build multi-architecture image (requires setting `DOCKER_BUILD_MULTI_ARCH=1` + in the script): + ```bash + > # Edit docker_build.sh to set DOCKER_BUILD_MULTI_ARCH=1 + > ./docker_build.sh + ``` + +### `docker_clean.sh` +- **What It Does** + +- Removes all Docker images matching the project's full image name +- Lists images before and after removal for verification +- Uses force removal to ensure cleanup completes + +- Remove project's Docker images: + ```bash + > ./docker_clean.sh + ``` + +### `docker_cmd.sh` +- **What It Does** + - Executes arbitrary commands inside a Docker container + - Mounts current directory as `/data` for accessing project files + - Automatically removes container after command execution completes + - Accepts `-h` (help) and `-v` (verbose/trace) flags; remaining arguments + form the command to execute + +- Run Python script inside container: + ```bash + > ./docker_cmd.sh python script.py --arg value + ``` + +- List files in the container: + ```bash + > ./docker_cmd.sh ls -la /data + ``` + +- Run tests inside container: + ```bash + > ./docker_cmd.sh pytest tests/ + ``` + +### `docker_exec.sh` +- **What It Does** + - Attaches to an already running Docker container with an interactive bash + shell + - Finds the container ID automatically based on the image name + - Useful for debugging or inspecting running containers + - Accepts `-h` (help) and `-v` (verbose/trace) flags via `parse_default_args` + +- Attach to running container: + ```bash + > ./docker_exec.sh + ``` + +### `docker_jupyter.sh` +- **What It Does** + - Launches Jupyter Lab server inside a Docker container + - Supports custom port configuration (default 8888), vim keybindings, and + custom directory mounting + - Runs `run_jupyter.sh` script inside the container 
with specified options + +- Start Jupyter on default port 8888: + ```bash + > ./docker_jupyter.sh + ``` + +- Start Jupyter on custom port with vim bindings: + ```bash + > ./docker_jupyter.sh -p 8889 -u + ``` + +- Start Jupyter with external directory mounted: + ```bash + > ./docker_jupyter.sh -d /path/to/notebooks -p 8889 + ``` + +- Start Jupyter in verbose mode: + ```bash + > ./docker_jupyter.sh -v -p 8890 + ``` + +### `docker_push.sh` +- **What It Does** + - Authenticates to Docker registry using credentials from + `~/.docker/passwd.$REPO_NAME.txt` + - Pushes the project's Docker image to the remote repository + - Lists images before pushing for verification + +- Push container image to registry: + ```bash + > ./docker_push.sh + ``` + +### `run_jupyter.sh` +- **What It Does** + - Launches Jupyter Lab server with no authentication (token and password + disabled) + - Binds to all network interfaces (0.0.0.0) on port 8888 + - Allows root access for container environments + - When `JUPYTER_USE_VIM=1`, verifies that `jupyterlab_vim` is installed + before enabling vim keybindings; exits with an error if not found + +- Start Jupyter Lab server (typically called from docker_jupyter.sh): + ```bash + > ./run_jupyter.sh + ``` + +- Start with vim keybindings (requires `jupyterlab_vim` installed in the + container): + ```bash + > JUPYTER_USE_VIM=1 ./run_jupyter.sh + ``` + +### `utils.sh` +- **What It Does** + - Central Bash library sourced by all `docker_*.sh` and `run_jupyter.sh` + scripts across the repository + - Provides `parse_default_args` which adds `-h` (help) and `-v` + (verbose/`set -x`) flags to every docker script + - Provides `build_container_image`, `push_container_image`, + `remove_container_image`, `kill_container`, `exec_container` utilities + - Provides Jupyter configuration helpers: vim keybindings, notification + suppression, and Docker run option builders + +### `version.sh` +- **What It Does** + - Reports version information for Python3, pip3, and 
Jupyter + - Lists all installed Python packages with versions + - Used during Docker image builds to log environment configuration + +- Display version information: + ```bash + > ./version.sh + ``` + +- Save version information to a log file: + ```bash + > ./version.sh 2>&1 | tee version.log + ``` + +# Template Customization and Maintenance + +## Quick Start for New Projects + +### Step 1: Copy the Template +```bash +> cd class_project/project_template +> cp -r . /path/to/your/new/project +> cd /path/to/your/new/project +``` + +### Step 2: Choose a Base Image +The template includes three Dockerfile options. Choose the one that best fits +your project: + +| Option | File | Best For | +| -------------------------- | ------------------------ | ---------------------------------------------------------------- | +| **Standard** | `Dockerfile.ubuntu` | Full Ubuntu environment with system tools | +| **Lightweight** | `Dockerfile.python_slim` | Minimal Python environment; reduced image size | +| **Modern Package Manager** | `Dockerfile.uv` | Fast dependency resolution with [uv](https://docs.astral.sh/uv/) | + +**How to choose:** + +- **Use Standard** if you need system-level tools (git, curl, graphviz, etc.) 
+- **Use Python Slim** to minimize image size and build time +- **Use uv** if you want faster, more reliable dependency management + +### Step 3: Set Up Your Dockerfile +- Create your working Dockerfile from the reference file you chose + ```bash + > cp Dockerfile.ubuntu Dockerfile + ``` + +- Delete the reference files you no longer need (only after the copy above) + ```bash + > rm Dockerfile.ubuntu Dockerfile.python_slim Dockerfile.uv + ``` + +- Add your dependencies + ```bash + > printf "numpy\npandas\nscikit-learn\n" > requirements.in + > pip-compile requirements.in > requirements.txt + ``` + +### Step 4: Keep Customization Minimal +- Only modify what's necessary for your project +- Use `requirements.txt` for all Python packages (don't edit Dockerfile for + this) +- Keep `bashrc` and `etc_sudoers` as-is unless you need custom shell setup +- Keep base image and Python version unless you have specific requirements + +## Understanding the Dockerfile Flow +Each Dockerfile follows the same structure. Here are the key stages: + +### Stage 1: Base Image and System Setup +```dockerfile +# Use ubuntu:24.04 or python:3.12-slim, depending on your requirement +FROM ubuntu:24.04 +ENV DEBIAN_FRONTEND=noninteractive +RUN apt-get -y update && apt-get -y upgrade +``` + +- **Purpose**: Start with a clean base image and disable interactive + installation prompts + +- **When to customize**: Only change the base image or version if your project + has specific requirements (different Ubuntu version, specific Python version, + etc.) 
+ +### Stage 2: System Utilities (Ubuntu-based Dockerfiles Only) +```dockerfile +RUN apt install -y --no-install-recommends \ + sudo \ + curl \ + systemctl \ + gnupg \ + git \ + vim +``` + +- **Purpose**: Install essential system tools for development and container + management + +- **When to customize**: Add only if needed for your project + - `postgresql-client`: for database connections + - `graphviz`: for graph visualizations + - `ffmpeg`: for media processing + +- **Best practice**: Use `--no-install-recommends` to keep the image small + +### Stage 3: Python and Build Tools (Ubuntu-based Dockerfiles Only) +```dockerfile +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + python3 \ + python3-pip \ + python3-dev \ + python3-venv \ + && rm -rf /var/lib/apt/lists/* +``` + +- **Purpose**: Install Python 3, pip, and build tools needed for compiled + packages + +- **Why venv**: Creates an isolated Python environment separate from system + Python + +- **When to customize**: Rarely. Only change if you need a specific Python + version (e.g., `python3.11` instead of `python3`) + +### Stage 4: Virtual Environment Setup +```dockerfile +RUN python3 -m venv /opt/venv +ENV PATH="/opt/venv/bin:$PATH" +RUN python -m pip install --upgrade pip +``` + +- **Purpose**: Create and activate an isolated virtual environment for your + project + +- **Why this matters**: Ensures reproducibility and prevents dependency + conflicts across projects + +- **When to customize**: Never. 
This is a standard best practice + +### Stage 5: Jupyter Installation +```dockerfile +RUN pip install jupyterlab jupyterlab_vim +``` + +- **Purpose**: Install JupyterLab and the Vim keybinding extension for + interactive development + - `jupyterlab`: the main IDE for running notebooks in the browser + - `jupyterlab_vim`: adds Vim-style navigation to notebook cells + +- **Why in Dockerfile, not requirements.txt**: These are infrastructure + packages (the IDE itself), not project-specific dependencies + - Do NOT add `jupyterlab`, `jupyterlab-vim`, or `ipywidgets` to + `requirements.txt`; they are already installed here + +- **When to customize**: + - **Remove** this line if your project doesn't use Jupyter + - **Add more extensions** if needed (e.g., `jupyterlab-git`, + `jupyterlab-variableinspector`) + +### Stage 6: Project Dependencies +```dockerfile +COPY requirements.txt /install/requirements.txt +RUN pip install --no-cache-dir -r /install/requirements.txt +``` + +- **Purpose**: Install your project-specific Python packages + +- **When to customize**: This is the primary place to customize. 
Define all your + dependencies in `requirements.txt` + +- **Best practice**: + - **Pin all versions**: `numpy==1.24.0` (not `numpy>=1.20.0`) + - **Use `--no-cache-dir`**: Reduces image size by skipping pip cache + - **For complex dependencies**: Use `requirements.in` with `pip-tools` or + `pip-compile` + +- **Example requirements.txt**: + ```text + numpy==1.24.0 + pandas==2.0.0 + scikit-learn==1.2.2 + tensorflow==2.13.0 + ``` + +### Stage 7: Configuration +```dockerfile +COPY etc_sudoers /etc/sudoers +COPY bashrc /root/.bashrc +``` + +- **Purpose**: Apply custom bash configuration and sudo permissions + +- **When to customize**: + - **Edit `bashrc`**: to add aliases, environment variables, or custom prompt + - **Edit `etc_sudoers`**: if additional users need passwordless sudo access + +### Stage 8: Version Logging +```dockerfile +ADD version.sh /install/ +RUN /install/version.sh 2>&1 | tee version.log +``` + +- **Purpose**: Document the exact versions of Python, pip, Jupyter, and all + installed packages + +- **What it logs**: + - Python 3 version + - Pip version + - Jupyter version + - Complete list of all installed Python packages + +- **Why it matters**: Creates a detailed record of your container's environment + for troubleshooting and reproducibility + +- **How to use**: After building, review `version.log` to verify all + dependencies installed correctly + ```bash + > docker build -t my-project . 
+ > cat version.log + ``` + +- **Extending it**: If you need to log additional tools (MongoDB, Node.js, + etc.), add them to `version.sh`: + ```bash + > echo "# mongo" + > mongod --version + ``` + +### Stage 9: Port Declaration +```dockerfile +EXPOSE 8888 +``` + +- **Purpose**: Declare that the container uses port 8888 (informational for + Docker) + +- **When to customize**: Add additional ports if your application needs them + (e.g., `EXPOSE 8888 5432 3000`) + +## Best Practices: Keep It Simple + +### The Core Principle +Only change what's necessary for your project. Everything else should inherit +from the template. + +This approach: + +- Makes Dockerfiles easier to understand and maintain +- Keeps images smaller and faster to build +- Simplifies future updates from the template +- Ensures consistency across similar projects + +### How to Do It Right +| What | Where | Example | +| :--------------------------- | :--------------------------- | :------------------------------ | +| Project Python packages | `requirements.txt` | `numpy==1.24.0` | +| Jupyter + Vim (always there) | Dockerfile Stage 5 | `jupyterlab jupyterlab_vim` | +| System tools | Dockerfile `apt-get` section | `postgresql-client` | +| Shell aliases | `bashrc` | `alias jlab="jupyter lab"` | +| Custom scripts | `scripts/` directory | Setup or initialization scripts | +| User permissions | `etc_sudoers` | Grant passwordless sudo | + +- **Do NOT add to `requirements.txt`**: `jupyterlab`, `jupyterlab-vim`, + `jupyterlab_vim`, or `ipywidgets` — these are Jupyter infrastructure packages + and are already installed in Stage 5 of the Dockerfile + +### Wrong Vs. 
Right Approach +- **Wrong**: Embed everything in the Dockerfile + ```dockerfile + RUN pip install my-package && python my_setup.py && npm install + ``` + +- **Right**: Use separate files and keep Dockerfile clean + ```dockerfile + COPY requirements.txt /install/ + RUN pip install -r /install/requirements.txt + COPY scripts/setup.sh /install/ + RUN /install/setup.sh + ``` + +## .Dockerignore Policy + +### Why It Matters +The `.dockerignore` file prevents unnecessary files from being added to the +Docker build context: + +- **Reduces build time**: Fewer files to transfer to Docker daemon +- **Reduces image size**: Only necessary files are included +- **Improves security**: Prevents leaking sensitive data + +### What to Exclude: Category Breakdown +- Python Artifacts (Always Exclude) + ```verbatim + __pycache__/ + *.pyc + *.pyo + *.pyd + ``` + - Why: Compiled bytecode generated at runtime. Regenerated in container, adds + bloat + +- Virtual Environments (Always Exclude) + ```verbatim + venv/ + .venv/ + env/ + .env/ + ``` + - Why: Local venvs aren't portable to containers. The Dockerfile creates its + own + +- Jupyter Checkpoints (Always Exclude) + ```verbatim + .ipynb_checkpoints/ + ``` + - Why: Auto-generated by Jupyter, not needed in the image + +- Git and Version Control (Always Exclude) + ```verbatim + .git/ + .gitignore + .gitattributes + ``` + - Why: Repository history not needed at runtime + +- Docker Build Scripts (Always Exclude) + ```verbatim + docker_build.sh + docker_push.sh + docker_clean.sh + docker_exec.sh + docker_cmd.sh + docker_bash.sh + docker_jupyter.sh + docker_name.sh + Dockerfile.* + ``` + - Why: Local development scripts don't run inside the container + +- Large Data Files (Recommended) + ```verbatim + data/ + *.csv + *.pkl + *.h5 + *.parquet + ``` + - Why: Don't ship large training and test data in the image. 
Mount via volume + instead + - Best practice: `bash > docker run -v /path/to/data:/data my-image ` + +- Test Files (Project-Dependent) + ```verbatim + tests/ + tutorials/ + ``` + - Why: Exclude if tests don't run in the container + - When to include: If CI and CD runs tests inside the container + +- Documentation (Recommended) + ```verbatim + README.md + docs/ + *.md + ``` + - Why: Not needed at runtime + - Exception: Only keep if your app reads these files at runtime + +- Generated Files (Always Exclude) + ```verbatim + *.log + *.tmp + *.cache + build/ + dist/ + ``` + - Why: Generated at runtime, not needed in the image + +## Workflow: From Template to Your Project + +### Complete Setup Checklist +- Copy the template + ```bash + > cp -r project_template my-new-project + > cd my-new-project + ``` + +- Keep all reference Dockerfiles + ```verbatim + Dockerfile.ubuntu_24_04 + Dockerfile.python_slim + Dockerfile.uv + ``` + +- Create your working Dockerfile + ```bash + > cp Dockerfile.ubuntu_24_04 Dockerfile + ``` + +- Add your dependencies + ```bash + > pip freeze > requirements.txt + ``` + +- Configure `.dockerignore`: Review the template `.dockerignore` and add your + project-specific exclusions (e.g., data directories) + +- Test the build + ```bash + > docker build -t my-project:latest . + > docker run -it my-project:latest bash + ``` + +- Test Jupyter (if using) + ```bash + > ./docker_jupyter.sh -p 8888 + ``` + +- Document customizations in your project README: + - Base image chosen and why + - Key dependencies + - Any Dockerfile modifications + - How to build and run + +## Maintaining Your Setup + +### Document Any Changes +- If you modify the Dockerfile, add explanatory comments: + ```dockerfile + # Custom: PostgreSQL client for database access + postgresql-client \ + + # Custom: Node.js for frontend builds + nodejs \ + ``` + +### Monitor Package Versions +- After each build, review `version.log`: + ```bash + > docker build -t my-project . 
+ > cat version.log + ``` + +### Keep `.dockerignore` Updated +- If you add new directories or files, update `.dockerignore`. Add to + `.dockerignore` if the directory shouldn't be in the image: + ```verbatim + data/ + cache/ + .temp/ + ``` + +### Contribute Improvements Back +When you improve your project's Docker setup: + +- Test thoroughly in your project +- Document the improvement clearly +- Submit back to `project_template` +- Other projects can adopt it when they update + +Example improvements: + +- Better way to install TensorFlow with GPU support +- Optimized `.dockerignore` for data science projects +- Security hardening (non-root user setup) + +## Troubleshooting + +### Build Is Slow +- Check `.dockerignore`: Ensure large directories (data/, .git/) are excluded +- Check Docker daemon: Verify Docker is running properly +- Check layer caching: Docker reuses cached layers; avoid changing early layers + +### Image Is Too Large +- Check layer sizes: + ```bash + > docker history my-project:latest + ``` + +- Remove unnecessary packages or use `python_slim` base image + +### Package Not Found Error +- Verify package name in PyPI (packages are case-sensitive) +- Check Python version compatibility +- Pin specific version if needed + +### Permission Issues in Container +- Check `etc_sudoers`: Ensure user has appropriate permissions +- Check file ownership: Ensure COPY doesn't create root-only files + +### Jupyter Won't Connect +- Run Jupyter + ```bash + > ./docker_jupyter.sh -p 8888 + ``` + +- Verify http://localhost:8888 (not https). 
Check firewall if remote access + needed + +### Vim Keybindings Not Working +- If `run_jupyter.sh` exits with `ERROR: jupyterlab_vim is not installed`, it + means `jupyterlab_vim` is missing from the container image +- Make sure `jupyterlab_vim` is installed in the Dockerfile: + ```dockerfile + RUN pip install jupyterlab jupyterlab_vim + ``` +- Rebuild the image after adding the package: + ```bash + > ./docker_build.sh + ``` diff --git a/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/bashrc b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/bashrc new file mode 100644 index 000000000..4b7ff4c49 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/bashrc @@ -0,0 +1 @@ +set -o vi diff --git a/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/copy_docker_files.py b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/copy_docker_files.py new file mode 100644 index 000000000..0e97c194c --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/copy_docker_files.py @@ -0,0 +1,140 @@ +#!/usr/bin/env python + +""" +Copy Docker-related files from the source directory to a destination directory. + +This script copies all Docker configuration and utility files from +class_project/project_template/ to a specified destination directory. + +Usage examples: + # Copy all files to a target directory. + > ./copy_docker_files.py --dst_dir /path/to/destination + + # Copy with verbose logging. 
+ > ./copy_docker_files.py --dst_dir /path/to/destination -v DEBUG + +Import as: + +import class_project.project_template.copy_docker_files as cpdccodo +""" + +import argparse +import logging +import os +from typing import List + +import helpers.hdbg as hdbg +import helpers.hio as hio +import helpers.hparser as hparser +import helpers.hsystem as hsystem + +_LOG = logging.getLogger(__name__) + +# ############################################################################# +# Constants +# ############################################################################# + +# List of files to copy from the source directory. +_FILES_TO_COPY = [ + "bashrc", + "docker_bash.sh", + "docker_build.sh", + "docker_clean.sh", + "docker_cmd.sh", + "docker_exec.sh", + "docker_jupyter.sh", + "docker_name.sh", + "docker_push.sh", + "etc_sudoers", + "install_jupyter_extensions.sh", + "run_jupyter.sh" + "version.sh", +] + + +# ############################################################################# +# Helper functions +# ############################################################################# + + +def _get_source_dir() -> str: + """ + Get the absolute path to the source directory containing Docker files. + + :return: absolute path to class_project/project_template/ + """ + # Get the directory where this script is located. + script_dir = os.path.dirname(os.path.abspath(__file__)) + _LOG.debug("Script directory='%s'", script_dir) + return script_dir + + +def _copy_files( + *, + src_dir: str, + dst_dir: str, + files: List[str], +) -> None: + """ + Copy specified files from source directory to destination directory. + + :param src_dir: source directory path + :param dst_dir: destination directory path + :param files: list of filenames to copy + """ + # Verify source directory exists. + hdbg.dassert_dir_exists(src_dir, "Source directory does not exist:", src_dir) + # Create destination directory if it doesn't exist. 
+ hio.create_dir(dst_dir, incremental=True) + _LOG.info("Copying %d files from '%s' to '%s'", len(files), src_dir, dst_dir) + # Copy each file. + copied_count = 0 + for filename in files: + src_path = os.path.join(src_dir, filename) + dst_path = os.path.join(dst_dir, filename) + # Verify source file exists. + hdbg.dassert_path_exists( + src_path, "Source file does not exist:", src_path + ) + # Copy the file using cp -a to preserve all permissions and attributes. + _LOG.debug("Copying '%s' -> '%s'", src_path, dst_path) + cmd = f"cp -a {src_path} {dst_path}" + hsystem.system(cmd) + copied_count += 1 + # + _LOG.info("Successfully copied %d files", copied_count) + + +# ############################################################################# + + +def _parse() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + "--dst_dir", + action="store", + required=True, + help="Destination directory where files will be copied", + ) + hparser.add_verbosity_arg(parser) + return parser + + +def _main(parser: argparse.ArgumentParser) -> None: + args = parser.parse_args() + hdbg.init_logger(verbosity=args.log_level, use_exec_path=True) + # Get source directory. + src_dir = _get_source_dir() + # Copy files to destination. 
+ _copy_files( + src_dir=src_dir, + dst_dir=args.dst_dir, + files=_FILES_TO_COPY, + ) + + +if __name__ == "__main__": + _main(_parse()) diff --git a/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/docker_bash.sh b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/docker_bash.sh new file mode 100644 index 000000000..0025e81f4 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/docker_bash.sh @@ -0,0 +1,34 @@ +#!/bin/bash +# """ +# This script launches a Docker container with an interactive bash shell for +# development. +# """ + +# Exit immediately if any command exits with a non-zero status. +set -e + +# Import the utility functions from the project template. +GIT_ROOT=$(git rev-parse --show-toplevel) +source $GIT_ROOT/class_project/project_template/utils.sh + +# Parse default args (-h, -v) and enable set -x if -v is passed. +parse_default_args "$@" + +# Load Docker configuration variables for this script. +get_docker_vars_script ${BASH_SOURCE[0]} +source $DOCKER_NAME +print_docker_vars + +# List the available Docker images matching the expected image name. +run "docker image ls $FULL_IMAGE_NAME" + +# Configure and run the Docker container with interactive bash shell. 
+# - Container is removed automatically on exit (--rm) +# - Interactive mode with TTY allocation (-ti) +# - Port forwarding for Jupyter or other services +# - Git root mounted to /git_root inside container +CONTAINER_NAME=${IMAGE_NAME}_bash +PORT= +DOCKER_CMD=$(get_docker_bash_command) +DOCKER_CMD_OPTS=$(get_docker_bash_options $CONTAINER_NAME $PORT) +run "$DOCKER_CMD $DOCKER_CMD_OPTS $FULL_IMAGE_NAME" diff --git a/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/docker_build.sh b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/docker_build.sh new file mode 100644 index 000000000..5b0957a99 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/docker_build.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# """ +# Build a Docker container image for the project. +# +# This script sets up the build environment with error handling and command +# tracing, loads Docker configuration from docker_name.sh, and builds the +# Docker image using the build_container_image utility function. It supports +# both single-architecture and multi-architecture builds via the +# DOCKER_BUILD_MULTI_ARCH environment variable. +# """ + +# Exit immediately if any command exits with a non-zero status. +set -e + +# Import the utility functions. +GIT_ROOT=$(git rev-parse --show-toplevel) +source $GIT_ROOT/class_project/project_template/utils.sh + +# Parse default args (-h, -v) and enable set -x if -v is passed. +# Shift processed option flags so remaining args are passed to the build. +parse_default_args "$@" +shift $((OPTIND-1)) + +# Load Docker configuration variables (REPO_NAME, IMAGE_NAME, FULL_IMAGE_NAME). +get_docker_vars_script ${BASH_SOURCE[0]} +source $DOCKER_NAME +print_docker_vars + +# Configure Docker build settings. +# Enable BuildKit for improved build performance and features. 
+export DOCKER_BUILDKIT=1 +#export DOCKER_BUILDKIT=0 + +# Configure single-architecture build (set to 1 for multi-arch build). +#export DOCKER_BUILD_MULTI_ARCH=1 +export DOCKER_BUILD_MULTI_ARCH=0 + +# Build the container image. +# Pass extra arguments (e.g., --no-cache) via command line after -v. +build_container_image "$@" diff --git a/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/docker_build.version.log b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/docker_build.version.log new file mode 100644 index 000000000..8315eefe2 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/docker_build.version.log @@ -0,0 +1 @@ +the input device is not a TTY diff --git a/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/docker_clean.sh b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/docker_clean.sh new file mode 100644 index 000000000..7e40839ae --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/docker_clean.sh @@ -0,0 +1,26 @@ +#!/bin/bash +# """ +# Remove Docker container image for the project. +# +# This script cleans up Docker images by removing the container image +# matching the project configuration. Useful for freeing disk space or +# ensuring a fresh build. +# """ + +# Exit immediately if any command exits with a non-zero status. +set -e + +# Import the utility functions. +GIT_ROOT=$(git rev-parse --show-toplevel) +source $GIT_ROOT/class_project/project_template/utils.sh + +# Parse default args (-h, -v) and enable set -x if -v is passed. +parse_default_args "$@" + +# Load Docker configuration variables for this script. 
+get_docker_vars_script ${BASH_SOURCE[0]} +source $DOCKER_NAME +print_docker_vars + +# Remove the container image. +remove_container_image diff --git a/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/docker_cmd.sh b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/docker_cmd.sh new file mode 100644 index 000000000..906d7a77b --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/docker_cmd.sh @@ -0,0 +1,41 @@ +#!/bin/bash +# """ +# Execute a command in a Docker container. +# +# This script runs a specified command inside a new Docker container instance. +# The container is removed automatically after the command completes. The +# git root is mounted to /git_root inside the container. +# """ + +# Exit immediately if any command exits with a non-zero status. +set -e + +# Import the utility functions. +GIT_ROOT=$(git rev-parse --show-toplevel) +source $GIT_ROOT/class_project/project_template/utils.sh + +# Parse default args (-h, -v) and enable set -x if -v is passed. +# Shift processed option flags so remaining args form the command. +parse_default_args "$@" +shift $((OPTIND-1)) + +# Capture the command to execute from remaining arguments. +CMD="$@" +echo "Executing: '$CMD'" + +# Load Docker configuration variables for this script. +get_docker_vars_script ${BASH_SOURCE[0]} +source $DOCKER_NAME +print_docker_vars + +# List available Docker images matching the expected image name. +run "docker image ls $FULL_IMAGE_NAME" +#(docker manifest inspect $FULL_IMAGE_NAME | grep arch) || true + +# Configure and run the Docker container with the specified command. 
+CONTAINER_NAME=$IMAGE_NAME +DOCKER_CMD=$(get_docker_cmd_command) +PORT="" +DOCKER_RUN_OPTS="" +DOCKER_CMD_OPTS=$(get_docker_bash_options $CONTAINER_NAME $PORT $DOCKER_RUN_OPTS) +run "$DOCKER_CMD $DOCKER_CMD_OPTS $FULL_IMAGE_NAME bash -c '$CMD'" diff --git a/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/docker_exec.sh b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/docker_exec.sh new file mode 100644 index 000000000..24f8e401a --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/docker_exec.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# """ +# Execute a bash shell in a running Docker container. +# +# This script connects to an already running Docker container and opens an +# interactive bash session for debugging or inspection purposes. +# """ + +# Exit immediately if any command exits with a non-zero status. +set -e + +# Import the utility functions. +GIT_ROOT=$(git rev-parse --show-toplevel) +source $GIT_ROOT/class_project/project_template/utils.sh + +# Parse default args (-h, -v) and enable set -x if -v is passed. +parse_default_args "$@" + +# Load Docker configuration variables for this script. +get_docker_vars_script ${BASH_SOURCE[0]} +source $DOCKER_NAME +print_docker_vars + +# Execute bash shell in the running container. +exec_container diff --git a/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/docker_jupyter.sh b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/docker_jupyter.sh new file mode 100644 index 000000000..1a60dfd3a --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/docker_jupyter.sh @@ -0,0 +1,39 @@ +#!/bin/bash +# """ +# Execute Jupyter Lab in a Docker container. 
+# +# This script launches a Docker container running Jupyter Lab with +# configurable port, directory mounting, and vim bindings. It passes +# command-line options to the run_jupyter.sh script inside the container. +# +# Usage: +# > docker_jupyter.sh [options] +# """ + +# Exit immediately if any command exits with a non-zero status. +set -e + +# Import the utility functions. +GIT_ROOT=$(git rev-parse --show-toplevel) +source $GIT_ROOT/class_project/project_template/utils.sh + +# Parse command-line options and set Jupyter configuration variables. +parse_docker_jupyter_args "$@" + +# Load Docker configuration variables for this script. +get_docker_vars_script ${BASH_SOURCE[0]} +source $DOCKER_NAME +print_docker_vars + +# List available Docker images and inspect architecture. +list_and_inspect_docker_image + +# Run the Docker container with Jupyter Lab. +CMD=$(get_run_jupyter_cmd "${BASH_SOURCE[0]}" "$OLD_CMD_OPTS") +CONTAINER_NAME=$IMAGE_NAME +# Kill existing container if -f flag is set. +kill_existing_container_if_forced + +DOCKER_CMD=$(get_docker_jupyter_command) +DOCKER_CMD_OPTS=$(get_docker_jupyter_options $CONTAINER_NAME $JUPYTER_HOST_PORT $JUPYTER_USE_VIM) +run "$DOCKER_CMD $DOCKER_CMD_OPTS $FULL_IMAGE_NAME $CMD" diff --git a/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/docker_name.sh b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/docker_name.sh new file mode 100644 index 000000000..32a546cf3 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/docker_name.sh @@ -0,0 +1,12 @@ +#!/bin/bash +# """ +# Docker image naming configuration. +# +# This file defines the repository name, image name, and full image name +# variables used by all docker_*.sh scripts in the project template. +# """ + +REPO_NAME=gpsaggese +# The file should be all lower case. 
+IMAGE_NAME=umd_project_template +FULL_IMAGE_NAME=$REPO_NAME/$IMAGE_NAME diff --git a/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/docker_push.sh b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/docker_push.sh new file mode 100644 index 000000000..27d752dd9 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/docker_push.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# """ +# Push Docker container image to Docker Hub or registry. +# +# This script authenticates with the Docker registry using credentials from +# ~/.docker/passwd.$REPO_NAME.txt and pushes the locally built container +# image to the remote repository. +# """ + +# Exit immediately if any command exits with a non-zero status. +set -e + +# Import the utility functions. +GIT_ROOT=$(git rev-parse --show-toplevel) +source $GIT_ROOT/class_project/project_template/utils.sh + +# Parse default args (-h, -v) and enable set -x if -v is passed. +parse_default_args "$@" + +# Load Docker image naming configuration. +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source $SCRIPT_DIR/docker_name.sh + +# Push the container image to the registry. +push_container_image diff --git a/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/etc_sudoers b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/etc_sudoers new file mode 100644 index 000000000..ee0816a15 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/etc_sudoers @@ -0,0 +1,31 @@ +# +# This file MUST be edited with the 'visudo' command as root. +# +# Please consider adding local content in /etc/sudoers.d/ instead of +# directly modifying this file. +# +# See the man page for details on how to write a sudoers file. 
+# +Defaults env_reset +Defaults mail_badpass +Defaults secure_path="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/snap/bin" + +# Host alias specification + +# User alias specification + +# Cmnd alias specification + +# User privilege specification +root ALL=(ALL:ALL) ALL + +# Members of the admin group may gain root privileges +%admin ALL=(ALL) ALL + +# Allow members of group sudo to execute any command +%sudo ALL=(ALL:ALL) ALL + +# See sudoers(5) for more information on "#include" directives: +postgres ALL=(ALL) NOPASSWD:ALL + +#includedir /etc/sudoers.d diff --git a/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/requirements.txt b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/requirements.txt new file mode 100644 index 000000000..49aca3901 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/requirements.txt @@ -0,0 +1,4 @@ +matplotlib +numpy +pandas +seaborn diff --git a/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/run_jupyter.sh b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/run_jupyter.sh new file mode 100644 index 000000000..d725c3fe7 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/run_jupyter.sh @@ -0,0 +1,35 @@ +#!/bin/bash +# """ +# Launch Jupyter Lab server. 
+# +# This script starts Jupyter Lab on port 8888 with the following configuration: +# - No browser auto-launch (useful for Docker containers) +# - Accessible from any IP address (0.0.0.0) +# - Root user allowed (required for Docker environments) +# - No authentication token or password (for development convenience) +# - Vim keybindings can be enabled via JUPYTER_USE_VIM environment variable +# """ + +# Exit immediately if any command exits with a non-zero status. +set -e + +# Print each command to stdout before executing it. +#set -x + +# Import the utility functions from /git_root. +GIT_ROOT=/git_root +source $GIT_ROOT/class_project/project_template/utils.sh + +# Load Docker configuration variables for this script. +get_docker_vars_script ${BASH_SOURCE[0]} +source $DOCKER_NAME +print_docker_vars + +# Setup Jupyter Lab environment. +setup_jupyter_environment + +# Initialize Jupyter Lab command with base configuration. +JUPYTER_ARGS=$(get_jupyter_args) + +# Start Jupyter Lab with development-friendly settings. +run "jupyter lab $JUPYTER_ARGS" diff --git a/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/template.API.ipynb b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/template.API.ipynb new file mode 100644 index 000000000..3afca937c --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/template.API.ipynb @@ -0,0 +1,215 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "183c2248-ea3d-43ba-b87e-d821bba1bbc6", + "metadata": {}, + "source": [ + "# Template API Notebook\n", + "\n", + "This is a template notebook. The first heading should be the title of what notebook is about. For example, if it is a neo4j tutorial the heading should be `Neo4j API`.\n", + "\n", + "- Add description of what the notebook does.\n", + "- Point to references, e.g. 
(neo4j.API.md)\n", + "- Add citations.\n", + "- Keep the notebook flow clear.\n", + "- Comments should be imperative and have a period at the end.\n", + "- Your code should be well commented.\n", + "\n", + "The name of this notebook should be in the following format:\n", + "- if the notebook is exploring `pycaret API`, then it is `pycaret.API.ipynb`\n", + "\n", + "Follow the reference to write notebooks in a clear manner: https://github.com/causify-ai/helpers/blob/master/docs/coding/all.jupyter_notebook.how_to_guide.md" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "265e0d58-a7cd-4edf-a0b4-96b60220e801", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "id": "d3b2f997-5c9b-4238-b6d5-e5f2cea43809", + "metadata": {}, + "source": [ + "## Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "d1480ee9-d6a6-437d-b927-da6cbb05bdf5", + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "# Import libraries in this section.\n", + "# Avoid imports like import *, from ... import ..., from ...
import *, etc.\n", + "\n", + "import helpers.hdbg as hdbg\n", + "import helpers.hnotebook as hnotebo" + ] + }, + { + "cell_type": "markdown", + "id": "f9208cc9-837d-4fec-a312-9c4aa5b7648d", + "metadata": {}, + "source": [ + "## Configuration" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "9a2d7a9c-c6c5-48c9-8445-11c97045d00b", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[0mWARNING: Running in Jupyter\n", + "INFO > cmd='/venv/lib/python3.12/site-packages/ipykernel_launcher.py -f /home/.local/share/jupyter/runtime/kernel-085a2ce7-6161-4c8a-92d5-492051832f3c.json'\n" + ] + } + ], + "source": [ + "hdbg.init_logger(verbosity=logging.INFO)\n", + "\n", + "_LOG = logging.getLogger(__name__)\n", + "\n", + "hnotebo.config_notebook()" + ] + }, + { + "cell_type": "markdown", + "id": "79c37ba3-bd5d-4a44-87df-645eee54977a", + "metadata": { + "lines_to_next_cell": 2 + }, + "source": [ + "## Make the notebook flow clear\n", + "Each notebook needs to follow a clear and logical flow, e.g:\n", + "- Load data\n", + "- Compute stats\n", + "- Clean data\n", + "- Compute stats\n", + "- Do analysis\n", + "- Show results\n", + "\n", + "\n", + "\n", + "\n", + "#############################################################################\n", + "Template\n", + "#############################################################################" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "a8a109cd-fc8e-4b9e-9dc0-4fc8d4126ad8", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "class Template:\n", + " \"\"\"\n", + " Brief imperative description of what the class does in one line, if needed.\n", + " \"\"\"\n", + "\n", + " def __init__(self):\n", + " pass\n", + "\n", + " def method1(self, arg1: int) -> None:\n", + " \"\"\"\n", + " Brief imperative description of what the method does in one line.\n", + "\n", + " You can elaborate more in 
the method docstring in this section, for e.g. explaining\n", + " the formula/algorithm. Every method/function should have a docstring, typehints and include the\n", + " parameters and return as follows:\n", + "\n", + " :param arg1: description of arg1\n", + " :return: description of return\n", + " \"\"\"\n", + " # Code blocks go here.\n", + " # Make sure to include comments to explain what the code is doing.\n", + " # No empty lines between code blocks.\n", + " pass\n", + "\n", + "\n", + "def template_function(arg1: int) -> None:\n", + " \"\"\"\n", + " Brief imperative description of what the function does in one line.\n", + "\n", + " You can elaborate more in the function docstring in this section, for e.g. explaining\n", + " the formula/algorithm. Every function should have a docstring, typehints and include the\n", + " parameters and return as follows:\n", + "\n", + " :param arg1: description of arg1\n", + " :return: description of return\n", + " \"\"\"\n", + " # Code blocks go here.\n", + " # Make sure to include comments to explain what the code is doing.\n", + " # No empty lines between code blocks.\n", + " pass" + ] + }, + { + "cell_type": "markdown", + "id": "00926523-ae59-497d-bba8-b22e58333849", + "metadata": {}, + "source": [ + "## The flow should be highlighted using headings in markdown\n", + "```\n", + "# Level 1\n", + "## Level 2\n", + "### Level 3\n", + "```" + ] + } + ], + "metadata": { + "jupytext": { + "formats": "ipynb,py:percent" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git
a/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/template.API.py b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/template.API.py new file mode 100644 index 000000000..4192ef8fe --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/template.API.py @@ -0,0 +1,129 @@ +# --- +# jupyter: +# jupytext: +# formats: ipynb,py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.19.0 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# %% [markdown] +# # Template API Notebook +# +# This is a template notebook. The first heading should be the title of what notebook is about. For example, if it is a neo4j tutorial the heading should be `Neo4j API`. +# +# - Add description of what the notebook does. +# - Point to references, e.g. (neo4j.API.md) +# - Add citations. +# - Keep the notebook flow clear. +# - Comments should be imperative and have a period at the end. +# - Your code should be well commented. +# +# The name of this notebook should be in the following format: +# - if the notebook is exploring `pycaret API`, then it is `pycaret.API.ipynb` +# +# Follow the reference to write notebooks in a clear manner: https://github.com/causify-ai/helpers/blob/master/docs/coding/all.jupyter_notebook.how_to_guide.md + +# %% +# %load_ext autoreload +# %autoreload 2 +# %matplotlib inline + +# %% [markdown] +# ## Imports + +# %% +import logging +# Import libraries in this section. +# Avoid imports like import *, from ... import ..., from ... import *, etc.
+ +import helpers.hdbg as hdbg +import helpers.hnotebook as hnotebo + +# %% [markdown] +# ## Configuration + +# %% +hdbg.init_logger(verbosity=logging.INFO) + +_LOG = logging.getLogger(__name__) + +hnotebo.config_notebook() + + +# %% [markdown] +# ## Make the notebook flow clear +# Each notebook needs to follow a clear and logical flow, e.g: +# - Load data +# - Compute stats +# - Clean data +# - Compute stats +# - Do analysis +# - Show results +# +# +# +# + + +# ############################################################################# +# Template +# ############################################################################# + + +# %% +class Template: + """ + Brief imperative description of what the class does in one line, if needed. + """ + + def __init__(self): + pass + + def method1(self, arg1: int) -> None: + """ + Brief imperative description of what the method does in one line. + + You can elaborate more in the method docstring in this section, for e.g. explaining + the formula/algorithm. Every method/function should have a docstring, typehints and include the + parameters and return as follows: + + :param arg1: description of arg1 + :return: description of return + """ + # Code blocks go here. + # Make sure to include comments to explain what the code is doing. + # No empty lines between code blocks. + pass + + +def template_function(arg1: int) -> None: + """ + Brief imperative description of what the function does in one line. + + You can elaborate more in the function docstring in this section, for e.g. explaining + the formula/algorithm. Every function should have a docstring, typehints and include the + parameters and return as follows: + + :param arg1: description of arg1 + :return: description of return + """ + # Code blocks go here. + # Make sure to include comments to explain what the code is doing. + # No empty lines between code blocks.
+ pass + + +# %% [markdown] +# ## The flow should be highlighted using headings in markdown +# ``` +# # Level 1 +# ## Level 2 +# ### Level 3 +# ``` diff --git a/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/template.example.ipynb b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/template.example.ipynb new file mode 100644 index 000000000..a2e9aedd7 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/template.example.ipynb @@ -0,0 +1,198 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "50f78f7e-2dee-45d6-9d37-7a55eeaae283", + "metadata": {}, + "source": [ + "# Template Example Notebook\n", + "\n", + "This is a template notebook. The first heading should be the title of what notebook is about. For example, if it is a project on neo4j tutorial the heading should be `Project Title`.\n", + "\n", + "- Add description of what the notebook does.\n", + "- Point to references, e.g. 
(neo4j.example.md)\n", + "- Add citations.\n", + "- Keep the notebook flow clear.\n", + "- Comments should be imperative and have a period at the end.\n", + "- Your code should be well commented.\n", + "\n", + "The name of this notebook should be in the following format:\n", + "- if the notebook is exploring `pycaret API`, then it is `pycaret.example.ipynb`\n", + "\n", + "Follow the reference to write notebooks in a clear manner: https://github.com/causify-ai/helpers/blob/master/docs/coding/all.jupyter_notebook.how_to_guide.md" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "6226667e-cab5-479c-be6a-6b7d6f580a97", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "8020901a-4bc7-4b73-95e8-aaa462b4fc19", + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "# Import libraries in this section.\n", + "# Avoid imports like import *, from ... import ..., from ...
import *, etc.\n", + "\n", + "import helpers.hdbg as hdbg\n", + "import helpers.hnotebook as hnotebo" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "4ecb72b2-b21d-4fb0-ac92-e7174da390e6", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[0mWARNING: Running in Jupyter\n", + "INFO > cmd='/venv/lib/python3.12/site-packages/ipykernel_launcher.py -f /home/.local/share/jupyter/runtime/kernel-783e0930-1631-4d64-8bb4-f3a98bb74fcd.json'\n" + ] + } + ], + "source": [ + "hdbg.init_logger(verbosity=logging.INFO)\n", + "\n", + "_LOG = logging.getLogger(__name__)\n", + "\n", + "hnotebo.config_notebook()" + ] + }, + { + "cell_type": "markdown", + "id": "1ede6422-bff2-4f0a-8d28-29a01d4786b2", + "metadata": { + "lines_to_next_cell": 2 + }, + "source": [ + "## Make the notebook flow clear\n", + "Each notebook needs to follow a clear and logical flow, e.g:\n", + "- Load data\n", + "- Compute stats\n", + "- Clean data\n", + "- Compute stats\n", + "- Do analysis\n", + "- Show results\n", + "\n", + "\n", + "\n", + "\n", + "#############################################################################\n", + "Template\n", + "#############################################################################" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "8bbd660d-d22f-44fa-bf53-dd622dee0f53", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "class Template:\n", + " \"\"\"\n", + " Brief imperative description of what the class does in one line, if needed.\n", + " \"\"\"\n", + "\n", + " def __init__(self):\n", + " pass\n", + "\n", + " def method1(self, arg1: int) -> None:\n", + " \"\"\"\n", + " Brief imperative description of what the method does in one line.\n", + "\n", + " You can elaborate more in the method docstring in this section, for e.g. explaining\n", + " the formula/algorithm. 
Every method/function should have a docstring, typehints and include the\n", + " parameters and return as follows:\n", + "\n", + " :param arg1: description of arg1\n", + " :return: description of return\n", + " \"\"\"\n", + " # Code blocks go here.\n", + " # Make sure to include comments to explain what the code is doing.\n", + " # No empty lines between code blocks.\n", + " pass\n", + "\n", + "\n", + "def template_function(arg1: int) -> None:\n", + " \"\"\"\n", + " Brief imperative description of what the function does in one line.\n", + "\n", + " You can elaborate more in the function docstring in this section, for e.g. explaining\n", + " the formula/algorithm. Every function should have a docstring, typehints and include the\n", + " parameters and return as follows:\n", + "\n", + " :param arg1: description of arg1\n", + " :return: description of return\n", + " \"\"\"\n", + " # Code blocks go here.\n", + " # Make sure to include comments to explain what the code is doing.\n", + " # No empty lines between code blocks.\n", + " pass" + ] + }, + { + "cell_type": "markdown", + "id": "103f6e36-54cf-442c-b137-8091d48805a7", + "metadata": {}, + "source": [ + "## The flow should be highlighted using headings in markdown\n", + "```\n", + "# Level 1\n", + "## Level 2\n", + "### Level 3\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d05d52af-67ba-4a4f-a561-af453e43854f", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "jupytext": { + "formats": "ipynb,py:percent" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git 
a/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/template.example.py b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/template.example.py new file mode 100644 index 000000000..8566ff277 --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/template.example.py @@ -0,0 +1,125 @@ +# --- +# jupyter: +# jupytext: +# formats: ipynb,py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.19.0 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# %% [markdown] +# # Template Example Notebook +# +# This is a template notebook. The first heading should be the title of what notebook is about. For example, if it is a project on neo4j tutorial the heading should be `Project Title`. +# +# - Add description of what the notebook does. +# - Point to references, e.g. (neo4j.example.md) +# - Add citations. +# - Keep the notebook flow clear. +# - Comments should be imperative and have a period at the end. +# - Your code should be well commented. +# +# The name of this notebook should be in the following format: +# - if the notebook is exploring `pycaret API`, then it is `pycaret.example.ipynb` +# +# Follow the reference to write notebooks in a clear manner: https://github.com/causify-ai/helpers/blob/master/docs/coding/all.jupyter_notebook.how_to_guide.md + +# %% +# %load_ext autoreload +# %autoreload 2 +# %matplotlib inline + +# %% +import logging +# Import libraries in this section. +# Avoid imports like import *, from ... import ..., from ... import *, etc. 
+ +import helpers.hdbg as hdbg +import helpers.hnotebook as hnotebo + +# %% +hdbg.init_logger(verbosity=logging.INFO) + +_LOG = logging.getLogger(__name__) + +hnotebo.config_notebook() + + +# %% [markdown] +# ## Make the notebook flow clear +# Each notebook needs to follow a clear and logical flow, e.g: +# - Load data +# - Compute stats +# - Clean data +# - Compute stats +# - Do analysis +# - Show results +# +# +# +# + + +# ############################################################################# +# Template +# ############################################################################# + + +# %% +class Template: + """ + Brief imperative description of what the class does in one line, if needed. + """ + + def __init__(self): + pass + + def method1(self, arg1: int) -> None: + """ + Brief imperative description of what the method does in one line. + + You can elaborate more in the method docstring in this section, for e.g. explaining + the formula/algorithm. Every method/function should have a docstring, typehints and include the + parameters and return as follows: + + :param arg1: description of arg1 + :return: description of return + """ + # Code blocks go here. + # Make sure to include comments to explain what the code is doing. + # No empty lines between code blocks. + pass + + +def template_function(arg1: int) -> None: + """ + Brief imperative description of what the function does in one line. + + You can elaborate more in the function docstring in this section, for e.g. explaining + the formula/algorithm. Every function should have a docstring, typehints and include the + parameters and return as follows: + + :param arg1: description of arg1 + :return: description of return + """ + # Code blocks go here. + # Make sure to include comments to explain what the code is doing. + # No empty lines between code blocks. 
+ pass + + +# %% [markdown] +# ## The flow should be highlighted using headings in markdown +# ``` +# # Level 1 +# ## Level 2 +# ### Level 3 +# ``` + +# %% diff --git a/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/template_utils.py b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/template_utils.py new file mode 100644 index 000000000..f8916102e --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/template_utils.py @@ -0,0 +1,72 @@ +""" +template_utils.py + +This file contains utility functions that support the tutorial notebooks. + +- Notebooks should call these functions instead of writing raw logic inline. +- This helps keep the notebooks clean, modular, and easier to debug. +- Students should implement functions here for data preprocessing, + model setup, evaluation, or any reusable logic. + +Import as: + +import class_project.project_template.template_utils as cpptteut +""" + +import pandas as pd +import logging +from sklearn.model_selection import train_test_split +from pycaret.classification import compare_models + +# ----------------------------------------------------------------------------- +# Logging +# ----------------------------------------------------------------------------- + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +# ----------------------------------------------------------------------------- +# Example 1: Split the dataset into train and test sets +# ----------------------------------------------------------------------------- + + +def split_data(df: pd.DataFrame, target_column: str, test_size: float = 0.2): + """ + Split the dataset into training and testing sets. 
+ + :param df: full dataset + :param target_column: name of the target column + :param test_size: proportion of test data (default = 0.2) + + :return: X_train, X_test, y_train, y_test + """ + logger.info("Splitting data into train and test sets") + X = df.drop(columns=[target_column]) + y = df[target_column] + return train_test_split(X, y, test_size=test_size, random_state=42) + + +# ----------------------------------------------------------------------------- +# Example 2: PyCaret classification pipeline +# ----------------------------------------------------------------------------- + + +def run_pycaret_classification( + df: pd.DataFrame, target_column: str +) -> pd.DataFrame: + """ + Run a basic PyCaret classification experiment. + + :param df: dataset containing features and target + :param target_column: name of the target column + + :return: comparison of top-performing models + """ + logger.info("Initializing PyCaret classification setup") + ... + + logger.info("Comparing models") + results = compare_models() + ... + + return results diff --git a/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/test/test_docker_all.py b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/test/test_docker_all.py new file mode 100644 index 000000000..904cdd7af --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/test/test_docker_all.py @@ -0,0 +1,48 @@ +""" +Run each notebook in class_project/project_template/ inside Docker using docker_cmd.sh. 
+ +Import as: + +import class_project.project_template.test.test_docker_all as tptdal +""" + +import logging + +import pytest + +import helpers.hdocker_tests as hdoctest + +_LOG = logging.getLogger(__name__) + + +# ############################################################################# +# Test_docker +# ############################################################################# + + +class Test_docker(hdoctest.DockerTestCase): + """ + Run all Docker tests for class_project/project_template/. + """ + + _test_file = __file__ + + @pytest.mark.slow + def test1(self) -> None: + """ + Test that template.example.ipynb runs without error inside Docker. + """ + # Prepare inputs. + notebook_name = "template.example.ipynb" + # Run test. + self._helper(notebook_name) + + @pytest.mark.slow + def test2(self) -> None: + """ + Test that template.API.ipynb runs without error inside Docker. + """ + # Prepare inputs. + notebook_name = "template.API.ipynb" + # Run test. + self._helper(notebook_name) diff --git a/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/utils.sh b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/utils.sh new file mode 100644 index 000000000..cc0ed8c4a --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/utils.sh @@ -0,0 +1,607 @@ +#!/bin/bash +# """ +# Utility functions for Docker container management. +# """ + + +# ############################################################################# +# General utilities +# ############################################################################# + + +run() { + # """ + # Execute a command with echo output. 
+ # + # :param cmd: Command string to execute + # :return: Exit status of the executed command + # """ + cmd="$*" + echo "> $cmd" + eval "$cmd" +} + + +enable_verbose_mode() { + # """ + # Enable shell command tracing (set -x) when VERBOSE is set to 1. + # + # Reads the VERBOSE variable set by parse_docker_jupyter_args. + # Call this after parsing args to activate tracing for the rest of the script. + # """ + if [[ $VERBOSE == 1 ]]; then + set -x + fi +} + + +# ############################################################################# +# Argument parsing +# ############################################################################# + + +_print_default_help() { + # """ + # Print usage information and available default options for docker scripts. + # """ + echo "Usage: $(basename $0) [options]" + echo "" + echo "Options:" + echo " -f Force kill existing container with same name before starting" + echo " -h Print this help message and exit" + echo " -v Enable verbose output (set -x)" +} + + +parse_default_args() { + # """ + # Parse default command-line arguments for docker scripts. + # + # Sets VERBOSE and FORCE variables in the caller's scope. Enables set -x + # when -v is passed. Prints help and exits when -h is passed. + # Updates OPTIND so the caller can shift away processed arguments. + # + # :param @: command-line arguments forwarded from the calling script + # """ + VERBOSE=0 + FORCE=0 + while getopts "fhv" flag; do + case "${flag}" in + f) FORCE=1;; + h) _print_default_help; exit 0;; + v) VERBOSE=1;; + *) _print_default_help; exit 1;; + esac + done + enable_verbose_mode +} + + +_print_docker_jupyter_help() { + # """ + # Print usage information and available options for docker_jupyter.sh. + # """ + echo "Usage: $(basename $0) [options]" + echo "" + echo "Launch Jupyter Lab inside a Docker container." 
+ echo "" + echo "Options:" + echo " -f Force kill existing container with same name before starting" + echo " -h Print this help message and exit" + echo " -p PORT Host port to forward to Jupyter Lab (default: 8888)" + echo " -u Enable vim keybindings in Jupyter Lab" + echo " -v Enable verbose output (set -x)" +} + + +parse_docker_jupyter_args() { + # """ + # Parse command-line arguments for docker_jupyter.sh. + # + # Sets JUPYTER_HOST_PORT, JUPYTER_USE_VIM, TARGET_DIR, VERBOSE, FORCE, and + # OLD_CMD_OPTS in the caller's scope. Enables set -x when -v is passed. + # Prints help and exits when -h is passed. + # + # :param @: command-line arguments forwarded from the calling script + # """ + # Set defaults. + JUPYTER_HOST_PORT=8888 + JUPYTER_USE_VIM=0 + VERBOSE=0 + FORCE=0 + # Save original args to pass through to run_jupyter.sh. + OLD_CMD_OPTS="$*" + # Parse options. + while getopts "fhp:uv" flag; do + case "${flag}" in + f) FORCE=1;; + h) _print_docker_jupyter_help; exit 0;; + p) JUPYTER_HOST_PORT=${OPTARG};; # Port for Jupyter Lab. + u) JUPYTER_USE_VIM=1;; # Enable vim bindings. + v) VERBOSE=1;; # Enable verbose output. + *) _print_docker_jupyter_help; exit 1;; + esac + done + # Enable command tracing if verbose mode is requested. + enable_verbose_mode +} + + +# ############################################################################# +# Docker image management +# ############################################################################# + + +get_docker_vars_script() { + # """ + # Load Docker variables from docker_name.sh script. + # + # :param script_path: Path to the script to determine the Docker configuration directory + # :return: Sources REPO_NAME, IMAGE_NAME, and FULL_IMAGE_NAME variables + # """ + local script_path=$1 + # Find the name of the container. + SCRIPT_DIR=$(dirname $script_path) + DOCKER_NAME="$SCRIPT_DIR/docker_name.sh" + if [[ ! 
-e $DOCKER_NAME ]]; then
+        # The file that is about to be sourced is missing: report it and
+        # abort with a valid (0-255) non-zero status.
+        echo "Can't find $DOCKER_NAME"
+        exit 1
+    fi;
+    source "$DOCKER_NAME"
+}
+
+
+print_docker_vars() {
+    # """
+    # Print current Docker variables to stdout.
+    # """
+    echo "REPO_NAME=$REPO_NAME"
+    echo "IMAGE_NAME=$IMAGE_NAME"
+    echo "FULL_IMAGE_NAME=$FULL_IMAGE_NAME"
+}
+
+
+build_container_image() {
+    # """
+    # Build a Docker container image.
+    #
+    # Supports both single-architecture and multi-architecture builds.
+    # Creates temporary build directory, copies files, and builds the image.
+    # Returns the build's non-zero status as soon as the build step fails.
+    #
+    # :param @: Additional options to pass to docker build/buildx build
+    # """
+    echo "# ${FUNCNAME[0]} ..."
+    FULL_IMAGE_NAME=$REPO_NAME/$IMAGE_NAME
+    echo "FULL_IMAGE_NAME=$FULL_IMAGE_NAME"
+    # Prepare build area.
+    #tar -czh . | docker build $OPTS -t $IMAGE_NAME -
+    DIR="../tmp.build"
+    if [[ -d $DIR ]]; then
+        rm -rf $DIR
+    fi;
+    # Copy the build context following symlinks; failures are tolerated on
+    # purpose (best effort), e.g. for dangling links.
+    cp -Lr . $DIR || true
+    # Build container.
+    echo "DOCKER_BUILDKIT=$DOCKER_BUILDKIT"
+    echo "DOCKER_BUILD_MULTI_ARCH=$DOCKER_BUILD_MULTI_ARCH"
+    if [[ $DOCKER_BUILD_MULTI_ARCH != 1 ]]; then
+        # Build for a single architecture.
+        echo "Building for current architecture..."
+        # Use $* (not $@) when flattening the arguments into one string.
+        OPTS="--progress plain $*"
+        # The subshell exits with docker's status (not tee's); propagate a
+        # failure to the caller instead of carrying on.
+        (cd $DIR; docker build $OPTS -t $FULL_IMAGE_NAME . 2>&1 | tee ../docker_build.log; exit ${PIPESTATUS[0]}) || return $?
+    else
+        # Build for multiple architectures.
+        echo "Building for multiple architectures..."
+        OPTS="$*"
+        export DOCKER_CLI_EXPERIMENTAL=enabled
+        # Create a new builder.
+        #docker buildx rm --all-inactive --force
+        #docker buildx create --name mybuilder
+        #docker buildx use mybuilder
+        # Use the default builder.
+        docker buildx use multiarch
+        docker buildx inspect --bootstrap
+        # Note that one needs to push to the repo since otherwise it is not
+        # possible to keep multiple.
+        (cd $DIR; docker buildx build --push --platform linux/arm64,linux/amd64 $OPTS --tag $FULL_IMAGE_NAME . 2>&1 | tee ../docker_build.log; exit ${PIPESTATUS[0]}) || return $?
+        # Report the status.
+ docker buildx imagetools inspect $FULL_IMAGE_NAME + fi; + # Report build version. + if [ -f docker_build.version.log ]; then + rm docker_build.version.log + fi + (cd $DIR; docker run --rm -it -v $(pwd):/data $FULL_IMAGE_NAME bash -c "/data/version.sh") 2>&1 | tee docker_build.version.log + # + docker image ls $REPO_NAME/$IMAGE_NAME + rm -rf $DIR + echo "*****************************" + echo "SUCCESS" + echo "*****************************" +} + + +remove_container_image() { + # """ + # Remove Docker container image(s) matching the current configuration. + # """ + echo "# ${FUNCNAME[0]} ..." + FULL_IMAGE_NAME=$REPO_NAME/$IMAGE_NAME + echo "FULL_IMAGE_NAME=$FULL_IMAGE_NAME" + docker image ls | grep $FULL_IMAGE_NAME + docker image ls | grep $FULL_IMAGE_NAME | awk '{print $1}' | xargs -n 1 -t docker image rm -f + docker image ls + echo "${FUNCNAME[0]} ... done" +} + + +push_container_image() { + # """ + # Push Docker container image to registry. + # + # Authenticates using credentials from ~/.docker/passwd.$REPO_NAME.txt. + # """ + echo "# ${FUNCNAME[0]} ..." + FULL_IMAGE_NAME=$REPO_NAME/$IMAGE_NAME + echo "FULL_IMAGE_NAME=$FULL_IMAGE_NAME" + docker login --username $REPO_NAME --password-stdin <~/.docker/passwd.$REPO_NAME.txt + docker images $FULL_IMAGE_NAME + docker push $FULL_IMAGE_NAME + echo "${FUNCNAME[0]} ... done" +} + + +pull_container_image() { + # """ + # Pull Docker container image from registry. + # """ + echo "# ${FUNCNAME[0]} ..." + FULL_IMAGE_NAME=$REPO_NAME/$IMAGE_NAME + echo "FULL_IMAGE_NAME=$FULL_IMAGE_NAME" + docker pull $FULL_IMAGE_NAME + echo "${FUNCNAME[0]} ... done" +} + + +# ############################################################################# +# Docker container management +# ############################################################################# + + +kill_container() { + # """ + # Kill and remove Docker container(s) matching the current configuration. + # """ + echo "# ${FUNCNAME[0]} ..." 
+    FULL_IMAGE_NAME=$REPO_NAME/$IMAGE_NAME
+    echo "FULL_IMAGE_NAME=$FULL_IMAGE_NAME"
+    docker container ls
+    #
+    CONTAINER_ID=$(docker container ls -a | grep $FULL_IMAGE_NAME | awk '{print $1}')
+    echo "CONTAINER_ID=$CONTAINER_ID"
+    if [[ -n $CONTAINER_ID ]]; then
+        # CONTAINER_ID is left unquoted on purpose: it may hold several IDs,
+        # one per matching container, and all of them must be removed.
+        docker container rm -f $CONTAINER_ID
+        docker container ls
+    fi;
+    echo "${FUNCNAME[0]} ... done"
+}
+
+
+kill_container_by_name() {
+    # """
+    # Kill and remove a Docker container by its name.
+    #
+    # :param container_name: Name of the container to kill
+    # """
+    local container_name=$1
+    echo "# ${FUNCNAME[0]}: $container_name"
+    # Check if container exists (running or stopped).
+    # Declare and assign separately so `local` does not mask docker's status.
+    local container_id
+    container_id=$(docker container ls -a --filter "name=^${container_name}$" --format "{{.ID}}")
+    if [[ -n $container_id ]]; then
+        echo "Killing container: $container_name (ID: $container_id)"
+        docker container rm -f "$container_id"
+    else
+        echo "Container '$container_name' not found"
+    fi
+    echo "${FUNCNAME[0]} ... done"
+}
+
+
+exec_container() {
+    # """
+    # Execute bash shell in running Docker container.
+    #
+    # Opens an interactive bash session in the first running container
+    # matching the current configuration.
+    #
+    # :return: 1 if no running container matches, otherwise the status of
+    #     the final echo
+    # """
+    echo "# ${FUNCNAME[0]} ..."
+    FULL_IMAGE_NAME=$REPO_NAME/$IMAGE_NAME
+    echo "FULL_IMAGE_NAME=$FULL_IMAGE_NAME"
+    docker container ls
+    #
+    # Only running containers can be exec'ed into, so do not list stopped
+    # ones (-a), and keep a single ID: extra IDs would be passed to
+    # `docker exec` as the command to run.
+    CONTAINER_ID=$(docker container ls | grep "$FULL_IMAGE_NAME" | awk '{print $1}' | head -n 1)
+    echo "CONTAINER_ID=$CONTAINER_ID"
+    if [[ -z $CONTAINER_ID ]]; then
+        echo "ERROR: no running container found for $FULL_IMAGE_NAME" >&2
+        return 1
+    fi
+    docker exec -it "$CONTAINER_ID" bash
+    echo "${FUNCNAME[0]} ... done"
+}
+
+
+# #############################################################################
+# Docker common options
+# #############################################################################
+
+
+get_docker_common_options() {
+    # """
+    # Return docker run options common to all container types.
+    #
+    # Includes volume mount for the git root, plus environment variables for
+    # PYTHONPATH and host OS name.
+ # + # :return: docker run options string with volume mounts and env vars + # """ + echo "-v $GIT_ROOT:/git_root \ + -e PYTHONPATH=/git_root:/git_root/helpers_root:/git_root/msml610/tutorials \ + -e CSFY_GIT_ROOT_PATH=/git_root \ + -e CSFY_HOST_OS_NAME=$(uname -s) \ + -e CSFY_HOST_NAME=$(uname -n)" +} + + +# ############################################################################# +# Docker bash +# ############################################################################# + + +get_docker_bash_command() { + # """ + # Return the base docker run command for an interactive bash shell. + # + # :return: docker run command string with --rm and -ti flags + # """ + if [ -t 0 ]; then + echo "docker run --rm -ti" + else + echo "docker run --rm -i" + fi +} + + +get_docker_bash_options() { + # """ + # Return docker run options for a Docker container. + # + # :param container_name: Name for the Docker container + # :param port: Port number to forward (optional, skipped if empty) + # :param extra_opts: Additional docker run options (optional) + # :return: docker run options string with name, volume mounts, and env vars + # """ + local container_name=$1 + local port=$2 + local extra_opts=$3 + local port_opt="" + if [[ -n $port ]]; then + port_opt="-p $port:$port" + fi + echo "--name $container_name \ + $port_opt \ + $extra_opts \ + $(get_docker_common_options)" +} + + +# ############################################################################# +# Docker cmd +# ############################################################################# + + +get_docker_cmd_command() { + # """ + # Return the base docker run command for executing a non-interactive command. 
+ # + # :return: docker run command string with --rm and -i flags + # """ + echo "docker run --rm -i" +} + + +# ############################################################################# +# Docker Jupyter +# ############################################################################# + + +get_docker_jupyter_command() { + # """ + # Return the base docker run command for running Jupyter Lab interactively. + # + # :return: docker run command string with --rm and -ti flags (if TTY available) + # """ + local docker_cmd="docker run --rm" + # Add interactive and TTY flags only if stdin is a TTY. + if [[ -t 0 ]]; then + docker_cmd="$docker_cmd -ti" + fi + echo "$docker_cmd" +} + + +get_docker_jupyter_options() { + # """ + # Return docker run options for a Jupyter Lab container. + # + # :param container_name: Name for the Docker container + # :param host_port: Host port to forward to container port 8888 + # :param jupyter_use_vim: 0 or 1 to enable vim bindings + # :return: docker run options string + # """ + local container_name=$1 + local host_port=$2 + local jupyter_use_vim=$3 + # Force vim keybindings on when the current user is saggese. + if [[ "$(whoami)" == "saggese" ]]; then + echo "Overwriting jupyter_use_vim since user='saggese'" >&2 + jupyter_use_vim=1 + fi + echo "--name $container_name \ + -p $host_port:8888 \ + $(get_docker_common_options) \ + -e JUPYTER_USE_VIM=$jupyter_use_vim" +} + + +configure_jupyter_vim_keybindings() { + # """ + # Configure JupyterLab vim keybindings based on JUPYTER_USE_VIM env var. + # + # Reads JUPYTER_USE_VIM; if 1, verifies jupyterlab_vim is installed and + # writes enabled settings; otherwise writes disabled settings. + # """ + mkdir -p ~/.jupyter/lab/user-settings/@axlair/jupyterlab_vim + if [[ $JUPYTER_USE_VIM == 1 ]]; then + # Check that jupyterlab_vim is installed before trying to enable it. + if ! pip show jupyterlab_vim > /dev/null 2>&1; then + echo "ERROR: jupyterlab_vim is not installed but vim bindings were requested." 
+ echo "Install it with: pip install jupyterlab_vim" + exit 1 + fi + echo "Enabling vim." + cat <<EOF > ~/.jupyter/lab/user-settings/\@axlair/jupyterlab_vim/plugin.jupyterlab-settings +{ + "enabled": true, + "enabledInEditors": true, + "extraKeybindings": [], + "autosaveInterval": 6 +} +EOF + else + echo "Disabling vim." + cat <<EOF > ~/.jupyter/lab/user-settings/\@axlair/jupyterlab_vim/plugin.jupyterlab-settings +{ + "enabled": false, + "enabledInEditors": false, + "extraKeybindings": [], + "autosaveInterval": 6 +} +EOF + fi; +} + + +configure_jupyter_notifications() { + # """ + # Disable JupyterLab news fetching and update checks. + # """ + mkdir -p ~/.jupyter/lab/user-settings/@jupyterlab/apputils-extension + cat <<EOF > ~/.jupyter/lab/user-settings/\@jupyterlab/apputils-extension/notification.jupyterlab-settings +{ + // Notifications + // @jupyterlab/apputils-extension:notification + // Notifications settings. + + // Fetch official Jupyter news + // Whether to fetch news from the Jupyter news feed. If Always (`true`), it will make a request to a website. + "fetchNews": "false", + "checkForUpdates": false +} +EOF +} + + +configure_jupyter_autosave() { + # """ + # Configure JupyterLab global autosave interval to 6 seconds. + # """ + mkdir -p ~/.jupyter/lab/user-settings/@jupyterlab/docmanager-extension + cat <<EOF > ~/.jupyter/lab/user-settings/\@jupyterlab/docmanager-extension/plugin.jupyterlab-settings +{ + "autosaveInterval": 6 +} +EOF +} + + +check_jupytext_installed() { + # """ + # Verify that jupytext is installed before starting Jupyter Lab. + # + # Jupytext is required for pair notebook/Python file functionality. + # Exits with error if jupytext is not installed. + # """ + if ! pip show jupytext > /dev/null 2>&1; then + echo "ERROR: jupytext is not installed but is required to run Jupyter Lab." + echo "Install it with: pip install jupytext" + exit 1 + fi +} + + +setup_jupyter_environment() { + # """ + # Configure Jupyter Lab environment before launching. 
+ # + # Performs all necessary setup steps: + # - Configure vim keybindings + # - Disable notifications + # - Configure autosave interval + # - Verify jupytext is installed + # """ + configure_jupyter_vim_keybindings + configure_jupyter_notifications + configure_jupyter_autosave + check_jupytext_installed +} + + +get_jupyter_args() { + # """ + # Print the standard Jupyter Lab command-line arguments. + # + # :return: space-separated Jupyter Lab args for port 8888 with no browser, + # allow root, and no authentication + # """ + echo "--port=8888 --no-browser --ip=0.0.0.0 --allow-root --ServerApp.token='' --ServerApp.password=''" +} + + +get_run_jupyter_cmd() { + # """ + # Return the command to run run_jupyter.sh inside a container. + # + # Computes the script's path relative to GIT_ROOT and builds the + # corresponding /git_root/... path used inside the container. + # + # :param script_path: path of the calling script (pass ${BASH_SOURCE[0]}) + # :param cmd_opts: options to forward to run_jupyter.sh + # :return: full command string to run run_jupyter.sh + # """ + local script_path=$1 + local cmd_opts=$2 + local script_dir + script_dir=$(cd "$(dirname "$script_path")" && pwd) + local rel_dir="${script_dir#${GIT_ROOT}/}" + echo "/git_root/${rel_dir}/run_jupyter.sh $cmd_opts" +} + + +list_and_inspect_docker_image() { + # """ + # List available Docker images and inspect their architecture. + # + # Lists all images matching FULL_IMAGE_NAME and attempts to inspect + # their architecture using docker manifest inspect. + # """ + run "docker image ls $FULL_IMAGE_NAME" + (docker manifest inspect $FULL_IMAGE_NAME | grep arch) || true +} + + +kill_existing_container_if_forced() { + # """ + # Kill existing container if FORCE flag is set. + # + # If FORCE is set to 1, kills and removes the container with name + # CONTAINER_NAME. This is typically set by the -f flag. 
+ # """ + if [[ $FORCE == 1 ]]; then + kill_container_by_name $CONTAINER_NAME + fi +} diff --git a/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/version.sh b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/version.sh new file mode 100644 index 000000000..c46ed254c --- /dev/null +++ b/class_project/data605/Spring2026/projects/UmdTask458_DATA605_Spring2026_FastText_text_classification/version.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# """ +# Display versions of installed tools and packages. +# +# This script prints version information for Python, pip, Jupyter, and all +# installed Python packages. Used for debugging and documentation purposes +# to verify the Docker container environment setup. +# """ + +# Display Python 3 version. +echo "# Python3" +python3 --version + +# Display pip version. +echo "# pip3" +pip3 --version + +# Display Jupyter version. +echo "# jupyter" +jupyter --version + +# List all installed Python packages and their versions. +echo "# Python packages" +pip3 list + +# Template for adding additional tool versions. +# echo "# mongo" +# mongod --version