diff --git a/QuantizeLLMs.ipynb b/QuantizeLLMs.ipynb
index a5acf46..1d531b8 100644
--- a/QuantizeLLMs.ipynb
+++ b/QuantizeLLMs.ipynb
@@ -3,11 +3,11 @@
   {
    "cell_type": "markdown",
    "metadata": {
-    "id": "view-in-github",
-    "colab_type": "text"
+    "colab_type": "text",
+    "id": "view-in-github"
    },
    "source": [
-    "\"Open"
+    "\"Open"
    ]
   },
   {
@@ -29,188 +29,190 @@
    },
    "outputs": [],
    "source": [
-    "!cd llama.cpp && LLAMA_CUBLAS=1 make && pip install -r requirements.txt"
+    "!cd llama.cpp && cmake -B build -DGGML_CUDA=ON && cmake --build build --config Release"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
    "source": [
-    "from huggingface_hub import snapshot_download"
-   ],
+    "!cd llama.cpp && pip install -r requirements.txt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
    "metadata": {
     "id": "HF6yYzNZtd19"
    },
-   "execution_count": 3,
-   "outputs": []
+   "outputs": [],
+   "source": [
+    "from huggingface_hub import snapshot_download"
+   ]
   },
   {
    "cell_type": "code",
-   "source": [
-    "model_name = \"google/gemma-2b-it\""
-   ],
+   "execution_count": null,
    "metadata": {
     "id": "vw5v0tF_t6qX"
    },
-   "execution_count": 4,
-   "outputs": []
+   "outputs": [],
+   "source": [
+    "model_name = \"google/gemma-2b-it\""
+   ]
   },
   {
    "cell_type": "code",
-   "source": [
-    "methods = [\"q4_k_m\"]"
-   ],
+   "execution_count": null,
    "metadata": {
     "id": "aq5DxDOiubm-"
    },
-   "execution_count": 5,
-   "outputs": []
+   "outputs": [],
+   "source": [
+    "methods = [\"q4_k_m\"]"
+   ]
   },
   {
    "cell_type": "code",
-   "source": [
-    "base_model = \"./orignal_model/\""
-   ],
+   "execution_count": null,
    "metadata": {
     "id": "yR27LvA_uwYm"
    },
-   "execution_count": 6,
-   "outputs": []
+   "outputs": [],
+   "source": [
+    "base_model = \"./original_model/\""
+   ]
   },
   {
    "cell_type": "code",
-   "source": [
-    "quantized_path = \"./quantized_model/\""
-   ],
+   "execution_count": null,
    "metadata": {
     "id": "x4Ciwal7u5jx"
    },
-   "execution_count": 7,
-   "outputs": []
+   "outputs": [],
+   "source": [
+    "quantized_path = \"./quantized_model/\""
+   ]
   },
   {
    "cell_type": "code",
-   "source": [
-    "snapshot_download(repo_id=model_name, local_dir=base_model, local_dir_use_symlinks=False)"
-   ],
+   "execution_count": null,
    "metadata": {
     "id": "mUBryZ1wvSF6"
    },
-   "execution_count": null,
-   "outputs": []
+   "outputs": [],
+   "source": [
+    "snapshot_download(repo_id=model_name, local_dir=base_model)"
+   ]
   },
   {
    "cell_type": "code",
-   "source": [
-    "orignal_model = quantized_path+\"/fp16.gguf\""
-   ],
+   "execution_count": null,
    "metadata": {
     "id": "5oFgQ1Pgw7eD"
    },
-   "execution_count": 9,
-   "outputs": []
+   "outputs": [],
+   "source": [
+    "original_model = quantized_path+\"/FP16.gguf\""
+   ]
   },
   {
    "cell_type": "code",
-   "source": [
-    "!mkdir ./quantized_model/"
-   ],
+   "execution_count": null,
    "metadata": {
     "id": "K9ANvmOtxLEg"
    },
-   "execution_count": 10,
-   "outputs": []
+   "outputs": [],
+   "source": [
+    "!mkdir ./quantized_model/"
+   ]
   },
   {
    "cell_type": "code",
-   "source": [
-    "!python llama.cpp/convert-hf-to-gguf.py ./orignal_model/ --outtype f16 --outfile ./quantized_model/FP16.gguf"
-   ],
+   "execution_count": null,
    "metadata": {
     "id": "ue-353Tj2ZFv"
    },
-   "execution_count": null,
-   "outputs": []
+   "outputs": [],
+   "source": [
+    "!python llama.cpp/convert_hf_to_gguf.py ./original_model/ --outtype f16 --outfile ./quantized_model/FP16.gguf"
+   ]
   },
   {
    "cell_type": "code",
-   "source": [
-    "import os"
-   ],
+   "execution_count": null,
    "metadata": {
     "id": "c7XTrFZh25ww"
    },
-   "execution_count": 14,
-   "outputs": []
+   "outputs": [],
+   "source": [
+    "import os"
+   ]
   },
   {
"cell_type": "code", - "source": [ - "for m in methods:\n", - " qtype = f\"{quantized_path}/{m.upper()}.gguf\"\n", - " os.system(\"./llama.cpp/quantize \"+quantized_path+\"/FP16.gguf \"+qtype+\" \"+m)" - ], + "execution_count": null, "metadata": { "id": "vGU2XSuH4W4I" }, - "execution_count": 15, - "outputs": [] + "outputs": [], + "source": [ + "for m in methods:\n", + " qtype = f\"{quantized_path}/{m.upper()}.gguf\"\n", + " os.system(\"./llama.cpp/build/bin/llama-quantize \"+quantized_path+\"/FP16.gguf \"+qtype+\" \"+m)" + ] }, { "cell_type": "code", - "source": [ - "! ./llama.cpp/main -m ./quantized_model/Q4_K_M.gguf -n 90 --repeat_penalty 1.0 --color -i -r \"User: \" -f llama.cpp/prompts/chat-with-bob.txt" - ], + "execution_count": null, "metadata": { "id": "oML4taGB6I6l" }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "! ./llama.cpp/build/bin/llama-cli -m ./quantized_model/Q4_K_M.gguf -n 90 --repeat_penalty 1.0 --color -i -r \"User: \" -f llama.cpp/prompts/chat-with-bob.txt" + ] }, { "cell_type": "code", - "source": [ - "from huggingface_hub import HfApi, HfFolder, create_repo, upload_file" - ], + "execution_count": null, "metadata": { "id": "C3gXkTXLAetI" }, - "execution_count": 18, - "outputs": [] + "outputs": [], + "source": [ + "from huggingface_hub import HfApi, HfFolder, create_repo, upload_file" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kiMSy2PuAwG2" + }, + "outputs": [], "source": [ "model_path = \"./quantized_model/Q4_K_M.gguf\"\n", "repo_name = \"gemma-2b-it-GGUF-quantized\"\n", "repo_url = create_repo(repo_name, private=False)" - ], - "metadata": { - "id": "kiMSy2PuAwG2" - }, - "execution_count": 19, - "outputs": [] + ] }, { "cell_type": "code", - "source": [ - "api = HfApi()" - ], + "execution_count": null, "metadata": { "id": "gOfEfRgbB47f" }, - "execution_count": 20, - "outputs": [] + "outputs": [], + "source": [ + "api = HfApi()" + ] }, { "cell_type": "code", - "source": [ - "api.upload_file(\n", - " path_or_fileobj=model_path,\n", - " path_in_repo=\"Q4_K_M.gguf\",\n", - " repo_id= \"yourusername/gemma-2b-it-GGUF-quantized\",\n", - " repo_type=\"model\",\n", - ")" - ], + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -232,400 +234,43 @@ "id": "DVd1C5IxB_eG", "outputId": "e82bac0b-8ff8-4753-f451-787114ee5d94" }, - "execution_count": 21, - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/plain": [ - "Q4_K_M.gguf: 0%| | 0.00/1.63G [00:00