diff --git a/src/homeworks/homework8/texts.ipynb b/src/homeworks/homework8/texts.ipynb
new file mode 100644
index 0000000..64d2494
--- /dev/null
+++ b/src/homeworks/homework8/texts.ipynb
@@ -0,0 +1,1085 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "WYC9jXnTRvW3"
+      },
+      "source": [
+        "# Работа с текстом\n",
+        "\n",
+        "В этом домашнем задании вам предстоит поработать с текстовыми данными и научиться находить спам-сообщения!"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 1,
+      "metadata": {
+        "id": "gmljLzJDRvW4"
+      },
+      "outputs": [],
+      "source": [
+        "import inspect\n",
+        "import math\n",
+        "import random\n",
+        "import re\n",
+        "from collections import Counter, defaultdict\n",
+        "from string import punctuation\n",
+        "\n",
+        "import numpy as np\n",
+        "from nltk import SnowballStemmer, download\n",
+        "from nltk.corpus import stopwords\n",
+        "from sklearn.linear_model import LogisticRegression\n",
+        "from sklearn.metrics import classification_report, accuracy_score\n",
+        "from sklearn.model_selection import KFold, train_test_split\n",
+        "from scipy.special import logsumexp"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 2,
+      "metadata": {
+        "id": "nllPeX1xACLr"
+      },
+      "outputs": [
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "[nltk_data] Downloading package stopwords to /home/sashka/nltk_data...\n",
+            "[nltk_data]   Package stopwords is already up-to-date!\n"
+          ]
+        },
+        {
+          "data": {
+            "text/plain": [
+              "True"
+            ]
+          },
+          "execution_count": 2,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "download(\"stopwords\")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 3,
+      "metadata": {
+        "id": "XTU13-rOACLr",
+        "pycharm": {
+          "name": "#%%\n"
+        }
+      },
+      "outputs": [],
+      "source": [
+        "def set_seed(seed=42):\n",
+        "    np.random.seed(seed)\n",
+        "    random.seed(seed)\n",
+        "\n",
+        "\n",
+        "# Этой функцией будут помечены все места, которые необходимо дозаполнить\n",
+        "# Это могут быть как целые функции, так и отдельные части внутри них\n",
+        "# Всегда можно воспользоваться интроспекцией и найти места использования этой функции :)\n",
+        "def todo():\n",
+        "    stack = inspect.stack()\n",
+        "    caller_frame = stack[1]\n",
+        "    function_name = caller_frame.function\n",
+        "    line_number = caller_frame.lineno\n",
+        "    raise NotImplementedError(f\"TODO at {function_name}, line {line_number}\")\n",
+        "\n",
+        "\n",
+        "SEED = 0xC0FFEE\n",
+        "set_seed(SEED)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 4,
+      "metadata": {
+        "id": "VRJVvs51RvW4"
+      },
+      "outputs": [],
+      "source": [
+        "def read_dataset(filename):\n",
+        "    x, y = [], []\n",
+        "    with open(filename, encoding=\"utf-8\") as file:\n",
+        "        for line in file:\n",
+        "            cl, sms = re.split(r\"^(ham|spam)[\\t\\s]+(.*)$\", line)[1:3]\n",
+        "            x.append(sms)\n",
+        "            y.append(cl)\n",
+        "    return x, y"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 5,
+      "metadata": {
+        "id": "9YKoYXwsRvW5"
+      },
+      "outputs": [],
+      "source": [
+        "X, y = read_dataset(\"src/homeworks/homework8/spam.txt\")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 6,
+      "metadata": {
+        "id": "GCmIbwv6RvW5"
+      },
+      "outputs": [],
+      "source": [
+        "X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9, random_state=SEED, stratify=y)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 7,
+      "metadata": {
+        "id": "Isg1F2ClACLt",
+        "pycharm": {
+          "name": "#%%\n"
+        }
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "ham: Two fundamentals of cool life: \"Walk, like you are the KING\"...! OR \"Walk like you Dont care,whoever is the KING\"!... Gud nyt\n",
+            "ham: Haha... Where got so fast lose weight, thk muz go 4 a month den got effect... Gee,later we go aust put bk e weight.\n",
+            "ham: I wish things were different. I wonder when i will be able to show you how much i value you. Pls continue the brisk walks no drugs without askin me please and find things to laugh about. I love you dearly.\n",
+            "ham: Tmr then ü brin lar... Aiya later i come n c lar... Mayb ü neva set properly ü got da help sheet wif ü...\n",
+            "ham: For many things its an antibiotic and it can be used for chest abdomen and gynae infections even bone infections.\n"
+          ]
+        }
+      ],
+      "source": [
+        "for x_, y_ in zip(X_train[:5], y_train[:5]):\n",
+        "    print(f\"{y_}: {x_}\")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 8,
+      "metadata": {
+        "id": "kX5UHxOiACLu"
+      },
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "Counter({'ham': 4344, 'spam': 672})"
+            ]
+          },
+          "execution_count": 8,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "Counter(y_train)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "HzAcgm1rACLu"
+      },
+      "source": [
+        "## Bag of Words (2 балла)\n",
+        "\n",
+        "Реализуйте простой подсчет слов в тексте, в качестве токенизатора делите по пробелу, убрав перед этим все знаки пунктуации и приведя к нижнему регистру.\n",
+        "\n",
+        "После этого обучите простую логистическую модель, измерьте ее качество и сделайте выводы."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 9,
+      "metadata": {
+        "id": "lZsEPDLBACLu"
+      },
+      "outputs": [],
+      "source": [
+        "class BagOfWords:\n",
+        "    def __init__(self, vocabulary_size: int = 1000):\n",
+        "        \"\"\"Init Bag-of-Words instance\n",
+        "\n",
+        "        Args:\n",
+        "            vocabulary_size: maximum number of tokens in vocabulary\n",
+        "        \"\"\"\n",
+        "        \n",
+        "        self._vocabulary_size = vocabulary_size\n",
+        "        self._vocabulary: dict[str, int] = None\n",
+        "\n",
+        "    def _tokenize(self, sentence: str) -> list[str]:\n",
+        "        sentence = sentence.lower()\n",
+        "        translator = str.maketrans('', '', punctuation)\n",
+        "        sentence = sentence.translate(translator)\n",
+        "        tokens = sentence.split()\n",
+        "        \n",
+        "        return [token for token in tokens if token]\n",
+        "\n",
+        "    def fit(self, sentences: list[str]):\n",
+        "        \"\"\"Fit Bag-of-Words based on list of sentences\"\"\"\n",
+        "\n",
+        "        all_tokens = []\n",
+        "\n",
+        "        for sentence in sentences:\n",
+        "            all_tokens.extend(self._tokenize(sentence))\n",
+        "\n",
+        "        token_counts = Counter(all_tokens)\n",
+        "        most_common_tokens = token_counts.most_common(self._vocabulary_size)\n",
+        "\n",
+        "        self._vocabulary = {token: i for i, (token, _) in enumerate(most_common_tokens)}\n",
+        "\n",
+        "    def transform(self, sentences: list[str]) -> np.ndarray:\n",
+        "        \"\"\"Vectorize texts using built vocabulary\n",
+        "\n",
+        "        Args:\n",
+        "            sentences: list of sentences to vectorize\n",
+        "\n",
+        "        Return:\n",
+        "            transformed texts, matrix of (n_sentences, vocab_size)\n",
+        "        \"\"\"\n",
+        "\n",
+        "        if self._vocabulary is None:\n",
+        "            raise RuntimeError(\"Fit before transforming!\")\n",
+        "        \n",
+        "        num_features = len(self._vocabulary)\n",
+        "\n",
+        "        bow_matrix = np.zeros((len(sentences), num_features), dtype=int)\n",
+        "\n",
+        "        for i, sentence in enumerate(sentences):\n",
+        "            tokens = self._tokenize(sentence)\n",
+        "            for token in tokens:\n",
+        "                if token in self._vocabulary:\n",
+        "                    token_idx = self._vocabulary[token]\n",
+        "                    bow_matrix[i, token_idx] += 1\n",
+        "                    \n",
+        "        return bow_matrix\n",
+        "\n",
+        "    def fit_transform(self, sentences: list[str]) -> np.ndarray:\n",
+        "        self.fit(sentences)\n",
+        "        return self.transform(sentences)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 29,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "def get_accuracy(bow_model, size: int, param: dict, model) -> float:\n",
+        "    \"\"\"Score one vectorizer configuration on a hold-out part of the training data.\n",
+        "\n",
+        "    Reads the notebook-level `X_train`, `y_train` and `SEED`.\n",
+        "\n",
+        "    Args:\n",
+        "        bow_model: vectorizer class accepting `vocabulary_size` plus the keys of `param`\n",
+        "        size: vocabulary size passed to the vectorizer\n",
+        "        param: extra keyword arguments for the vectorizer\n",
+        "        model: classifier class; it is instantiated with default arguments\n",
+        "\n",
+        "    Returns:\n",
+        "        accuracy on the 10% validation split of the training data\n",
+        "    \"\"\"\n",
+        "    vectorizer = bow_model(vocabulary_size=size, **param)\n",
+        "\n",
+        "    # Fixed random_state: every configuration is scored on the same split.\n",
+        "    X_train_search, X_val_search, y_train_search, y_val_search = train_test_split(\n",
+        "        X_train, y_train, test_size=0.1, random_state=SEED, stratify=y_train\n",
+        "    )\n",
+        "\n",
+        "    X_train_bow = vectorizer.fit_transform(X_train_search)\n",
+        "    X_val_bow = vectorizer.transform(X_val_search)\n",
+        "\n",
+        "    # Use a separate name instead of rebinding the `model` parameter to an instance.\n",
+        "    classifier = model()\n",
+        "    classifier.fit(X_train_bow, y_train_search)\n",
+        "\n",
+        "    return accuracy_score(y_val_search, classifier.predict(X_val_bow))"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 30,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "def get_best_param(bow_model, params: list[dict] = None, model=LogisticRegression, sizes=None) -> dict:\n",
+        "    \"\"\"Grid-search the vocabulary size and vectorizer options by validation accuracy.\n",
+        "\n",
+        "    Args:\n",
+        "        bow_model: vectorizer class to tune\n",
+        "        params: keyword-argument sets to try for the vectorizer; `None` means defaults only\n",
+        "        model: classifier class forwarded to `get_accuracy`\n",
+        "        sizes: iterable of vocabulary sizes to try; `None` means `range(1, 3500, 100)`\n",
+        "\n",
+        "    Returns:\n",
+        "        constructor kwargs of the best configuration. On ties the first configuration\n",
+        "        tried wins; an empty dict is returned when there was nothing to try.\n",
+        "    \"\"\"\n",
+        "    if sizes is None:\n",
+        "        sizes = range(1, 3500, 100)\n",
+        "    if params is None:\n",
+        "        params = [{}]\n",
+        "\n",
+        "    best_accuracy = -1\n",
+        "    best_params = {}\n",
+        "\n",
+        "    for size in sizes:\n",
+        "        for param in params:\n",
+        "            cur_accuracy = get_accuracy(bow_model, size, param, model)\n",
+        "\n",
+        "            # Strict comparison keeps the earliest (smallest-vocabulary) configuration on ties.\n",
+        "            if cur_accuracy > best_accuracy:\n",
+        "                best_accuracy = cur_accuracy\n",
+        "                best_params = {\"vocabulary_size\": size, **param}\n",
+        "\n",
+        "    return best_params"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 12,
+      "metadata": {
+        "id": "VuSY9FEARvW5"
+      },
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "((5016, 701), (558, 701))"
+            ]
+          },
+          "execution_count": 12,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "bow_best_param = get_best_param(BagOfWords)\n",
+        "\n",
+        "bow = BagOfWords(**bow_best_param)\n",
+        "X_train_bow = bow.fit_transform(X_train)\n",
+        "X_test_bow = bow.transform(X_test)\n",
+        "\n",
+        "X_train_bow.shape, X_test_bow.shape"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 13,
+      "metadata": {},
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "{'vocabulary_size': 701}"
+            ]
+          },
+          "execution_count": 13,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "bow_best_param"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 14,
+      "metadata": {
+        "id": "_cKMLYwMACLv"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "              precision    recall  f1-score   support\n",
+            "\n",
+            "         ham       0.99      1.00      0.99       483\n",
+            "        spam       0.99      0.93      0.96        75\n",
+            "\n",
+            "    accuracy                           0.99       558\n",
+            "   macro avg       0.99      0.97      0.98       558\n",
+            "weighted avg       0.99      0.99      0.99       558\n",
+            "\n"
+          ]
+        }
+      ],
+      "source": [
+        "model = LogisticRegression()\n",
+        "model.fit(X_train_bow, y_train)\n",
+        "\n",
+        "y_pred = model.predict(X_test_bow)\n",
+        "print(classification_report(y_test, y_pred))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Оптимальный размер словаря: 701\n",
+        "\n",
+        "Метрики при нем достаточно высокие: ham определяется почти идеально. Для spam относительно низкое значение recall: модель пропускает spam."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "KI089XBzACLv"
+      },
+      "source": [
+        "## Обработка текста (1 балл)\n",
+        "\n",
+        "Добавьте на этапе токенизатора удаление стоп-слов и стемминг, для этого можно воспользоваться [`SnowballStemmer`](https://www.nltk.org/api/nltk.stem.SnowballStemmer.html) из библиотеки `nltk`.\n",
+        "\n",
+        "⚠️ `nltk` уже довольно устаревшая библиотека и скорее не рекомендуется ее использовать, однако в учебных целях более чем достаточно.\n",
+        "\n",
+        "Обучите логистическую регрессию, попробуйте по-разному комбинировать стемминг и удаление стоп-слов, сделайте выводы."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 15,
+      "metadata": {
+        "id": "yttB8gqVACLv"
+      },
+      "outputs": [],
+      "source": [
+        "class BagOfWordsStem(BagOfWords):\n",
+        "    \"\"\"Bag-of-Words whose tokenizer can drop stop-words and stems the remaining tokens.\"\"\"\n",
+        "\n",
+        "    def __init__(\n",
+        "        self,\n",
+        "        vocabulary_size: int,\n",
+        "        language: str = \"english\",\n",
+        "        ignore_stopwords: bool = True,\n",
+        "        remove_stopwords: bool = True,\n",
+        "    ):\n",
+        "        \"\"\"Create an unfitted vectorizer.\n",
+        "\n",
+        "        Args:\n",
+        "            vocabulary_size: maximum number of tokens in vocabulary\n",
+        "            language: language name used for both the stemmer and the stop-word list\n",
+        "            ignore_stopwords: keep stop-words unstemmed\n",
+        "            remove_stopwords: drop stop-words entirely; requires `ignore_stopwords`\n",
+        "\n",
+        "        Raises:\n",
+        "            ValueError: if `remove_stopwords` is set while `ignore_stopwords` is not\n",
+        "        \"\"\"\n",
+        "        super().__init__(vocabulary_size)\n",
+        "\n",
+        "        if remove_stopwords and not ignore_stopwords:\n",
+        "            raise ValueError(\"To remove stop-words they should be ignored by stemmer\")\n",
+        "\n",
+        "        self._stemmer = SnowballStemmer(language)\n",
+        "        self._stopwords = set(stopwords.words(language))\n",
+        "\n",
+        "        self._remove_stopwords = remove_stopwords\n",
+        "        self._ignore_stopwords = ignore_stopwords\n",
+        "\n",
+        "    def _tokenize(self, sentence: str) -> list[str]:\n",
+        "        \"\"\"Tokenize like the base class, then apply stop-word handling and stemming.\"\"\"\n",
+        "        result = []\n",
+        "        for token in super()._tokenize(sentence):\n",
+        "            if token not in self._stopwords:\n",
+        "                result.append(self._stemmer.stem(token))\n",
+        "            elif self._remove_stopwords:\n",
+        "                continue\n",
+        "            elif self._ignore_stopwords:\n",
+        "                result.append(token)  # stop-word kept in its original form\n",
+        "            else:\n",
+        "                result.append(self._stemmer.stem(token))\n",
+        "        return result"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 16,
+      "metadata": {
+        "id": "7ROhMn0bACLv"
+      },
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "((5016, 401), (558, 401))"
+            ]
+          },
+          "execution_count": 16,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "params = [{\"remove_stopwords\": True, \"ignore_stopwords\": True},\n",
+        "          {\"remove_stopwords\": False, \"ignore_stopwords\": True},\n",
+        "          {\"remove_stopwords\": False, \"ignore_stopwords\": False}]\n",
+        "\n",
+        "bows_best_param = get_best_param(BagOfWordsStem, params)\n",
+        "\n",
+        "bow = BagOfWordsStem(**bows_best_param)\n",
+        "X_train_bow = bow.fit_transform(X_train)\n",
+        "X_test_bow = bow.transform(X_test)\n",
+        "\n",
+        "X_train_bow.shape, X_test_bow.shape"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 17,
+      "metadata": {},
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "{'vocabulary_size': 401, 'remove_stopwords': False, 'ignore_stopwords': True}"
+            ]
+          },
+          "execution_count": 17,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "bows_best_param"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 18,
+      "metadata": {
+        "id": "yUU-BcQ1ACLv"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "              precision    recall  f1-score   support\n",
+            "\n",
+            "         ham       0.99      1.00      0.99       483\n",
+            "        spam       0.99      0.93      0.96        75\n",
+            "\n",
+            "    accuracy                           0.99       558\n",
+            "   macro avg       0.99      0.97      0.98       558\n",
+            "weighted avg       0.99      0.99      0.99       558\n",
+            "\n"
+          ]
+        }
+      ],
+      "source": [
+        "model = LogisticRegression()\n",
+        "model.fit(X_train_bow, y_train)\n",
+        "\n",
+        "y_pred = model.predict(X_test_bow)\n",
+        "print(classification_report(y_test, y_pred))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Оптимальный размер словаря: 401, remove_stopwords: False, ignore_stopwords: True\n",
+        "\n",
+        "Лучшее значение accuracy достигается при неудалении стоп-слов и их игнорировании стеммером. То есть стоп-слова сохраняются в их изначальной форме. Оптимальный размер словаря ожидаемо меньше, чем при методе без стемминга.\n",
+        "\n",
+        "Метрики не изменились, значит достигли такой же точности с меньшим словарем."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "2_IOo98iACLw"
+      },
+      "source": [
+        "## TF-IDF (2 балла)\n",
+        "\n",
+        "Доработайте предыдущий класс до полноценного Tf-Idf, затем, аналогично, проведите эксперименты с логистической регрессией."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 19,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "class Tokenizer:\n",
+        "    def __init__(\n",
+        "        self,\n",
+        "        vocabulary_size: int,\n",
+        "        language: str = \"english\",\n",
+        "        ignore_stopwords: bool = True,\n",
+        "        remove_stopwords: bool = True,\n",
+        "    ):\n",
+        "        self._vocabulary_size = vocabulary_size\n",
+        "        self._vocabulary: dict[str, int] = None\n",
+        "\n",
+        "        if remove_stopwords and not ignore_stopwords:\n",
+        "            raise ValueError(\"To remove stop-words they should be ignored by stemmer\")\n",
+        "        \n",
+        "        self._stemmer = SnowballStemmer(language)\n",
+        "        self._stopwords = set(stopwords.words(language))\n",
+        "        \n",
+        "        self._remove_stopwords = remove_stopwords\n",
+        "        self._ignore_stopwords = ignore_stopwords\n",
+        "    \n",
+        "    def tokenize(self, sentence: str) -> list[str]:\n",
+        "        sentence = sentence.lower()\n",
+        "        translator = str.maketrans('', '', punctuation)\n",
+        "        sentence = sentence.translate(translator)\n",
+        "        tokens = sentence.split()\n",
+        "        \n",
+        "        tokens = [token for token in tokens if token]\n",
+        "        processed_tokens = []\n",
+        "\n",
+        "        for token in tokens:\n",
+        "            is_stopword = token in self._stopwords\n",
+        "\n",
+        "            if is_stopword and self._remove_stopwords:\n",
+        "                continue \n",
+        "            \n",
+        "            if is_stopword and self._ignore_stopwords:\n",
+        "                processed_tokens.append(token)\n",
+        "            else:\n",
+        "                processed_tokens.append(self._stemmer.stem(token))\n",
+        "        \n",
+        "        return processed_tokens"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 20,
+      "metadata": {
+        "id": "6rzQ5sUOACLw"
+      },
+      "outputs": [],
+      "source": [
+        "class TFIDFVectorizer:\n",
+        "    def __init__(\n",
+        "        self,\n",
+        "        vocabulary_size: int,\n",
+        "        language: str = \"english\",\n",
+        "        ignore_stopwords: bool = True,\n",
+        "        remove_stopwords: bool = True,\n",
+        "        use_idf: bool = False,\n",
+        "    ):\n",
+        "        self._vocabulary_size = vocabulary_size\n",
+        "        self._vocabulary = None\n",
+        "        self._idf = None\n",
+        "        self._use_idf = use_idf\n",
+        "\n",
+        "        self._tokenizer = Tokenizer(vocabulary_size, language, ignore_stopwords, remove_stopwords)\n",
+        "\n",
+        "    def _tokenize(self, sentence: str) -> list[str]:\n",
+        "        return self._tokenizer.tokenize(sentence)\n",
+        "\n",
+        "    def fit(self, sentences: list[str]):\n",
+        "        \"\"\"Build vocabulary and compute IDF\"\"\"\n",
+        "        \n",
+        "        all_tokens = []\n",
+        "        document_frequency = defaultdict(int)\n",
+        "\n",
+        "        for sentence in sentences:\n",
+        "            tokens = self._tokenize(sentence)\n",
+        "            all_tokens.extend(tokens)\n",
+        "            \n",
+        "            for token in set(tokens):\n",
+        "                document_frequency[token] += 1\n",
+        "\n",
+        "        token_counts = Counter(all_tokens)\n",
+        "        most_common_tokens = token_counts.most_common(self._vocabulary_size)\n",
+        "\n",
+        "        self._vocabulary = {token: i for i, (token, _) in enumerate(most_common_tokens)}\n",
+        "\n",
+        "        n_sentences = len(sentences)\n",
+        "        \n",
+        "        if self._use_idf:\n",
+        "            self._idf = np.zeros(len(self._vocabulary))\n",
+        "            for token, i in self._vocabulary.items():\n",
+        "                n_w = document_frequency.get(token, 0)\n",
+        "                self._idf[i] = np.log(n_sentences / (n_w + 1.0)) + 1.0\n",
+        "        else:\n",
+        "            self._idf = np.ones(len(self._vocabulary))\n",
+        "\n",
+        "    def transform(self, sentences: list[str]) -> np.ndarray:\n",
+        "        \"\"\"Transform sentences to TF-IDF vectors\"\"\"\n",
+        "\n",
+        "        n_sentences = len(sentences)\n",
+        "        n_features = len(self._vocabulary)\n",
+        "        tfidf_matrix = np.zeros((n_sentences, n_features), dtype=float)\n",
+        "\n",
+        "        for i, sentence in enumerate(sentences):\n",
+        "            tokens = self._tokenize(sentence)\n",
+        "            if not tokens:\n",
+        "                continue\n",
+        "            \n",
+        "            sentence_len = len(tokens)\n",
+        "            token_counts_in_sentence = Counter(tokens)\n",
+        "\n",
+        "            for token, count in token_counts_in_sentence.items():\n",
+        "                if token in self._vocabulary:\n",
+        "                    token_idx = self._vocabulary[token]\n",
+        "                    \n",
+        "                    if self._use_idf:\n",
+        "                        tfidf_matrix[i, token_idx] = count / sentence_len * self._idf[token_idx]\n",
+        "                    else:\n",
+        "                        tfidf_matrix[i, token_idx] = count\n",
+        "                    \n",
+        "        return tfidf_matrix\n",
+        "\n",
+        "    def fit_transform(self, sentences: list[str]) -> np.ndarray:\n",
+        "        self.fit(sentences)\n",
+        "        return self.transform(sentences)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 21,
+      "metadata": {},
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "((5016, 701), (558, 701))"
+            ]
+          },
+          "execution_count": 21,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "params = [\n",
+        "    {\"remove_stopwords\": True, \"ignore_stopwords\": True, \"use_idf\": True},\n",
+        "    {\"remove_stopwords\": False, \"ignore_stopwords\": True, \"use_idf\": True},\n",
+        "    {\"remove_stopwords\": False, \"ignore_stopwords\": False, \"use_idf\": True}\n",
+        "]\n",
+        "tfidf_best_param = get_best_param(TFIDFVectorizer, params)\n",
+        "\n",
+        "tfidf = TFIDFVectorizer(**tfidf_best_param)\n",
+        "X_train_tfidf = tfidf.fit_transform(X_train)\n",
+        "X_test_tfidf = tfidf.transform(X_test)\n",
+        "\n",
+        "X_train_tfidf.shape, X_test_tfidf.shape"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 22,
+      "metadata": {},
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "{'vocabulary_size': 701,\n",
+              " 'remove_stopwords': False,\n",
+              " 'ignore_stopwords': True,\n",
+              " 'use_idf': True}"
+            ]
+          },
+          "execution_count": 22,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "tfidf_best_param"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 23,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "tfidf = TFIDFVectorizer(**tfidf_best_param)\n",
+        "X_train_tfidf = tfidf.fit_transform(X_train)\n",
+        "X_test_tfidf = tfidf.transform(X_test)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 24,
+      "metadata": {
+        "id": "sg_Gac-jACLw"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "              precision    recall  f1-score   support\n",
+            "\n",
+            "         ham       0.97      1.00      0.98       483\n",
+            "        spam       0.97      0.83      0.89        75\n",
+            "\n",
+            "    accuracy                           0.97       558\n",
+            "   macro avg       0.97      0.91      0.94       558\n",
+            "weighted avg       0.97      0.97      0.97       558\n",
+            "\n"
+          ]
+        }
+      ],
+      "source": [
+        "model = LogisticRegression()\n",
+        "model.fit(X_train_tfidf, y_train)\n",
+        "\n",
+        "y_pred = model.predict(X_test_tfidf)\n",
+        "print(classification_report(y_test, y_pred))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "При использовании idf:\n",
+        "\n",
+        "Оптимальный размер словаря: 701, remove_stopwords: False, ignore_stopwords: True\n",
+        "\n",
+        "Лучшее значение accuracy, так же как и для BagOfWordsStem, достигается при неудалении стоп-слов и их игнорировании стеммером.\n",
+        "\n",
+        "Метрики в целом снизились. Скорее всего это связано с тем, что слова, которые часто встречаются в спаме (низкий IDF), являются сильными идентификаторами спама, а TF-IDF снижает их значимость."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "oJe5cDxDRvW5"
+      },
+      "source": [
+        "## NaiveBayes (5 баллов)\n",
+        "\n",
+        "Наивный байесовский классификатор — это простой и эффективный алгоритм машинного обучения, основанный на теореме Байеса с наивным предположением независимости признаков.\n",
+        "\n",
+        "### Формула Байеса\n",
+        "\n",
+        "$$\n",
+        "P(A|B) = \\frac{P(B|A) \\cdot P(A)}{P(B)}\n",
+        "$$\n",
+        "\n",
+        "В контексте классификации текста это значит: $P(класс | документ) \\propto P(класс) \\cdot P(документ | класс)$\n",
+        "\n",
+        "Почему \"наивность\"? Потому что предполагаем, что все слова независимы:\n",
+        "\n",
+        "$$\n",
+        "P(w_1, w_2, \\dots | class) = P(w_1 | class) \\cdot P(w_2 | class) \\cdot \\dots\n",
+        "$$\n",
+        "\n",
+        "### Классификация текста\n",
+        "\n",
+        "Таким образом, для классификации текста необходимо:\n",
+        "\n",
+        "1. Вычислить априорную вероятность класса: $P(class)$, доля документов с таким классом\n",
+        "2. Вычислить правдоподобие: $P(text | class) = \\prod_{i=1}^n P(w_i | class)$\n",
+        "\n",
+        "_Примечание:_ $P(w_i | class)$ — это частота слова в данном классе относительно всех слов в классе, при этом зачастую добавляют сглаживание Лапласа в качестве регуляризатора\n",
+        "$$\n",
+        "P(w_i | class) = \\frac{\\text{частота слова в классе} + \\alpha}{\\text{сумма всех слов в классе} + \\alpha \\cdot |V|}\n",
+        "$$\n",
+        "\n",
+        "После этого, необходимо выбрать наиболее вероятный класс для данного текста:\n",
+        "\n",
+        "$$\n",
+        "class = \\arg \\max\\limits_{c} \\Big[ P(c) \\cdot P(text | c) \\Big] = \\arg \\max\\limits_{c} \\Big[ \\log P(c) + \\sum_{i=1}^n \\log P(w_i | c) \\Big]\n",
+        "$$\n",
+        "\n",
+        "### Реализация\n",
+        "\n",
+        "`fit(X, y)` - оценивает параметры распределения `p(x|y)` для каждого `y`.\n",
+        "\n",
+        "`log_proba(X)` - для каждого элемента набора `X` считает логарифм вероятности отнести его к каждому классу."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 25,
+      "metadata": {
+        "id": "cQL-8wxwRvW5"
+      },
+      "outputs": [],
+      "source": [
+        "class NaiveBayes:\n",
+        "\n",
+        "    def __init__(self, alpha: float = 1.0):\n",
+        "        \"\"\"\n",
+        "        Args:\n",
+        "            alpha: regularization coefficient\n",
+        "        \"\"\"\n",
+        "        self.alpha = alpha\n",
+        "        self._classes = None  # [n classes]\n",
+        "        self._vocab_size = None  # int\n",
+        "        self._log_p_y = None  # [n classes]\n",
+        "        self._log_p_x_y = None  # [n classes, vocab size]\n",
+        "\n",
+        "    def fit(self, features: np.ndarray, targets: list[str]):\n",
+        "        \"\"\"Estimate p(x|y) and p(y) based on data\n",
+        "\n",
+        "        Args:\n",
+        "            features, [n samples; vocab size]: input features\n",
+        "            targets, [n samples]: targets\n",
+        "        \"\"\"\n",
+        "        targets = np.array(targets)\n",
+        "        \n",
+        "        self._classes = np.unique(targets)\n",
+        "        n_classes = len(self._classes)\n",
+        "        n_samples, self._vocab_size = features.shape\n",
+        "\n",
+        "        self._log_p_y = np.zeros(n_classes, dtype=np.float64)\n",
+        "        self._log_p_x_y = np.zeros((n_classes, self._vocab_size), dtype=np.float64)\n",
+        "\n",
+        "        for i, cls in enumerate(self._classes):\n",
+        "            features_cls = features[targets == cls]\n",
+        "\n",
+        "            n_samples_in_class = features_cls.shape[0]\n",
+        "            \n",
+        "            if n_samples_in_class == 0:\n",
+        "                self._log_p_y[i] = -np.inf\n",
+        "            else:\n",
+        "                self._log_p_y[i] = np.log(n_samples_in_class / n_samples)\n",
+        "\n",
+        "            feature_counts_in_class = np.sum(features_cls, axis=0)\n",
+        "            total_features_in_class = np.sum(feature_counts_in_class)\n",
+        "\n",
+        "            numerator = feature_counts_in_class + self.alpha\n",
+        "            denominator = total_features_in_class + self.alpha * self._vocab_size\n",
+        "\n",
+        "            if denominator == 0:\n",
+        "                self._log_p_x_y[i, :] = -np.inf\n",
+        "            else:\n",
+        "                self._log_p_x_y[i, :] = np.log(numerator / denominator)\n",
+        "\n",
+        "\n",
+        "    def predict(self, features: np.ndarray) -> np.ndarray:\n",
+        "        \"\"\"Predict class for each sample\n",
+        "\n",
+        "        Args:\n",
+        "            features, [n samples; vocab size]: feature to predict\n",
+        "        Return:\n",
+        "            classes, [n samples]: predicted class\n",
+        "        \"\"\"\n",
+        "        log_probabilities = self.log_proba(features)\n",
+        "        predicted_class_indices = np.argmax(log_probabilities, axis=1)\n",
+        "        predicted_classes = self._classes[predicted_class_indices]\n",
+        "        return predicted_classes\n",
+        "\n",
+        "    def log_proba(self, features: np.ndarray) -> np.ndarray:\n",
+        "        \"\"\"Calculate p(y|x) for each class and each sample\n",
+        "\n",
+        "        Args:\n",
+        "            features, [n samples; vocab size]: feature to predict\n",
+        "        Return:\n",
+        "            classes, [n samples;  n classes]: log proba for each class\n",
+        "        \"\"\"\n",
+        "        if self._vocab_size is None:\n",
+        "            raise RuntimeError(\"Fit classifier before predicting something\")\n",
+        "        if features.shape[1] != self._vocab_size:\n",
+        "            raise RuntimeError(\n",
+        "                f\"Unexpected size of vocabulary, expected {self._vocab_size}, actual {features.shape[1]}\"\n",
+        "            )\n",
+        "\n",
+        "        n_samples = features.shape[0]\n",
+        "        log_probabilities = (features @ self._log_p_x_y.T) + self._log_p_y[np.newaxis, :]\n",
+        "        log_p_x = logsumexp(log_probabilities, axis=1, keepdims=True)\n",
+        "        log_posterior_proba = log_probabilities - log_p_x\n",
+        "        return log_posterior_proba"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 34,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Stop-word option combinations handed to get_best_param for NaiveBayes.\n",
+        "params = [\n",
+        "    {\"remove_stopwords\": remove, \"ignore_stopwords\": ignore}\n",
+        "    for remove, ignore in ((True, True), (False, True), (False, False))\n",
+        "]\n",
+        "\n",
+        "bayes_bows_best_param = get_best_param(BagOfWordsStem, params, model=NaiveBayes)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 35,
+      "metadata": {},
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "{'vocabulary_size': 1001, 'remove_stopwords': False, 'ignore_stopwords': True}"
+            ]
+          },
+          "execution_count": 35,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "bayes_bows_best_param"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 36,
+      "metadata": {
+        "id": "6YJEuNYRACLx"
+      },
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "((5016, 1001), (558, 1001))"
+            ]
+          },
+          "execution_count": 36,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "bow = BagOfWordsStem(**bayes_bows_best_param)\n",
+        "X_train_bow = bow.fit_transform(X_train)\n",
+        "X_test_bow = bow.transform(X_test)\n",
+        "\n",
+        "X_train_bow.shape, X_test_bow.shape"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 37,
+      "metadata": {
+        "id": "spb2TAg1ACLx"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "              precision    recall  f1-score   support\n",
+            "\n",
+            "         ham       0.99      0.99      0.99       483\n",
+            "        spam       0.93      0.95      0.94        75\n",
+            "\n",
+            "    accuracy                           0.98       558\n",
+            "   macro avg       0.96      0.97      0.97       558\n",
+            "weighted avg       0.98      0.98      0.98       558\n",
+            "\n"
+          ]
+        }
+      ],
+      "source": [
+        "model = NaiveBayes(alpha=1.0)\n",
+        "model.fit(X_train_bow, y_train)\n",
+        "\n",
+        "y_pred = model.predict(X_test_bow)\n",
+        "print(classification_report(y_test, y_pred))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Accuracy модели составила 0.98.\n",
+        "\n",
+        "Recall для spam достиг 0.95, что выше, чем у моделей на основе LogisticRegression. Это значит, что NaiveBayes лучше других моделей находит спам-сообщения и пропускает меньше из них. При этом precision для spam ниже, чем у LogisticRegression, что указывает на большее количество ложных срабатываний. Метрики для ham остаются высокими.\n",
+        "\n",
+        "Скорее всего, это происходит из-за того, что NaiveBayes предполагает, что все слова-признаки независимы друг от друга при условии класса. Некоторые слова могут часто встречаться вместе в спам-сообщениях, и их комбинация является сильным индикатором спама. Логистическая регрессия не делает такого предположения и может лучше учитывать такие зависимости: она учится находить разделяющую гиперплоскость, и совместная встречаемость признаков влияет на её положение."
+      ]
+    }
+  ],
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": ".venv",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.12.3"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
diff --git a/src/homeworks/homework8/texts.pdf b/src/homeworks/homework8/texts.pdf
new file mode 100644
index 0000000..c96576b
Binary files /dev/null and b/src/homeworks/homework8/texts.pdf differ