def set_seed(seed=42):
    """Seed NumPy's global RNG and the stdlib ``random`` module.

    Args:
        seed: value handed unchanged to both seeding functions.
    """
    # NumPy first, then ``random`` -- same order as before.
    for seeder in (np.random.seed, random.seed):
        seeder(seed)
def todo():
    """Mark a place that still has to be implemented.

    Every spot that must be filled in (a whole function or only a part of
    one) calls this helper, so searching for its call sites lists what is
    left to do.

    Raises:
        NotImplementedError: always; the message names the calling function
            and the line number of the call.
    """
    # stack[0] is this frame, stack[1] is whoever called todo().
    caller_frame = inspect.stack()[1]
    function_name = caller_frame.function
    line_number = caller_frame.lineno
    raise NotImplementedError(f"TODO at {function_name}, line {line_number}")


def read_dataset(filename):
    """Read a labelled SMS corpus with one ``<label><whitespace><text>`` per line.

    Args:
        filename: path to a UTF-8 text file; every non-blank line must start
            with ``ham`` or ``spam`` followed by whitespace and the message.

    Returns:
        A pair ``(x, y)`` of equally long lists: message texts (without the
        trailing newline) and their labels (``"ham"`` / ``"spam"``).

    Raises:
        ValueError: if a non-blank line does not start with a known label
            followed by whitespace. The message names the file and line.
    """
    # Compiled once, outside the per-line loop.
    line_pattern = re.compile(r"^(ham|spam)\s+(.*)$")

    x, y = [], []
    with open(filename, encoding="utf-8") as file:
        for line_number, line in enumerate(file, start=1):
            # Blank lines (e.g. a trailing newline at EOF) carry no sample.
            if not line.strip():
                continue

            match = line_pattern.match(line)
            if match is None:
                raise ValueError(
                    f"{filename}:{line_number}: expected "
                    f"'<ham|spam><whitespace><text>', got {line!r}"
                )

            cl, sms = match.groups()
            x.append(sms)
            y.append(cl)
    return x, y
Pls continue the brisk walks no drugs without askin me please and find things to laugh about. I love you dearly.\n", + "ham: Tmr then ü brin lar... Aiya later i come n c lar... Mayb ü neva set properly ü got da help sheet wif ü...\n", + "ham: For many things its an antibiotic and it can be used for chest abdomen and gynae infections even bone infections.\n" + ] + } + ], + "source": [ + "for x_, y_ in zip(X_train[:5], y_train[:5]):\n", + " print(f\"{y_}: {x_}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "id": "kX5UHxOiACLu" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Counter({'ham': 4344, 'spam': 672})" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Counter(y_train)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HzAcgm1rACLu" + }, + "source": [ + "## Bag of Words (2 балла)\n", + "\n", + "Реализуйте простой подсчет слов в тексте, в качестве токенизатора делите по пробелу, убрав перед этим все знаки пунктуации и приведя к нижнему регистру.\n", + "\n", + "После этого обучите простую логистическую модель, измерьте ее качество и сделайте выводы." 
class BagOfWords:
    """Count vectorizer over the most frequent tokens of the fitted corpus.

    Tokens are obtained by lower-casing a sentence, deleting every character
    listed in ``string.punctuation`` and splitting on whitespace.
    """

    # Deletion table for punctuation; built once instead of once per sentence.
    _PUNCTUATION_TABLE = str.maketrans("", "", punctuation)

    def __init__(self, vocabulary_size: int = 1000):
        """Init Bag-of-Words instance

        Args:
            vocabulary_size: maximum number of tokens in vocabulary
        """
        self._vocabulary_size = vocabulary_size
        # token -> column index; stays None until fit() is called.
        self._vocabulary: dict[str, int] | None = None

    def _tokenize(self, sentence: str) -> list[str]:
        """Lower-case, strip punctuation and split a sentence on whitespace."""
        # str.split() without arguments never yields empty strings, so no
        # additional filtering of the result is required.
        return sentence.lower().translate(self._PUNCTUATION_TABLE).split()

    def fit(self, sentences: list[str]):
        """Fit Bag-of-Words based on list of sentences

        Keeps the ``vocabulary_size`` most frequent tokens; a token's column
        index is its rank in that frequency ordering.
        """
        # Count incrementally instead of materialising every token in one list.
        token_counts = Counter()
        for sentence in sentences:
            token_counts.update(self._tokenize(sentence))

        most_common_tokens = token_counts.most_common(self._vocabulary_size)
        self._vocabulary = {token: i for i, (token, _) in enumerate(most_common_tokens)}

    def transform(self, sentences: list[str]) -> np.ndarray:
        """Vectorize texts using built vocabulary

        Args:
            sentences: list of sentences to vectorize

        Return:
            transformed texts, integer matrix of (n_sentences, vocab_size);
            tokens outside the vocabulary are ignored

        Raises:
            RuntimeError: if called before ``fit``
        """
        if self._vocabulary is None:
            raise RuntimeError("Fit before transforming!")

        bow_matrix = np.zeros((len(sentences), len(self._vocabulary)), dtype=int)

        for row, sentence in enumerate(sentences):
            for token in self._tokenize(sentence):
                column = self._vocabulary.get(token)
                if column is not None:
                    bow_matrix[row, column] += 1

        return bow_matrix

    def fit_transform(self, sentences: list[str]) -> np.ndarray:
        """Fit on ``sentences`` and return their vectorized form."""
        self.fit(sentences)
        return self.transform(sentences)


def get_accuracy(bow_model, size: int, param: dict, model) -> float:
    """Score one vectorizer/classifier configuration on a validation split.

    The module-level ``X_train`` / ``y_train`` are split 90/10 (stratified,
    ``random_state=SEED``). The vectorizer is fitted on the larger part only
    and the classifier is evaluated on the smaller one.

    Args:
        bow_model: vectorizer class accepting ``vocabulary_size`` and ``**param``
        size: vocabulary size passed to the vectorizer
        param: extra keyword arguments for the vectorizer
        model: classifier class; instantiated without arguments

    Return:
        accuracy on the validation part
    """
    vectorizer = bow_model(vocabulary_size=size, **param)

    X_train_search, X_val_search, y_train_search, y_val_search = train_test_split(
        X_train, y_train, test_size=0.1, random_state=SEED, stratify=y_train
    )

    X_train_bow = vectorizer.fit_transform(X_train_search)
    X_val_bow = vectorizer.transform(X_val_search)

    classifier = model()
    classifier.fit(X_train_bow, y_train_search)

    y_pred = classifier.predict(X_val_bow)
    return accuracy_score(y_val_search, y_pred)


def get_best_param(bow_model, params: list[dict] = None, model=None, sizes=None) -> dict:
    """Grid-search vocabulary size and vectorizer options by validation accuracy.

    Args:
        bow_model: vectorizer class, see ``get_accuracy``
        params: candidate keyword-argument dicts for the vectorizer;
            ``None`` means a single empty configuration
        model: classifier class; ``None`` means ``LogisticRegression``
            (resolved at call time)
        sizes: iterable of vocabulary sizes to try; ``None`` means
            ``range(1, 3500, 100)``

    Return:
        the best configuration as ``{"vocabulary_size": size, **param}``;
        on ties the configuration tried first wins
    """
    if params is None:
        params = [{}]
    if model is None:
        model = LogisticRegression
    if sizes is None:
        sizes = range(1, 3500, 100)

    best_accuracy = -1
    best_params = {}

    for size in sizes:
        for param in params:
            cur_accuracy = get_accuracy(bow_model, size, param, model)

            # Strict comparison: an equally good later candidate does not
            # replace the earlier (smaller-vocabulary) one.
            if cur_accuracy > best_accuracy:
                best_accuracy = cur_accuracy
                best_params = {"vocabulary_size": size, **param}

    return best_params
"X_test_bow = bow.transform(X_test)\n", + "\n", + "X_train_bow.shape, X_test_bow.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'vocabulary_size': 701}" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bow_best_param" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "id": "_cKMLYwMACLv" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " ham 0.99 1.00 0.99 483\n", + " spam 0.99 0.93 0.96 75\n", + "\n", + " accuracy 0.99 558\n", + " macro avg 0.99 0.97 0.98 558\n", + "weighted avg 0.99 0.99 0.99 558\n", + "\n" + ] + } + ], + "source": [ + "model = LogisticRegression()\n", + "model.fit(X_train_bow, y_train)\n", + "\n", + "y_pred = model.predict(X_test_bow)\n", + "print(classification_report(y_test, y_pred))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Оптимальный размер словаря: 701\n", + "\n", + "Метрики при нем достаточно высокие: ham определяется почти идеально. Для spam относительно низкое значение recall: модель пропускает spam." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KI089XBzACLv" + }, + "source": [ + "## Обработка текста (1 балл)\n", + "\n", + "Добавьте на этапе токенизатора удаление стоп-слов и стемминг, для этого можно воспользоваться [`SnowballStemmer`](https://www.nltk.org/api/nltk.stem.SnowballStemmer.html) из библиотеки `nltk`.\n", + "\n", + "⚠️ `nltk` уже довольно устаревшая библиотека и скорее не рекомендуется ее использовать, однако в учебных целях более чем достаточно.\n", + "\n", + "Обучите логистическую регрессию, попробуйте по-разному комбинировать стемминг и удаление стоп-слов, сделайте выводы." 
class BagOfWordsStem(BagOfWords):
    """Bag-of-Words whose tokens are stemmed and optionally stop-word filtered."""

    def __init__(
        self,
        vocabulary_size: int,
        language: str = "english",
        ignore_stopwords: bool = True,
        remove_stopwords: bool = True,
    ):
        """Init stemming Bag-of-Words instance

        Args:
            vocabulary_size: maximum number of tokens in vocabulary
            language: language for both the Snowball stemmer and the stop-word list
            ignore_stopwords: keep stop-words unstemmed
            remove_stopwords: drop stop-words from the token stream

        Raises:
            ValueError: if stop-words should be removed but not ignored
        """
        super().__init__(vocabulary_size)

        if remove_stopwords and not ignore_stopwords:
            raise ValueError("To remove stop-words they should be ignored by stemmer")

        self._stemmer = SnowballStemmer(language)
        self._stopwords = set(stopwords.words(language))

        self._remove_stopwords = remove_stopwords
        self._ignore_stopwords = ignore_stopwords

    def _tokenize(self, sentence: str) -> list[str]:
        """Tokenize like the base class, then stem / filter each token."""
        result = []

        for word in super()._tokenize(sentence):
            if word not in self._stopwords:
                # Ordinary words are always stemmed.
                result.append(self._stemmer.stem(word))
            elif self._remove_stopwords:
                # Stop-word that has to be dropped.
                continue
            elif self._ignore_stopwords:
                # Stop-word kept in its original form.
                result.append(word)
            else:
                # Stop-word treated like any other word.
                result.append(self._stemmer.stem(word))

        return result
bow.transform(X_test)\n", + "\n", + "X_train_bow.shape, X_test_bow.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'vocabulary_size': 401, 'remove_stopwords': False, 'ignore_stopwords': True}" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bows_best_param" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "id": "yUU-BcQ1ACLv" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " ham 0.99 1.00 0.99 483\n", + " spam 0.99 0.93 0.96 75\n", + "\n", + " accuracy 0.99 558\n", + " macro avg 0.99 0.97 0.98 558\n", + "weighted avg 0.99 0.99 0.99 558\n", + "\n" + ] + } + ], + "source": [ + "model = LogisticRegression()\n", + "model.fit(X_train_bow, y_train)\n", + "\n", + "y_pred = model.predict(X_test_bow)\n", + "print(classification_report(y_test, y_pred))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Оптимальный размер словаря: 401, remove_stopwords: False, ignore_stopwords: True\n", + "\n", + "Лучшее значение accuracy достигается при неудалении стоп-слов и их игнорировании стеммером. То есть стоп-слова сохраняются в их изначальной форме. Оптимальный размер словаря ожидаемо меньше, чем при методе без стемминга.\n", + "\n", + "Метрики не изменились, значит достигли такой же точности с меньшим словарем." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2_IOo98iACLw" + }, + "source": [ + "## TF-IDF (2 балла)\n", + "\n", + "Доработайте предыдущий класс до полноценного Tf-Idf, затем, аналогично, проведите эксперименты с логистической регрессией." 
class Tokenizer:
    """Sentence tokenizer with punctuation stripping, stemming and stop-word handling."""

    # Deletion table for punctuation; built once instead of once per sentence.
    _PUNCTUATION_TABLE = str.maketrans("", "", punctuation)

    def __init__(
        self,
        vocabulary_size: int,
        language: str = "english",
        ignore_stopwords: bool = True,
        remove_stopwords: bool = True,
    ):
        """Init tokenizer

        Args:
            vocabulary_size: stored on the instance; not used by ``tokenize``
            language: language for both the Snowball stemmer and the stop-word list
            ignore_stopwords: keep stop-words unstemmed
            remove_stopwords: drop stop-words from the token stream

        Raises:
            ValueError: if stop-words should be removed but not ignored
        """
        self._vocabulary_size = vocabulary_size
        self._vocabulary: dict[str, int] | None = None

        if remove_stopwords and not ignore_stopwords:
            raise ValueError("To remove stop-words they should be ignored by stemmer")

        self._stemmer = SnowballStemmer(language)
        self._stopwords = set(stopwords.words(language))

        self._remove_stopwords = remove_stopwords
        self._ignore_stopwords = ignore_stopwords

    def tokenize(self, sentence: str) -> list[str]:
        """Lower-case, strip punctuation, split on whitespace, then stem / filter."""
        # str.split() without arguments never yields empty strings.
        tokens = sentence.lower().translate(self._PUNCTUATION_TABLE).split()
        processed_tokens = []

        for token in tokens:
            is_stopword = token in self._stopwords

            if is_stopword and self._remove_stopwords:
                continue

            if is_stopword and self._ignore_stopwords:
                # Stop-word kept in its original (unstemmed) form.
                processed_tokens.append(token)
            else:
                processed_tokens.append(self._stemmer.stem(token))

        return processed_tokens


class TFIDFVectorizer:
    """Vectorizer producing TF-IDF weights, or raw counts when ``use_idf`` is off."""

    def __init__(
        self,
        vocabulary_size: int,
        language: str = "english",
        ignore_stopwords: bool = True,
        remove_stopwords: bool = True,
        use_idf: bool = False,
    ):
        """Init vectorizer

        Args:
            vocabulary_size: maximum number of tokens in vocabulary
            language, ignore_stopwords, remove_stopwords: forwarded to ``Tokenizer``
            use_idf: if True ``transform`` returns ``tf * idf`` with
                ``tf = count / sentence length``; otherwise raw token counts
        """
        self._vocabulary_size = vocabulary_size
        # token -> column index; stays None until fit() is called.
        self._vocabulary = None
        self._idf = None
        self._use_idf = use_idf

        self._tokenizer = Tokenizer(vocabulary_size, language, ignore_stopwords, remove_stopwords)

    def _tokenize(self, sentence: str) -> list[str]:
        return self._tokenizer.tokenize(sentence)

    def fit(self, sentences: list[str]):
        """Build vocabulary and compute IDF

        The vocabulary holds the ``vocabulary_size`` most frequent tokens.
        With ``use_idf`` the weight of a token is
        ``log(n_sentences / (document_frequency + 1)) + 1``; otherwise all
        weights are 1.
        """
        token_counts = Counter()
        document_frequency = Counter()

        for sentence in sentences:
            tokens = self._tokenize(sentence)
            token_counts.update(tokens)
            # set(): a token counts once per sentence for document frequency.
            document_frequency.update(set(tokens))

        most_common_tokens = token_counts.most_common(self._vocabulary_size)
        self._vocabulary = {token: i for i, (token, _) in enumerate(most_common_tokens)}

        n_sentences = len(sentences)

        if self._use_idf:
            # The dict iterates in insertion order, i.e. by column index.
            n_w = np.array([document_frequency[token] for token in self._vocabulary], dtype=float)
            self._idf = np.log(n_sentences / (n_w + 1.0)) + 1.0
        else:
            self._idf = np.ones(len(self._vocabulary))

    def transform(self, sentences: list[str]) -> np.ndarray:
        """Transform sentences to TF-IDF vectors

        Return:
            float matrix of (n_sentences, vocab_size). Sentences without
            tokens give an all-zero row. The sentence length used for TF
            counts every token, including those outside the vocabulary.

        Raises:
            RuntimeError: if called before ``fit``
        """
        if self._vocabulary is None:
            # Same contract as BagOfWords instead of an opaque TypeError.
            raise RuntimeError("Fit before transforming!")

        n_sentences = len(sentences)
        n_features = len(self._vocabulary)
        tfidf_matrix = np.zeros((n_sentences, n_features), dtype=float)

        for i, sentence in enumerate(sentences):
            tokens = self._tokenize(sentence)
            if not tokens:
                continue

            sentence_len = len(tokens)

            for token, count in Counter(tokens).items():
                token_idx = self._vocabulary.get(token)
                if token_idx is None:
                    continue

                if self._use_idf:
                    tfidf_matrix[i, token_idx] = count / sentence_len * self._idf[token_idx]
                else:
                    tfidf_matrix[i, token_idx] = count

        return tfidf_matrix

    def fit_transform(self, sentences: list[str]) -> np.ndarray:
        """Fit on ``sentences`` and return their vectorized form."""
        self.fit(sentences)
        return self.transform(sentences)
0.97 0.83 0.89 75\n", + "\n", + " accuracy 0.97 558\n", + " macro avg 0.97 0.91 0.94 558\n", + "weighted avg 0.97 0.97 0.97 558\n", + "\n" + ] + } + ], + "source": [ + "model = LogisticRegression()\n", + "model.fit(X_train_tfidf, y_train)\n", + "\n", + "y_pred = model.predict(X_test_tfidf)\n", + "print(classification_report(y_test, y_pred))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "При использовании idf:\n", + "\n", + "Оптимальный размер словаря: 701, remove_stopwords: False, ignore_stopwords: True\n", + "\n", + "Лучшее значение accuracy, так же как и для BagOfWordsStem, достигается при неудалении стоп-слов и их игнорировании стеммером.\n", + "\n", + "Метрики в целом снизились. Скорее всего это связано с тем, что слова, которые часто встречаются в спаме (низкий IDF), являются сильными идентификаторами спама, а TF-IDF снижает их значимость." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oJe5cDxDRvW5" + }, + "source": [ + "## NaiveBayes (5 баллов)\n", + "\n", + "Наивный байесовский классификатор — это простой и эффективный алгоритм машинного обучения, основанный на теореме Байеса с наивным предположением независимости признаков.\n", + "\n", + "### Формула Байеса\n", + "\n", + "$$\n", + "P(A|B) = \\frac{P(B|A) \\cdot P(A)}{P(B)}\n", + "$$\n", + "\n", + "В контексте классификации текста это значит: $P(класс | документ) \\propto P(класс) \\cdot P(документ | класс)$\n", + "\n", + "Почему \"наивность\"? Потому что предполагаем, что все слова независимы:\n", + "\n", + "$$\n", + "P(w_1, w_2, \\dots | class) = P(w_1 | class) \\cdot P(w_2 | class) \\cdot \\dots\n", + "$$\n", + "\n", + "### Классификация текста\n", + "\n", + "Таким образом, для классификации текста необходимо:\n", + "\n", + "1. Вычислить априорную вероятность класса: $P(class)$, доля документов с таким классом\n", + "2. 
class NaiveBayes:
    """Multinomial Naive Bayes classifier over token-count features.

    ``fit`` estimates log p(y) and the smoothed log p(x|y) for every class.
    ``log_proba`` returns the normalised log posterior for every sample and
    ``predict`` picks the class with the highest one.
    """

    def __init__(self, alpha: float = 1.0):
        """
        Args:
            alpha: regularization coefficient (additive smoothing)
        """
        self.alpha = alpha
        self._classes = None  # [n classes]
        self._vocab_size = None  # int
        self._log_p_y = None  # [n classes]
        self._log_p_x_y = None  # [n classes, vocab size]

    def fit(self, features: np.ndarray, targets: list[str]):
        """Estimate p(x|y) and p(y) based on data

        Args:
            features, [n samples; vocab size]: input features
            targets, [n samples]: targets

        Raises:
            ValueError: if the number of targets differs from the number of
                feature rows
        """
        targets = np.asarray(targets)
        n_samples, vocab_size = features.shape

        if targets.shape[0] != n_samples:
            raise ValueError(
                f"Inconsistent number of samples: {n_samples} feature rows, {targets.shape[0]} targets"
            )

        self._vocab_size = vocab_size

        # Every class returned by np.unique occurs at least once in targets,
        # so all priors are well defined and need no empty-class branch.
        self._classes, class_counts = np.unique(targets, return_counts=True)
        self._log_p_y = np.log(class_counts / n_samples)
        self._log_p_x_y = np.zeros((len(self._classes), vocab_size), dtype=np.float64)

        for i, cls in enumerate(self._classes):
            feature_counts_in_class = np.sum(features[targets == cls], axis=0)
            total_features_in_class = np.sum(feature_counts_in_class)

            # Additive smoothing: (count + alpha) / (total + alpha * |V|).
            numerator = feature_counts_in_class + self.alpha
            denominator = total_features_in_class + self.alpha * vocab_size

            if denominator == 0:
                # Only possible without smoothing and with no counts at all.
                self._log_p_x_y[i, :] = -np.inf
            else:
                # NOTE: with alpha == 0 a feature never seen in this class
                # gets log-probability -inf.
                self._log_p_x_y[i, :] = np.log(numerator / denominator)

    def predict(self, features: np.ndarray) -> np.ndarray:
        """Predict class for each sample

        Args:
            features, [n samples; vocab size]: feature to predict
        Return:
            classes, [n samples]: predicted class
        """
        best_class_indices = np.argmax(self.log_proba(features), axis=1)
        return self._classes[best_class_indices]

    def log_proba(self, features: np.ndarray) -> np.ndarray:
        """Calculate log p(y|x) for each class and each sample

        Args:
            features, [n samples; vocab size]: feature to predict
        Return:
            classes, [n samples; n classes]: log posterior for each class;
            exponentiating a row gives probabilities that sum to 1
        Raises:
            RuntimeError: if the classifier is not fitted or the number of
                feature columns differs from the fitted vocabulary size
        """
        if self._vocab_size is None:
            raise RuntimeError("Fit classifier before predicting something")
        if features.shape[1] != self._vocab_size:
            raise RuntimeError(
                f"Unexpected size of vocabulary, expected {self._vocab_size}, actual {features.shape[1]}"
            )

        # Joint log-likelihood: sum_i count_i * log p(w_i | c) + log p(c).
        log_joint = (features @ self._log_p_x_y.T) + self._log_p_y[np.newaxis, :]
        # Normalise by log p(x) = logsumexp over classes.
        log_p_x = logsumexp(log_joint, axis=1, keepdims=True)
        return log_joint - log_p_x
"y_pred = model.predict(X_test_bow)\n", + "print(classification_report(y_test, y_pred))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Accuracy модели составила 0.98.\n", + "\n", + "Recall для spam достиг 0.95, что выше, чем у моделей на основе LogisticRegression. Значит, что NaiveBayes лучше других идентифицирует спам-сообщения, пропуская меньше из них. При этом precision для spam ниже, чем у LogisticRegression, что указывает на большее количество ложных срабатываний. Метрики для ham остаются высокими.\n", + "\n", + "Скорее всего это происходит из-за того, что NaiveBayes предполагает, что все слова-признаки независимы друг от друга при условии класса. Некоторые слова могут часто встречаться вместе в спам-сообщениях, и их комбинация является сильным индикатором спама. Логистическая регрессия, не делает такого предположения и может лучше улавливать такие зависимости, учась находить гиперплоскость, а комбинации признаков могут влиять на ее положение." + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/src/homeworks/homework8/texts.pdf b/src/homeworks/homework8/texts.pdf new file mode 100644 index 0000000..c96576b Binary files /dev/null and b/src/homeworks/homework8/texts.pdf differ