From 14560c1dfa04c063540aa28c692b7e5fbdd511d6 Mon Sep 17 00:00:00 2001 From: JoseVelazquezR Date: Thu, 24 Mar 2022 01:32:14 -0600 Subject: [PATCH 1/2] lab terminado --- .../challenge-1-checkpoint.ipynb | 1016 +++++++++++++++++ .../challenge-2-checkpoint.ipynb | 197 +++- your-code/challenge-1.ipynb | 816 ++++++++++++- your-code/challenge-2.ipynb | 197 +++- 4 files changed, 2161 insertions(+), 65 deletions(-) create mode 100644 your-code/.ipynb_checkpoints/challenge-1-checkpoint.ipynb diff --git a/your-code/.ipynb_checkpoints/challenge-1-checkpoint.ipynb b/your-code/.ipynb_checkpoints/challenge-1-checkpoint.ipynb new file mode 100644 index 0000000..0859d23 --- /dev/null +++ b/your-code/.ipynb_checkpoints/challenge-1-checkpoint.ipynb @@ -0,0 +1,1016 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# String Operations Lab\n", + "\n", + "**Before your start:**\n", + "\n", + "- Read the README.md file\n", + "- Comment as much as you can and use the resources in the README.md file\n", + "- Happy learning!" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import re" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Challenge 1 - Combining Strings\n", + "\n", + "Combining strings is an important skill to acquire. There are multiple ways of combining strings in Python, as well as combining strings with variables. We will explore this in the first challenge. In the cell below, combine the strings in the list and add spaces between the strings (do not add a space after the last string). Insert a period after the last string." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Durante un tiempo no estuvo segura de si su marido era su marido.'" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "str_list = ['Durante', 'un', 'tiempo', 'no', 'estuvo', 'segura', 'de', 'si', 'su', 'marido', 'era', 'su', 'marido']\n", + "# Your code here:\n", + "\" \".join(str_list) + \".\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the cell below, use the list of strings to create a grocery list. Start the list with the string `Grocery list: ` and include a comma and a space between each item except for the last one. Include a period at the end. Only include foods in the list that start with the letter 'b' and ensure all foods are lower case." + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['bananas',\n", + " 'chocolate',\n", + " 'bread',\n", + " 'diapers',\n", + " 'ice cream',\n", + " 'brownie mix',\n", + " 'broccoli']" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "food_list = ['Bananas', 'Chocolate', 'bread', 'diapers', 'Ice Cream', 'Brownie Mix', 'broccoli']\n", + "# Your code here:\n", + "# convertir lista original a nueva lista solo con minusculas\n", + "grocery_list = [i.lower() for i in food_list]\n", + "grocery_list" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'bananas chocolate bread diapers ice cream brownie mix broccoli'" + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# convertir lista en string único\n", + "a = \" \".join(grocery_list)\n", + "a" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + 
"outputs": [ + { + "data": { + "text/plain": [ + "['bananas', 'bread', 'brownie', 'broccoli']" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# obtener sólo las palabras que empiezan por b\n", + "b = re.findall(r\"b.+?\\b\", a)\n", + "b" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'bananas, bread, brownie, broccoli.'" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# unir palabras obtenidas\n", + "\", \".join(b) + \".\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the cell below, write a function that computes the area of a circle using its radius. Compute the area of the circle and insert the radius and the area between the two strings. Make sure to include spaces between the variable and the strings. \n", + "\n", + "Note: You can use the techniques we have learned so far or use f-strings. F-strings allow us to embed code inside strings. You can read more about f-strings [here](https://www.python.org/dev/peps/pep-0498/)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "78.53981633974483" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import math\n", + "pi = math.pi\n", + "circle_area = pi*(5**2)\n", + "circle_area" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The area of the circle with radius: 5 is: 78.53981633974483\n" + ] + } + ], + "source": [ + "import math\n", + "\n", + "string1 = \"The area of the circle with radius:\"\n", + "string2 = \"is:\"\n", + "radius = 4.5\n", + "\n", + "def area(x, pi = math.pi):\n", + " # This function takes a radius and returns the area of a circle. We also pass a default value for pi.\n", + " # Input: Float (and default value for pi)\n", + " circle_area = pi*(x**2)\n", + " # Output: Float\n", + " return print(f\"{string1} {x} {string2} {circle_area}\")\n", + " \n", + "# Your code here: \n", + "# Sample input: 5.0\n", + "# Sample Output: 78.53981633\n", + "area(5)\n", + "# Your output string here:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Challenge 2 - Splitting Strings\n", + "\n", + "We have first looked at combining strings into one long string. There are times where we need to do the opposite and split the string into smaller components for further analysis. \n", + "\n", + "In the cell below, split the string into a list of strings using the space delimiter. Count the frequency of each word in the string in a dictionary. Strip the periods, line breaks and commas from the text. Make sure to remove empty strings from your dictionary." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "poem = \"\"\"Some say the world will end in fire,\n", + "Some say in ice.\n", + "From what I’ve tasted of desire\n", + "I hold with those who favor fire.\n", + "But if it had to perish twice,\n", + "I think I know enough of hate\n", + "To say that for destruction ice\n", + "Is also great\n", + "And would suffice.\"\"\"\n", + "\n", + "# Your code here:\n" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": { + "collapsed": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Help on module re:\n", + "\n", + "NAME\n", + " re - Support for regular expressions (RE).\n", + "\n", + "MODULE REFERENCE\n", + " https://docs.python.org/3.10/library/re.html\n", + " \n", + " The following documentation is automatically generated from the Python\n", + " source files. It may be incomplete, incorrect or include features that\n", + " are considered implementation detail and may vary between Python\n", + " implementations. When in doubt, consult the module reference at the\n", + " location listed above.\n", + "\n", + "DESCRIPTION\n", + " This module provides regular expression matching operations similar to\n", + " those found in Perl. It supports both 8-bit and Unicode strings; both\n", + " the pattern and the strings being processed can contain null bytes and\n", + " characters outside the US ASCII range.\n", + " \n", + " Regular expressions can contain both special and ordinary characters.\n", + " Most ordinary characters, like \"A\", \"a\", or \"0\", are the simplest\n", + " regular expressions; they simply match themselves. 
You can\n", + " concatenate ordinary characters, so last matches the string 'last'.\n", + " \n", + " The special characters are:\n", + " \".\" Matches any character except a newline.\n", + " \"^\" Matches the start of the string.\n", + " \"$\" Matches the end of the string or just before the newline at\n", + " the end of the string.\n", + " \"*\" Matches 0 or more (greedy) repetitions of the preceding RE.\n", + " Greedy means that it will match as many repetitions as possible.\n", + " \"+\" Matches 1 or more (greedy) repetitions of the preceding RE.\n", + " \"?\" Matches 0 or 1 (greedy) of the preceding RE.\n", + " *?,+?,?? Non-greedy versions of the previous three special characters.\n", + " {m,n} Matches from m to n repetitions of the preceding RE.\n", + " {m,n}? Non-greedy version of the above.\n", + " \"\\\\\" Either escapes special characters or signals a special sequence.\n", + " [] Indicates a set of characters.\n", + " A \"^\" as the first character indicates a complementing set.\n", + " \"|\" A|B, creates an RE that will match either A or B.\n", + " (...) Matches the RE inside the parentheses.\n", + " The contents can be retrieved or matched later in the string.\n", + " (?aiLmsux) The letters set the corresponding flags defined below.\n", + " (?:...) Non-grouping version of regular parentheses.\n", + " (?P...) The substring matched by the group is accessible by name.\n", + " (?P=name) Matches the text matched earlier by the group named name.\n", + " (?#...) A comment; ignored.\n", + " (?=...) Matches if ... matches next, but doesn't consume the string.\n", + " (?!...) Matches if ... doesn't match next.\n", + " (?<=...) Matches if preceded by ... (must be fixed length).\n", + " (? 
str or tuple.\n", + " | Return subgroup(s) of the match by indices or names.\n", + " | For 0 returns the entire match.\n", + " | \n", + " | groupdict(self, /, default=None)\n", + " | Return a dictionary containing all the named subgroups of the match, keyed by the subgroup name.\n", + " | \n", + " | default\n", + " | Is used for groups that did not participate in the match.\n", + " | \n", + " | groups(self, /, default=None)\n", + " | Return a tuple containing all the subgroups of the match, from 1.\n", + " | \n", + " | default\n", + " | Is used for groups that did not participate in the match.\n", + " | \n", + " | span(self, group=0, /)\n", + " | For match object m, return the 2-tuple (m.start(group), m.end(group)).\n", + " | \n", + " | start(self, group=0, /)\n", + " | Return index of the start of the substring matched by group.\n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Class methods defined here:\n", + " | \n", + " | __class_getitem__(...) 
from builtins.type\n", + " | See PEP 585\n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Data descriptors defined here:\n", + " | \n", + " | endpos\n", + " | The index into the string beyond which the RE engine will not go.\n", + " | \n", + " | lastgroup\n", + " | The name of the last matched capturing group.\n", + " | \n", + " | lastindex\n", + " | The integer index of the last matched capturing group.\n", + " | \n", + " | pos\n", + " | The index into the string at which the RE engine started looking for a match.\n", + " | \n", + " | re\n", + " | The regular expression object.\n", + " | \n", + " | regs\n", + " | \n", + " | string\n", + " | The string passed to match() or search().\n", + " \n", + " class Pattern(builtins.object)\n", + " | Compiled regular expression object.\n", + " | \n", + " | Methods defined here:\n", + " | \n", + " | __copy__(self, /)\n", + " | \n", + " | __deepcopy__(self, memo, /)\n", + " | \n", + " | __eq__(self, value, /)\n", + " | Return self==value.\n", + " | \n", + " | __ge__(self, value, /)\n", + " | Return self>=value.\n", + " | \n", + " | __gt__(self, value, /)\n", + " | Return self>value.\n", + " | \n", + " | __hash__(self, /)\n", + " | Return hash(self).\n", + " | \n", + " | __le__(self, value, /)\n", + " | Return self<=value.\n", + " | \n", + " | __lt__(self, value, /)\n", + " | Return self./?@#$%^&*_~'''\n", + "\n", + "for string in corpus:\n", + " if string.endswith(\".\") == True:\n", + " string = string.replace(\".\", \"\")\n", + " corpus += [string.lower()]\n", + " \n", + "corpus = corpus[3:6]\n", + "corpus" ] }, { @@ -152,7 +197,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -172,11 +217,105 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['ironhack is cool', 'i love ironhack', 
'i am a student at ironhack']" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Write your code here" + "corpus" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[['ironhack', 'is', 'cool'],\n", + " ['i', 'love', 'ironhack'],\n", + " ['i', 'am', 'a', 'student', 'at', 'ironhack']]" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import re\n", + "corpus = [re.split(\"[,.;:\\s\\n\\r]\", string) for string in corpus]\n", + "corpus" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['ironhack',\n", + " 'is',\n", + " 'cool',\n", + " 'i',\n", + " 'love',\n", + " 'ironhack',\n", + " 'i',\n", + " 'am',\n", + " 'a',\n", + " 'student',\n", + " 'at',\n", + " 'ironhack']" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bag_of_words = [string for lista in corpus for string in lista]\n", + "bag_of_words" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "ename": "TypeError", + "evalue": "argument of type 'NoneType' is not iterable", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)", + "Input \u001b[1;32mIn [53]\u001b[0m, in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m lista \u001b[38;5;129;01min\u001b[39;00m corpus:\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m string \u001b[38;5;129;01min\u001b[39;00m lista:\n\u001b[1;32m----> 4\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[43mstring\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mbag_of_words\u001b[49m:\n\u001b[0;32m 5\u001b[0m bag_of_words \u001b[38;5;241m=\u001b[39m bag_of_words\u001b[38;5;241m.\u001b[39mappend(word)\n\u001b[0;32m 7\u001b[0m bag_of_words\n", + "\u001b[1;31mTypeError\u001b[0m: argument of type 'NoneType' is not iterable" + ] + } + ], + "source": [ + "# Write your code here\n", + "for lista in corpus:\n", + " for string in lista:\n", + " if string in bag_of_words:\n", + " bag_of_words = bag_of_words.append(word)\n", + "\n", + "bag_of_words" ] }, { @@ -192,9 +331,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[]\n" + ] + } + ], "source": [ "print(bag_of_words)" ] @@ -208,7 +355,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -228,9 +375,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[]\n" + ] + } + ], "source": [ "print(term_freq)" ] @@ -268,7 +423,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -309,7 +464,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -323,7 +478,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.10.3" } }, "nbformat": 4, diff --git a/your-code/challenge-1.ipynb b/your-code/challenge-1.ipynb index 4302084..0859d23 100644 --- a/your-code/challenge-1.ipynb +++ b/your-code/challenge-1.ipynb @@ -15,7 +15,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, 
"outputs": [], "source": [ @@ -33,12 +33,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "'Durante un tiempo no estuvo segura de si su marido era su marido.'" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "str_list = ['Durante', 'un', 'tiempo', 'no', 'estuvo', 'segura', 'de', 'si', 'su', 'marido', 'era', 'su', 'marido']\n", - "# Your code here:\n" + "# Your code here:\n", + "\" \".join(str_list) + \".\"" ] }, { @@ -50,12 +62,97 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['bananas',\n", + " 'chocolate',\n", + " 'bread',\n", + " 'diapers',\n", + " 'ice cream',\n", + " 'brownie mix',\n", + " 'broccoli']" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "food_list = ['Bananas', 'Chocolate', 'bread', 'diapers', 'Ice Cream', 'Brownie Mix', 'broccoli']\n", - "# Your code here:\n" + "# Your code here:\n", + "# convertir lista original a nueva lista solo con minusculas\n", + "grocery_list = [i.lower() for i in food_list]\n", + "grocery_list" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'bananas chocolate bread diapers ice cream brownie mix broccoli'" + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# convertir lista en string único\n", + "a = \" \".join(grocery_list)\n", + "a" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['bananas', 'bread', 'brownie', 'broccoli']" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# obtener sólo 
las palabras que empiezan por b\n", + "b = re.findall(r\"b.+?\\b\", a)\n", + "b" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'bananas, bread, brownie, broccoli.'" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# unir palabras obtenidas\n", + "\", \".join(b) + \".\"" ] }, { @@ -69,9 +166,40 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 35, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "78.53981633974483" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import math\n", + "pi = math.pi\n", + "circle_area = pi*(5**2)\n", + "circle_area" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The area of the circle with radius: 5 is: 78.53981633974483\n" + ] + } + ], "source": [ "import math\n", "\n", @@ -82,14 +210,14 @@ "def area(x, pi = math.pi):\n", " # This function takes a radius and returns the area of a circle. 
We also pass a default value for pi.\n", " # Input: Float (and default value for pi)\n", + " circle_area = pi*(x**2)\n", " # Output: Float\n", - " \n", - " # Sample input: 5.0\n", - " # Sample Output: 78.53981633\n", - " \n", - " # Your code here:\n", - " \n", - " \n", + " return print(f\"{string1} {x} {string2} {circle_area}\")\n", + " \n", + "# Your code here: \n", + "# Sample input: 5.0\n", + "# Sample Output: 78.53981633\n", + "area(5)\n", "# Your output string here:" ] }, @@ -106,7 +234,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 41, "metadata": {}, "outputs": [], "source": [ @@ -123,6 +251,528 @@ "# Your code here:\n" ] }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": { + "collapsed": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Help on module re:\n", + "\n", + "NAME\n", + " re - Support for regular expressions (RE).\n", + "\n", + "MODULE REFERENCE\n", + " https://docs.python.org/3.10/library/re.html\n", + " \n", + " The following documentation is automatically generated from the Python\n", + " source files. It may be incomplete, incorrect or include features that\n", + " are considered implementation detail and may vary between Python\n", + " implementations. When in doubt, consult the module reference at the\n", + " location listed above.\n", + "\n", + "DESCRIPTION\n", + " This module provides regular expression matching operations similar to\n", + " those found in Perl. It supports both 8-bit and Unicode strings; both\n", + " the pattern and the strings being processed can contain null bytes and\n", + " characters outside the US ASCII range.\n", + " \n", + " Regular expressions can contain both special and ordinary characters.\n", + " Most ordinary characters, like \"A\", \"a\", or \"0\", are the simplest\n", + " regular expressions; they simply match themselves. 
You can\n", + " concatenate ordinary characters, so last matches the string 'last'.\n", + " \n", + " The special characters are:\n", + " \".\" Matches any character except a newline.\n", + " \"^\" Matches the start of the string.\n", + " \"$\" Matches the end of the string or just before the newline at\n", + " the end of the string.\n", + " \"*\" Matches 0 or more (greedy) repetitions of the preceding RE.\n", + " Greedy means that it will match as many repetitions as possible.\n", + " \"+\" Matches 1 or more (greedy) repetitions of the preceding RE.\n", + " \"?\" Matches 0 or 1 (greedy) of the preceding RE.\n", + " *?,+?,?? Non-greedy versions of the previous three special characters.\n", + " {m,n} Matches from m to n repetitions of the preceding RE.\n", + " {m,n}? Non-greedy version of the above.\n", + " \"\\\\\" Either escapes special characters or signals a special sequence.\n", + " [] Indicates a set of characters.\n", + " A \"^\" as the first character indicates a complementing set.\n", + " \"|\" A|B, creates an RE that will match either A or B.\n", + " (...) Matches the RE inside the parentheses.\n", + " The contents can be retrieved or matched later in the string.\n", + " (?aiLmsux) The letters set the corresponding flags defined below.\n", + " (?:...) Non-grouping version of regular parentheses.\n", + " (?P...) The substring matched by the group is accessible by name.\n", + " (?P=name) Matches the text matched earlier by the group named name.\n", + " (?#...) A comment; ignored.\n", + " (?=...) Matches if ... matches next, but doesn't consume the string.\n", + " (?!...) Matches if ... doesn't match next.\n", + " (?<=...) Matches if preceded by ... (must be fixed length).\n", + " (? 
str or tuple.\n", + " | Return subgroup(s) of the match by indices or names.\n", + " | For 0 returns the entire match.\n", + " | \n", + " | groupdict(self, /, default=None)\n", + " | Return a dictionary containing all the named subgroups of the match, keyed by the subgroup name.\n", + " | \n", + " | default\n", + " | Is used for groups that did not participate in the match.\n", + " | \n", + " | groups(self, /, default=None)\n", + " | Return a tuple containing all the subgroups of the match, from 1.\n", + " | \n", + " | default\n", + " | Is used for groups that did not participate in the match.\n", + " | \n", + " | span(self, group=0, /)\n", + " | For match object m, return the 2-tuple (m.start(group), m.end(group)).\n", + " | \n", + " | start(self, group=0, /)\n", + " | Return index of the start of the substring matched by group.\n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Class methods defined here:\n", + " | \n", + " | __class_getitem__(...) 
from builtins.type\n", + " | See PEP 585\n", + " | \n", + " | ----------------------------------------------------------------------\n", + " | Data descriptors defined here:\n", + " | \n", + " | endpos\n", + " | The index into the string beyond which the RE engine will not go.\n", + " | \n", + " | lastgroup\n", + " | The name of the last matched capturing group.\n", + " | \n", + " | lastindex\n", + " | The integer index of the last matched capturing group.\n", + " | \n", + " | pos\n", + " | The index into the string at which the RE engine started looking for a match.\n", + " | \n", + " | re\n", + " | The regular expression object.\n", + " | \n", + " | regs\n", + " | \n", + " | string\n", + " | The string passed to match() or search().\n", + " \n", + " class Pattern(builtins.object)\n", + " | Compiled regular expression object.\n", + " | \n", + " | Methods defined here:\n", + " | \n", + " | __copy__(self, /)\n", + " | \n", + " | __deepcopy__(self, memo, /)\n", + " | \n", + " | __eq__(self, value, /)\n", + " | Return self==value.\n", + " | \n", + " | __ge__(self, value, /)\n", + " | Return self>=value.\n", + " | \n", + " | __gt__(self, value, /)\n", + " | Return self>value.\n", + " | \n", + " | __hash__(self, /)\n", + " | Return hash(self).\n", + " | \n", + " | __le__(self, value, /)\n", + " | Return self<=value.\n", + " | \n", + " | __lt__(self, value, /)\n", + " | Return self./?@#$%^&*_~'''\n", + "\n", + "for string in corpus:\n", + " if string.endswith(\".\") == True:\n", + " string = string.replace(\".\", \"\")\n", + " corpus += [string.lower()]\n", + " \n", + "corpus = corpus[3:6]\n", + "corpus" ] }, { @@ -152,7 +197,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -172,11 +217,105 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['ironhack is cool', 'i love ironhack', 
'i am a student at ironhack']" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Write your code here" + "corpus" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[['ironhack', 'is', 'cool'],\n", + " ['i', 'love', 'ironhack'],\n", + " ['i', 'am', 'a', 'student', 'at', 'ironhack']]" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import re\n", + "corpus = [re.split(\"[,.;:\\s\\n\\r]\", string) for string in corpus]\n", + "corpus" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['ironhack',\n", + " 'is',\n", + " 'cool',\n", + " 'i',\n", + " 'love',\n", + " 'ironhack',\n", + " 'i',\n", + " 'am',\n", + " 'a',\n", + " 'student',\n", + " 'at',\n", + " 'ironhack']" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bag_of_words = [string for lista in corpus for string in lista]\n", + "bag_of_words" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "ename": "TypeError", + "evalue": "argument of type 'NoneType' is not iterable", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)", + "Input \u001b[1;32mIn [53]\u001b[0m, in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m lista \u001b[38;5;129;01min\u001b[39;00m corpus:\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m string \u001b[38;5;129;01min\u001b[39;00m lista:\n\u001b[1;32m----> 4\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[43mstring\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mbag_of_words\u001b[49m:\n\u001b[0;32m 5\u001b[0m bag_of_words \u001b[38;5;241m=\u001b[39m bag_of_words\u001b[38;5;241m.\u001b[39mappend(word)\n\u001b[0;32m 7\u001b[0m bag_of_words\n", + "\u001b[1;31mTypeError\u001b[0m: argument of type 'NoneType' is not iterable" + ] + } + ], + "source": [ + "# Write your code here\n", + "for lista in corpus:\n", + " for string in lista:\n", + " if string in bag_of_words:\n", + " bag_of_words = bag_of_words.append(word)\n", + "\n", + "bag_of_words" ] }, { @@ -192,9 +331,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[]\n" + ] + } + ], "source": [ "print(bag_of_words)" ] @@ -208,7 +355,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -228,9 +375,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[]\n" + ] + } + ], "source": [ "print(term_freq)" ] @@ -268,7 +423,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -309,7 +464,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -323,7 +478,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.10.3" } }, "nbformat": 4, From ae6620e2fa5971de588ba52d07ca4416ab5d19e7 Mon Sep 17 00:00:00 2001 From: JoseVelazquezR Date: Mon, 28 Mar 2022 23:01:06 -0600 Subject: [PATCH 2/2] Laboratorio corregido --- .../challenge-2-checkpoint.ipynb | 296 +++++++++++------- your-code/challenge-1.ipynb | 48 +-- 
your-code/challenge-2.ipynb | 296 +++++++++++------- 3 files changed, 402 insertions(+), 238 deletions(-) diff --git a/your-code/.ipynb_checkpoints/challenge-2-checkpoint.ipynb b/your-code/.ipynb_checkpoints/challenge-2-checkpoint.ipynb index cedc6fa..b8cb010 100644 --- a/your-code/.ipynb_checkpoints/challenge-2-checkpoint.ipynb +++ b/your-code/.ipynb_checkpoints/challenge-2-checkpoint.ipynb @@ -72,7 +72,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -88,7 +88,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -97,7 +97,7 @@ "['Ironhack is cool.', 'I love Ironhack.', 'I am a student at Ironhack.']" ] }, - "execution_count": 2, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -109,6 +109,7 @@ "for i in range(1,4):\n", " with open(f'doc{i}.txt', 'r') as file:\n", " data = file.read().rstrip() # valid only if text in file is a single line\n", + " #rstrip() reads from the RIGHT\n", " corpus += [data]\n", "\n", "corpus" @@ -123,7 +124,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -161,7 +162,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -170,7 +171,7 @@ "['ironhack is cool', 'i love ironhack', 'i am a student at ironhack']" ] }, - "execution_count": 4, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -197,7 +198,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -217,103 +218,47 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['ironhack is cool', 'i love ironhack', 'i am a student at ironhack']" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" 
- } - ], + "outputs": [], "source": [ - "corpus" + "# import re\n", + "# corpus = [re.split(\"[,.;:\\s\\n\\r]\", string) for string in corpus]\n", + "# corpus" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 8, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "[['ironhack', 'is', 'cool'],\n", - " ['i', 'love', 'ironhack'],\n", - " ['i', 'am', 'a', 'student', 'at', 'ironhack']]" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import re\n", - "corpus = [re.split(\"[,.;:\\s\\n\\r]\", string) for string in corpus]\n", - "corpus" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "metadata": {}, - "outputs": [ + "name": "stdout", + "output_type": "stream", + "text": [ + "['ironhack', 'is', 'cool']\n", + "['i', 'love', 'ironhack']\n", + "['i', 'am', 'a', 'student', 'at', 'ironhack']\n" + ] + }, { "data": { "text/plain": [ - "['ironhack',\n", - " 'is',\n", - " 'cool',\n", - " 'i',\n", - " 'love',\n", - " 'ironhack',\n", - " 'i',\n", - " 'am',\n", - " 'a',\n", - " 'student',\n", - " 'at',\n", - " 'ironhack']" + "['ironhack', 'is', 'cool', 'i', 'love', 'am', 'a', 'student', 'at']" ] }, - "execution_count": 51, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "bag_of_words = [string for lista in corpus for string in lista]\n", - "bag_of_words" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "metadata": {}, - "outputs": [ - { - "ename": "TypeError", - "evalue": "argument of type 'NoneType' is not iterable", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)", - "Input \u001b[1;32mIn [53]\u001b[0m, in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m lista \u001b[38;5;129;01min\u001b[39;00m 
corpus:\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m string \u001b[38;5;129;01min\u001b[39;00m lista:\n\u001b[1;32m----> 4\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[43mstring\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mbag_of_words\u001b[49m:\n\u001b[0;32m 5\u001b[0m bag_of_words \u001b[38;5;241m=\u001b[39m bag_of_words\u001b[38;5;241m.\u001b[39mappend(word)\n\u001b[0;32m 7\u001b[0m bag_of_words\n", - "\u001b[1;31mTypeError\u001b[0m: argument of type 'NoneType' is not iterable" - ] - } - ], - "source": [ - "# Write your code here\n", - "for lista in corpus:\n", - " for string in lista:\n", - " if string in bag_of_words:\n", - " bag_of_words = bag_of_words.append(word)\n", + "for doc in corpus:\n", + " words = doc.split()\n", + " print(words)\n", + " for word in words:\n", + " if word not in bag_of_words:\n", + " bag_of_words.append(word)\n", "\n", "bag_of_words" ] @@ -329,23 +274,6 @@ "If not, fix your code in the previous cell." 
] }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[]\n" - ] - } - ], - "source": [ - "print(bag_of_words)" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -357,11 +285,31 @@ "cell_type": "code", "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1, 1, 1, 0, 0, 0, 0, 0, 0]\n", + "[1, 0, 0, 1, 1, 0, 0, 0, 0]\n", + "[1, 0, 0, 1, 0, 1, 1, 1, 1]\n", + "[[1, 1, 1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 1, 1, 0, 0, 0, 0], [1, 0, 0, 1, 0, 1, 1, 1, 1]]\n" + ] + } + ], "source": [ "term_freq = []\n", "\n", - "# Write your code here" + "# Write your code here\n", + "\n", + "for i in range(len(corpus)):\n", + " temp_list =[]\n", + " for word in bag_of_words:\n", + " conteo = corpus[i].split().count(word)\n", + " temp_list.append(conteo)\n", + " print(temp_list)\n", + " term_freq.append(temp_list)\n", + "print(term_freq)" ] }, { @@ -382,7 +330,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "[]\n" + "[[1, 1, 1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 1, 1, 0, 0, 0, 0], [1, 0, 0, 1, 0, 1, 1, 1, 1]]\n" ] } ], @@ -425,13 +373,70 @@ "cell_type": "code", "execution_count": 11, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['all', 'six', 'less', 'being', 'indeed', 'over', 'move', 'anyway', 'fifty', 'four', 'not', 'own', 'through', 'yourselves', 'go', 'where', 'mill', 'only', 'find', 'before', 'one', 'whose', 'system', 'how', 'somewhere', 'with', 'thick', 'show', 'had', 'enough', 'should', 'to', 'must', 'whom', 'seeming', 'under', 'ours', 'has', 'might', 'thereafter', 'latterly', 'do', 'them', 'his', 'around', 'than', 'get', 'very', 'de', 'none', 'cannot', 'every', 'whether', 'they', 'front', 'during', 'thus', 'now', 'him', 'nor', 'name', 'several', 'hereafter', 'always', 'who', 'cry', 'whither', 'this', 'someone', 'either', 'each', 
'become', 'thereupon', 'sometime', 'side', 'two', 'therein', 'twelve', 'because', 'often', 'ten', 'our', 'eg', 'some', 'back', 'up', 'namely', 'towards', 'are', 'further', 'beyond', 'ourselves', 'yet', 'out', 'even', 'will', 'what', 'still', 'for', 'bottom', 'mine', 'since', 'please', 'forty', 'per', 'its', 'everything', 'behind', 'un', 'above', 'between', 'it', 'neither', 'seemed', 'ever', 'across', 'she', 'somehow', 'be', 'we', 'full', 'never', 'sixty', 'however', 'here', 'otherwise', 'were', 'whereupon', 'nowhere', 'although', 'found', 'alone', 're', 'along', 'fifteen', 'by', 'both', 'about', 'last', 'would', 'anything', 'via', 'many', 'could', 'thence', 'put', 'against', 'keep', 'etc', 'amount', 'became', 'ltd', 'hence', 'onto', 'or', 'con', 'among', 'already', 'co', 'afterwards', 'formerly', 'within', 'seems', 'into', 'others', 'while', 'whatever', 'except', 'down', 'hers', 'everyone', 'done', 'least', 'another', 'whoever', 'moreover', 'couldnt', 'throughout', 'anyhow', 'yourself', 'three', 'from', 'her', 'few', 'together', 'top', 'there', 'due', 'been', 'next', 'anyone', 'eleven', 'much', 'call', 'therefore', 'interest', 'then', 'thru', 'themselves', 'hundred', 'was', 'sincere', 'empty', 'more', 'himself', 'elsewhere', 'mostly', 'on', 'fire', 'am', 'becoming', 'hereby', 'amongst', 'else', 'part', 'everywhere', 'too', 'herself', 'former', 'those', 'he', 'me', 'myself', 'made', 'twenty', 'these', 'bill', 'cant', 'us', 'until', 'besides', 'nevertheless', 'below', 'anywhere', 'nine', 'can', 'of', 'your', 'toward', 'my', 'something', 'and', 'whereafter', 'whenever', 'give', 'almost', 'wherever', 'is', 'describe', 'beforehand', 'herein', 'an', 'as', 'itself', 'at', 'have', 'in', 'seem', 'whence', 'ie', 'any', 'fill', 'again', 'hasnt', 'inc', 'thereby', 'thin', 'no', 'perhaps', 'latter', 'meanwhile', 'when', 'detail', 'same', 'wherein', 'beside', 'also', 'that', 'other', 'take', 'which', 'becomes', 'you', 'if', 'nobody', 'see', 'though', 'may', 'after', 'upon', 
'most', 'hereupon', 'eight', 'but', 'serious', 'nothing', 'such', 'why', 'a', 'off', 'whereby', 'third', 'i', 'whole', 'noone', 'sometimes', 'well', 'amoungst', 'yours', 'their', 'rather', 'without', 'so', 'five', 'the', 'first', 'whereas', 'once']\n" + ] + } + ], "source": [ "stop_words = ['all', 'six', 'less', 'being', 'indeed', 'over', 'move', 'anyway', 'fifty', 'four', 'not', 'own', 'through', 'yourselves', 'go', 'where', 'mill', 'only', 'find', 'before', 'one', 'whose', 'system', 'how', 'somewhere', 'with', 'thick', 'show', 'had', 'enough', 'should', 'to', 'must', 'whom', 'seeming', 'under', 'ours', 'has', 'might', 'thereafter', 'latterly', 'do', 'them', 'his', 'around', 'than', 'get', 'very', 'de', 'none', 'cannot', 'every', 'whether', 'they', 'front', 'during', 'thus', 'now', 'him', 'nor', 'name', 'several', 'hereafter', 'always', 'who', 'cry', 'whither', 'this', 'someone', 'either', 'each', 'become', 'thereupon', 'sometime', 'side', 'two', 'therein', 'twelve', 'because', 'often', 'ten', 'our', 'eg', 'some', 'back', 'up', 'namely', 'towards', 'are', 'further', 'beyond', 'ourselves', 'yet', 'out', 'even', 'will', 'what', 'still', 'for', 'bottom', 'mine', 'since', 'please', 'forty', 'per', 'its', 'everything', 'behind', 'un', 'above', 'between', 'it', 'neither', 'seemed', 'ever', 'across', 'she', 'somehow', 'be', 'we', 'full', 'never', 'sixty', 'however', 'here', 'otherwise', 'were', 'whereupon', 'nowhere', 'although', 'found', 'alone', 're', 'along', 'fifteen', 'by', 'both', 'about', 'last', 'would', 'anything', 'via', 'many', 'could', 'thence', 'put', 'against', 'keep', 'etc', 'amount', 'became', 'ltd', 'hence', 'onto', 'or', 'con', 'among', 'already', 'co', 'afterwards', 'formerly', 'within', 'seems', 'into', 'others', 'while', 'whatever', 'except', 'down', 'hers', 'everyone', 'done', 'least', 'another', 'whoever', 'moreover', 'couldnt', 'throughout', 'anyhow', 'yourself', 'three', 'from', 'her', 'few', 'together', 'top', 'there', 'due', 'been', 'next', 
'anyone', 'eleven', 'much', 'call', 'therefore', 'interest', 'then', 'thru', 'themselves', 'hundred', 'was', 'sincere', 'empty', 'more', 'himself', 'elsewhere', 'mostly', 'on', 'fire', 'am', 'becoming', 'hereby', 'amongst', 'else', 'part', 'everywhere', 'too', 'herself', 'former', 'those', 'he', 'me', 'myself', 'made', 'twenty', 'these', 'bill', 'cant', 'us', 'until', 'besides', 'nevertheless', 'below', 'anywhere', 'nine', 'can', 'of', 'your', 'toward', 'my', 'something', 'and', 'whereafter', 'whenever', 'give', 'almost', 'wherever', 'is', 'describe', 'beforehand', 'herein', 'an', 'as', 'itself', 'at', 'have', 'in', 'seem', 'whence', 'ie', 'any', 'fill', 'again', 'hasnt', 'inc', 'thereby', 'thin', 'no', 'perhaps', 'latter', 'meanwhile', 'when', 'detail', 'same', 'wherein', 'beside', 'also', 'that', 'other', 'take', 'which', 'becomes', 'you', 'if', 'nobody', 'see', 'though', 'may', 'after', 'upon', 'most', 'hereupon', 'eight', 'but', 'serious', 'nothing', 'such', 'why', 'a', 'off', 'whereby', 'third', 'i', 'whole', 'noone', 'sometimes', 'well', 'amoungst', 'yours', 'their', 'rather', 'without', 'so', 'five', 'the', 'first', 'whereas', 'once']\n", - "\n", + "print(stop_words)\n", "# Write your code below\n" ] }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['ironhack', 'cool', 'love', 'student']" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bag_of_words = [word for word in bag_of_words if word not in stop_words]\n", + "bag_of_words" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1, 1, 0, 0]\n", + "[1, 0, 1, 0]\n", + "[1, 0, 0, 1]\n", + "[[1, 1, 0, 0], [1, 0, 1, 0], [1, 0, 0, 1]]\n" + ] + } + ], + "source": [ + "term_freq = []\n", + "for i in range(len(corpus)):\n", + " temp_list =[]\n", + " for word in 
bag_of_words:\n", + " conteo = corpus[i].split().count(word)\n", + " temp_list.append(conteo)\n", + " print(temp_list)\n", + " term_freq.append(temp_list)\n", + "print(term_freq)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -460,6 +465,83 @@ " [1 1 0 1 0 0 1]]\n", " ```" ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['ironhack is cool', 'i love ironhack', 'i am a student at ironhack']\n" + ] + } + ], + "source": [ + "print(corpus)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.feature_extraction.text import CountVectorizer" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[0 0 1 1 1 0 0]\n", + " [0 0 0 1 0 1 0]\n", + " [1 1 0 1 0 0 1]]\n", + "{'ironhack': 3, 'is': 4, 'cool': 2, 'love': 5, 'am': 0, 'student': 6, 'at': 1}\n" + ] + } + ], + "source": [ + "vector = CountVectorizer()\n", + "print(vector.fit_transform(corpus).todense())\n", + "print(vector.vocabulary_)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[1 1 0 0]\n", + " [0 1 1 0]\n", + " [0 1 0 1]]\n", + "{'ironhack': 1, 'cool': 0, 'love': 2, 'student': 3}\n" + ] + } + ], + "source": [ + "vector = CountVectorizer(stop_words = \"english\")\n", + "print(vector.fit_transform(corpus).todense())\n", + "print(vector.vocabulary_)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/your-code/challenge-1.ipynb b/your-code/challenge-1.ipynb index 0859d23..3da4c22 100644 --- a/your-code/challenge-1.ipynb +++ b/your-code/challenge-1.ipynb @@ -15,7 +15,7 @@ }, { "cell_type": "code", - "execution_count": 1, 
+ "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -33,7 +33,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -42,7 +42,7 @@ "'Durante un tiempo no estuvo segura de si su marido era su marido.'" ] }, - "execution_count": 3, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -62,7 +62,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -77,7 +77,7 @@ " 'broccoli']" ] }, - "execution_count": 24, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -92,7 +92,7 @@ }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -101,7 +101,7 @@ "'bananas chocolate bread diapers ice cream brownie mix broccoli'" ] }, - "execution_count": 77, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -114,7 +114,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -123,7 +123,7 @@ "['bananas', 'bread', 'brownie', 'broccoli']" ] }, - "execution_count": 31, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -136,7 +136,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -145,7 +145,7 @@ "'bananas, bread, brownie, broccoli.'" ] }, - "execution_count": 33, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -166,7 +166,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -175,7 +175,7 @@ "78.53981633974483" ] }, - "execution_count": 35, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -234,7 +234,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -725,14 +725,14 @@ }, { "cell_type": 
"code", - "execution_count": 75, + "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "['Some', 'say', 'the', 'world', 'will', 'end', 'in', 'fire', 'Some', 'say', 'in', 'ice', 'From', 'what', 'I’ve', 'tasted', 'of', 'desire', 'I', 'hold', 'with', 'those', 'who', 'favor', 'fire', 'But', 'if', 'it', 'had', 'to', 'perish', 'twice', 'I', 'think', 'I', 'know', 'enough', 'of', 'hate', 'To', 'say', 'that', 'for', 'destruction', 'ice', 'Is', 'also', 'great', 'And', 'would', 'suffice']\n" + "['some', 'say', 'the', 'world', 'will', 'end', 'in', 'fire', 'some', 'say', 'in', 'ice', 'from', 'what', 'i’ve', 'tasted', 'of', 'desire', 'i', 'hold', 'with', 'those', 'who', 'favor', 'fire', 'but', 'if', 'it', 'had', 'to', 'perish', 'twice', 'i', 'think', 'i', 'know', 'enough', 'of', 'hate', 'to', 'say', 'that', 'for', 'destruction', 'ice', 'is', 'also', 'great', 'and', 'would', 'suffice']\n" ] } ], @@ -740,7 +740,7 @@ "# split the string into a list of strings using the space delimiter\n", "string_list = re.split(\"[,.\\s\\n\\r]\", poem)\n", "\n", - "string_list = [word for word in string_list if word != '']\n", + "string_list = [word.lower() for word in string_list if word != '']\n", "print(string_list)" ] }, @@ -757,14 +757,14 @@ }, { "cell_type": "code", - "execution_count": 79, + "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "{'Some': 2, 'say': 3, 'the': 1, 'world': 1, 'will': 1, 'end': 1, 'in': 2, 'fire': 2, 'ice': 2, 'From': 1, 'what': 1, 'I’ve': 1, 'tasted': 1, 'of': 2, 'desire': 1, 'I': 3, 'hold': 1, 'with': 1, 'those': 1, 'who': 1, 'favor': 1, 'But': 1, 'if': 1, 'it': 1, 'had': 1, 'to': 1, 'perish': 1, 'twice': 1, 'think': 1, 'know': 1, 'enough': 1, 'hate': 1, 'To': 1, 'that': 1, 'for': 1, 'destruction': 1, 'Is': 1, 'also': 1, 'great': 1, 'And': 1, 'would': 1, 'suffice': 1}\n" + "{'some': 2, 'say': 3, 'the': 1, 'world': 1, 'will': 1, 'end': 1, 'in': 2, 
'fire': 2, 'ice': 2, 'from': 1, 'what': 1, 'i’ve': 1, 'tasted': 1, 'of': 2, 'desire': 1, 'i': 3, 'hold': 1, 'with': 1, 'those': 1, 'who': 1, 'favor': 1, 'but': 1, 'if': 1, 'it': 1, 'had': 1, 'to': 2, 'perish': 1, 'twice': 1, 'think': 1, 'know': 1, 'enough': 1, 'hate': 1, 'that': 1, 'for': 1, 'destruction': 1, 'is': 1, 'also': 1, 'great': 1, 'and': 1, 'would': 1, 'suffice': 1}\n" ] } ], @@ -782,7 +782,7 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -813,7 +813,7 @@ }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -822,7 +822,7 @@ "'i was angry with my friend; \\ni told my wrath, my wrath did end.\\ni was angry with my foe: \\ni told it not, my wrath did grow. \\n\\nand i waterd it in fears,\\nnight & morning with my tears: \\nand i sunned it with smiles,\\nand with soft deceitful wiles. \\n\\nand it grew both day and night. \\ntill it bore an apple bright. \\nand my foe beheld it shine,\\nand he knew that it was mine. 
\\n\\nand into my garden stole, \\nwhen the night had veild the pole; \\nin the morning glad i see; \\nmy foe outstretched beneath the tree.'" ] }, - "execution_count": 81, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -880,7 +880,7 @@ }, { "cell_type": "code", - "execution_count": 86, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -892,7 +892,7 @@ }, { "cell_type": "code", - "execution_count": 96, + "execution_count": 18, "metadata": {}, "outputs": [ { diff --git a/your-code/challenge-2.ipynb b/your-code/challenge-2.ipynb index cedc6fa..b8cb010 100644 --- a/your-code/challenge-2.ipynb +++ b/your-code/challenge-2.ipynb @@ -72,7 +72,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -88,7 +88,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -97,7 +97,7 @@ "['Ironhack is cool.', 'I love Ironhack.', 'I am a student at Ironhack.']" ] }, - "execution_count": 2, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -109,6 +109,7 @@ "for i in range(1,4):\n", " with open(f'doc{i}.txt', 'r') as file:\n", " data = file.read().rstrip() # valid only if text in file is a single line\n", + " # rstrip() removes trailing whitespace (from the RIGHT end of the string)\n", " corpus += [data]\n", "\n", "corpus" @@ -123,7 +124,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -161,7 +162,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -170,7 +171,7 @@ "['ironhack is cool', 'i love ironhack', 'i am a student at ironhack']" ] }, - "execution_count": 4, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -197,7 +198,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -217,103 +218,47 @@ }, { 
"cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['ironhack is cool', 'i love ironhack', 'i am a student at ironhack']" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "corpus" + "# import re\n", + "# corpus = [re.split(\"[,.;:\\s\\n\\r]\", string) for string in corpus]\n", + "# corpus" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 8, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "[['ironhack', 'is', 'cool'],\n", - " ['i', 'love', 'ironhack'],\n", - " ['i', 'am', 'a', 'student', 'at', 'ironhack']]" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import re\n", - "corpus = [re.split(\"[,.;:\\s\\n\\r]\", string) for string in corpus]\n", - "corpus" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "metadata": {}, - "outputs": [ + "name": "stdout", + "output_type": "stream", + "text": [ + "['ironhack', 'is', 'cool']\n", + "['i', 'love', 'ironhack']\n", + "['i', 'am', 'a', 'student', 'at', 'ironhack']\n" + ] + }, { "data": { "text/plain": [ - "['ironhack',\n", - " 'is',\n", - " 'cool',\n", - " 'i',\n", - " 'love',\n", - " 'ironhack',\n", - " 'i',\n", - " 'am',\n", - " 'a',\n", - " 'student',\n", - " 'at',\n", - " 'ironhack']" + "['ironhack', 'is', 'cool', 'i', 'love', 'am', 'a', 'student', 'at']" ] }, - "execution_count": 51, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "bag_of_words = [string for lista in corpus for string in lista]\n", - "bag_of_words" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "metadata": {}, - "outputs": [ - { - "ename": "TypeError", - "evalue": "argument of type 'NoneType' is not iterable", - "output_type": "error", - "traceback": [ - 
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)", - "Input \u001b[1;32mIn [53]\u001b[0m, in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m lista \u001b[38;5;129;01min\u001b[39;00m corpus:\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m string \u001b[38;5;129;01min\u001b[39;00m lista:\n\u001b[1;32m----> 4\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[43mstring\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mbag_of_words\u001b[49m:\n\u001b[0;32m 5\u001b[0m bag_of_words \u001b[38;5;241m=\u001b[39m bag_of_words\u001b[38;5;241m.\u001b[39mappend(word)\n\u001b[0;32m 7\u001b[0m bag_of_words\n", - "\u001b[1;31mTypeError\u001b[0m: argument of type 'NoneType' is not iterable" - ] - } - ], - "source": [ - "# Write your code here\n", - "for lista in corpus:\n", - " for string in lista:\n", - " if string in bag_of_words:\n", - " bag_of_words = bag_of_words.append(word)\n", + "for doc in corpus:\n", + " words = doc.split()\n", + " print(words)\n", + " for word in words:\n", + " if word not in bag_of_words:\n", + " bag_of_words.append(word)\n", "\n", "bag_of_words" ] @@ -329,23 +274,6 @@ "If not, fix your code in the previous cell." 
] }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[]\n" - ] - } - ], - "source": [ - "print(bag_of_words)" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -357,11 +285,31 @@ "cell_type": "code", "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1, 1, 1, 0, 0, 0, 0, 0, 0]\n", + "[1, 0, 0, 1, 1, 0, 0, 0, 0]\n", + "[1, 0, 0, 1, 0, 1, 1, 1, 1]\n", + "[[1, 1, 1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 1, 1, 0, 0, 0, 0], [1, 0, 0, 1, 0, 1, 1, 1, 1]]\n" + ] + } + ], "source": [ "term_freq = []\n", "\n", - "# Write your code here" + "# Write your code here\n", + "\n", + "for i in range(len(corpus)):\n", + " temp_list =[]\n", + " for word in bag_of_words:\n", + " conteo = corpus[i].split().count(word)\n", + " temp_list.append(conteo)\n", + " print(temp_list)\n", + " term_freq.append(temp_list)\n", + "print(term_freq)" ] }, { @@ -382,7 +330,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "[]\n" + "[[1, 1, 1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 1, 1, 0, 0, 0, 0], [1, 0, 0, 1, 0, 1, 1, 1, 1]]\n" ] } ], @@ -425,13 +373,70 @@ "cell_type": "code", "execution_count": 11, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['all', 'six', 'less', 'being', 'indeed', 'over', 'move', 'anyway', 'fifty', 'four', 'not', 'own', 'through', 'yourselves', 'go', 'where', 'mill', 'only', 'find', 'before', 'one', 'whose', 'system', 'how', 'somewhere', 'with', 'thick', 'show', 'had', 'enough', 'should', 'to', 'must', 'whom', 'seeming', 'under', 'ours', 'has', 'might', 'thereafter', 'latterly', 'do', 'them', 'his', 'around', 'than', 'get', 'very', 'de', 'none', 'cannot', 'every', 'whether', 'they', 'front', 'during', 'thus', 'now', 'him', 'nor', 'name', 'several', 'hereafter', 'always', 'who', 'cry', 'whither', 'this', 'someone', 'either', 'each', 
'become', 'thereupon', 'sometime', 'side', 'two', 'therein', 'twelve', 'because', 'often', 'ten', 'our', 'eg', 'some', 'back', 'up', 'namely', 'towards', 'are', 'further', 'beyond', 'ourselves', 'yet', 'out', 'even', 'will', 'what', 'still', 'for', 'bottom', 'mine', 'since', 'please', 'forty', 'per', 'its', 'everything', 'behind', 'un', 'above', 'between', 'it', 'neither', 'seemed', 'ever', 'across', 'she', 'somehow', 'be', 'we', 'full', 'never', 'sixty', 'however', 'here', 'otherwise', 'were', 'whereupon', 'nowhere', 'although', 'found', 'alone', 're', 'along', 'fifteen', 'by', 'both', 'about', 'last', 'would', 'anything', 'via', 'many', 'could', 'thence', 'put', 'against', 'keep', 'etc', 'amount', 'became', 'ltd', 'hence', 'onto', 'or', 'con', 'among', 'already', 'co', 'afterwards', 'formerly', 'within', 'seems', 'into', 'others', 'while', 'whatever', 'except', 'down', 'hers', 'everyone', 'done', 'least', 'another', 'whoever', 'moreover', 'couldnt', 'throughout', 'anyhow', 'yourself', 'three', 'from', 'her', 'few', 'together', 'top', 'there', 'due', 'been', 'next', 'anyone', 'eleven', 'much', 'call', 'therefore', 'interest', 'then', 'thru', 'themselves', 'hundred', 'was', 'sincere', 'empty', 'more', 'himself', 'elsewhere', 'mostly', 'on', 'fire', 'am', 'becoming', 'hereby', 'amongst', 'else', 'part', 'everywhere', 'too', 'herself', 'former', 'those', 'he', 'me', 'myself', 'made', 'twenty', 'these', 'bill', 'cant', 'us', 'until', 'besides', 'nevertheless', 'below', 'anywhere', 'nine', 'can', 'of', 'your', 'toward', 'my', 'something', 'and', 'whereafter', 'whenever', 'give', 'almost', 'wherever', 'is', 'describe', 'beforehand', 'herein', 'an', 'as', 'itself', 'at', 'have', 'in', 'seem', 'whence', 'ie', 'any', 'fill', 'again', 'hasnt', 'inc', 'thereby', 'thin', 'no', 'perhaps', 'latter', 'meanwhile', 'when', 'detail', 'same', 'wherein', 'beside', 'also', 'that', 'other', 'take', 'which', 'becomes', 'you', 'if', 'nobody', 'see', 'though', 'may', 'after', 'upon', 
'most', 'hereupon', 'eight', 'but', 'serious', 'nothing', 'such', 'why', 'a', 'off', 'whereby', 'third', 'i', 'whole', 'noone', 'sometimes', 'well', 'amoungst', 'yours', 'their', 'rather', 'without', 'so', 'five', 'the', 'first', 'whereas', 'once']\n" + ] + } + ], "source": [ "stop_words = ['all', 'six', 'less', 'being', 'indeed', 'over', 'move', 'anyway', 'fifty', 'four', 'not', 'own', 'through', 'yourselves', 'go', 'where', 'mill', 'only', 'find', 'before', 'one', 'whose', 'system', 'how', 'somewhere', 'with', 'thick', 'show', 'had', 'enough', 'should', 'to', 'must', 'whom', 'seeming', 'under', 'ours', 'has', 'might', 'thereafter', 'latterly', 'do', 'them', 'his', 'around', 'than', 'get', 'very', 'de', 'none', 'cannot', 'every', 'whether', 'they', 'front', 'during', 'thus', 'now', 'him', 'nor', 'name', 'several', 'hereafter', 'always', 'who', 'cry', 'whither', 'this', 'someone', 'either', 'each', 'become', 'thereupon', 'sometime', 'side', 'two', 'therein', 'twelve', 'because', 'often', 'ten', 'our', 'eg', 'some', 'back', 'up', 'namely', 'towards', 'are', 'further', 'beyond', 'ourselves', 'yet', 'out', 'even', 'will', 'what', 'still', 'for', 'bottom', 'mine', 'since', 'please', 'forty', 'per', 'its', 'everything', 'behind', 'un', 'above', 'between', 'it', 'neither', 'seemed', 'ever', 'across', 'she', 'somehow', 'be', 'we', 'full', 'never', 'sixty', 'however', 'here', 'otherwise', 'were', 'whereupon', 'nowhere', 'although', 'found', 'alone', 're', 'along', 'fifteen', 'by', 'both', 'about', 'last', 'would', 'anything', 'via', 'many', 'could', 'thence', 'put', 'against', 'keep', 'etc', 'amount', 'became', 'ltd', 'hence', 'onto', 'or', 'con', 'among', 'already', 'co', 'afterwards', 'formerly', 'within', 'seems', 'into', 'others', 'while', 'whatever', 'except', 'down', 'hers', 'everyone', 'done', 'least', 'another', 'whoever', 'moreover', 'couldnt', 'throughout', 'anyhow', 'yourself', 'three', 'from', 'her', 'few', 'together', 'top', 'there', 'due', 'been', 'next', 
'anyone', 'eleven', 'much', 'call', 'therefore', 'interest', 'then', 'thru', 'themselves', 'hundred', 'was', 'sincere', 'empty', 'more', 'himself', 'elsewhere', 'mostly', 'on', 'fire', 'am', 'becoming', 'hereby', 'amongst', 'else', 'part', 'everywhere', 'too', 'herself', 'former', 'those', 'he', 'me', 'myself', 'made', 'twenty', 'these', 'bill', 'cant', 'us', 'until', 'besides', 'nevertheless', 'below', 'anywhere', 'nine', 'can', 'of', 'your', 'toward', 'my', 'something', 'and', 'whereafter', 'whenever', 'give', 'almost', 'wherever', 'is', 'describe', 'beforehand', 'herein', 'an', 'as', 'itself', 'at', 'have', 'in', 'seem', 'whence', 'ie', 'any', 'fill', 'again', 'hasnt', 'inc', 'thereby', 'thin', 'no', 'perhaps', 'latter', 'meanwhile', 'when', 'detail', 'same', 'wherein', 'beside', 'also', 'that', 'other', 'take', 'which', 'becomes', 'you', 'if', 'nobody', 'see', 'though', 'may', 'after', 'upon', 'most', 'hereupon', 'eight', 'but', 'serious', 'nothing', 'such', 'why', 'a', 'off', 'whereby', 'third', 'i', 'whole', 'noone', 'sometimes', 'well', 'amoungst', 'yours', 'their', 'rather', 'without', 'so', 'five', 'the', 'first', 'whereas', 'once']\n", - "\n", + "print(stop_words)\n", "# Write your code below\n" ] }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['ironhack', 'cool', 'love', 'student']" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bag_of_words = [word for word in bag_of_words if word not in stop_words]\n", + "bag_of_words" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1, 1, 0, 0]\n", + "[1, 0, 1, 0]\n", + "[1, 0, 0, 1]\n", + "[[1, 1, 0, 0], [1, 0, 1, 0], [1, 0, 0, 1]]\n" + ] + } + ], + "source": [ + "term_freq = []\n", + "for i in range(len(corpus)):\n", + " temp_list =[]\n", + " for word in 
bag_of_words:\n", + " conteo = corpus[i].split().count(word)\n", + " temp_list.append(conteo)\n", + " print(temp_list)\n", + " term_freq.append(temp_list)\n", + "print(term_freq)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -460,6 +465,83 @@ " [1 1 0 1 0 0 1]]\n", " ```" ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['ironhack is cool', 'i love ironhack', 'i am a student at ironhack']\n" + ] + } + ], + "source": [ + "print(corpus)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.feature_extraction.text import CountVectorizer" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[0 0 1 1 1 0 0]\n", + " [0 0 0 1 0 1 0]\n", + " [1 1 0 1 0 0 1]]\n", + "{'ironhack': 3, 'is': 4, 'cool': 2, 'love': 5, 'am': 0, 'student': 6, 'at': 1}\n" + ] + } + ], + "source": [ + "vector = CountVectorizer()\n", + "print(vector.fit_transform(corpus).todense())\n", + "print(vector.vocabulary_)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[1 1 0 0]\n", + " [0 1 1 0]\n", + " [0 1 0 1]]\n", + "{'ironhack': 1, 'cool': 0, 'love': 2, 'student': 3}\n" + ] + } + ], + "source": [ + "vector = CountVectorizer(stop_words = \"english\")\n", + "print(vector.fit_transform(corpus).todense())\n", + "print(vector.vocabulary_)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": {