From 790a87d7b74925e0a85b9d8d061612f202a29072 Mon Sep 17 00:00:00 2001 From: Tatiana <74679787+ailinnesse@users.noreply.github.com> Date: Sat, 16 May 2026 17:01:35 +0100 Subject: [PATCH 1/4] Tatiana's lab 1 challenge --- .../tatiana_patrusheva/lab1.ipynb | 692 ++++++++++++++++++ 1 file changed, 692 insertions(+) create mode 100644 part1-fundementals/community-contributions/tatiana_patrusheva/lab1.ipynb diff --git a/part1-fundementals/community-contributions/tatiana_patrusheva/lab1.ipynb b/part1-fundementals/community-contributions/tatiana_patrusheva/lab1.ipynb new file mode 100644 index 0000000..50911da --- /dev/null +++ b/part1-fundementals/community-contributions/tatiana_patrusheva/lab1.ipynb @@ -0,0 +1,692 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "96520fca", + "metadata": {}, + "source": [ + "# Basic Agentic Workflow\n", + "\n", + "Helloooo everyone and welcome to an exciting lesson on Agentic AI! 🎉\n", + "\n", + "Today, we're diving into **prompt chaining** agentic workflow pattern. What's that? It's just passing the output from one LLM to the next, step by step. Think of it like a relay race, but with prompts instead of batons.\n", + "\n", + "We'll keep things super simple: manually run each cell, watch the magic happen, and see how chaining LLM calls lets us build more complex workflows.\n", + "\n", + "Ready to see how agents can work together? Let's get started!" + ] + }, + { + "cell_type": "markdown", + "id": "655eac71", + "metadata": {}, + "source": [ + "## As always, libraries first!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "8f696afb", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from openai import OpenAI\n", + "from dotenv import load_dotenv\n", + "from IPython.display import display, Markdown\n", + "\n", + "\n", + "load_dotenv()\n", + "\n", + "OPENAI_API_KEY = os.getenv(\"OPENAI_API_KEY\")\n", + "GEMINI_API_KEY = os.getenv(\"GEMINI_API_KEY\")\n", + "ANTHROPIC_API_KEY = os.getenv(\"ANTHROPIC_API_KEY\")\n", + "\n", + "# check if API keys are set\n", + "if not OPENAI_API_KEY:\n", + " raise ValueError(\"Missing OpenAI API key\")\n", + "if not GEMINI_API_KEY:\n", + " raise ValueError(\"Missing Gemini API key\")\n", + "if not ANTHROPIC_API_KEY:\n", + " raise ValueError(\"Missing Anthropic API key\")" + ] + }, + { + "cell_type": "markdown", + "id": "5bab264f", + "metadata": {}, + "source": [ + "You can set your API Keys for each of the LLM providers using the following links:\n", + "\n", + "- [OpenAI](https://platform.openai.com/api-keys)\n", + "- [Anthropic](https://console.anthropic.com/settings/keys)\n", + "- [Gemini](https://aistudio.google.com/app/apikey)\n", + "\n", + "Once you have created the API Keys, you can store them on your `.env` file at the root of this repo" + ] + }, + { + "cell_type": "markdown", + "id": "91357479", + "metadata": {}, + "source": [ + "
\n", + " Info:\n", + " \n", + "
💡
\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "df6ed0b8", + "metadata": {}, + "source": [ + "## The Workflow" + ] + }, + { + "cell_type": "markdown", + "id": "01562382", + "metadata": {}, + "source": [ + "```mermaid\n", + "graph LR\n", + " A[Generate Tickets] --> B[Classify Priority] --> C[Respond to Tickets]\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "4a8641ab", + "metadata": {}, + "source": [ + "## Lets start with using OpenAI" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "24530e43", + "metadata": {}, + "outputs": [], + "source": [ + "# client\n", + "openai_client = OpenAI()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "11ab44e3", + "metadata": {}, + "outputs": [], + "source": [ + "# messages list\n", + "message = \"I want you to generate customer support ticket for a 3rd party re-seller. \"\n", + "message += \"The ticket should be a single sentence describing a common issue a customer might face with their product or service. \"\n", + "message += \"Please ensure the ticket is varied and covers different types of problems. \"\n", + "message += \"Do not give any subjects, only the body of the ticket.\"\n", + "\n", + "messages = [{\"role\": \"user\", \"content\": message}]" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "2aa74d83", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "### Generated Ticket:\n", + "I am unable to log into my account despite entering the correct credentials multiple times." 
+ ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# response\n", + "\n", + "openai_response = openai_client.chat.completions.create(\n", + " model=\"gpt-4o-mini\",\n", + " messages=messages\n", + ")\n", + "\n", + "ticket = openai_response.choices[0].message.content\n", + "# print(f\"### Generated Ticket:\\n{ticket}\")\n", + "display(Markdown(f\"### Generated Ticket:\\n{ticket}\"))" + ] + }, + { + "cell_type": "markdown", + "id": "67ec94e3", + "metadata": {}, + "source": [ + "I love markdown. It is a lightweight method of rendering and formatting text that is super versatile without having to use heavy softwares like MS Word or Google Docs.\n", + "\n", + "You can learn more about markdown sytanx [here](https://www.markdownguide.org/basic-syntax/)\n", + "\n", + "A really informative YouTube video talking about the [Unreasonable Effectiveness of Plain Text](https://www.youtube.com/watch?v=WgV6M1LyfNY)" + ] + }, + { + "cell_type": "markdown", + "id": "38015b29", + "metadata": {}, + "source": [ + "## Lets pass these on to an Anthropic model and ask it to classify the priority level of each ticket" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "42d3443c", + "metadata": {}, + "outputs": [], + "source": [ + "# anthropic client, pass in base_url\n", + "anthropic_client = OpenAI(api_key=ANTHROPIC_API_KEY, base_url=\"https://api.anthropic.com/v1\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "4affb2ef", + "metadata": {}, + "outputs": [], + "source": [ + "# messages list\n", + "message = \"I want you to classify the priority of the following customer support ticket. \"\n", + "message += \"The ticket is as follows: \"+ ticket + \" \"\n", + "message += \"Please classify the priority as either 'Low', 'Medium', or 'High'. 
\"\n", + "message += \"Respond with only the priority level.\"\n", + "\n", + "messages = [{\"role\": \"user\", \"content\": message}]" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "0e7c7c08", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "### Generated priority:\n", + "High" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# response\n", + "\n", + "openai_response = openai_client.chat.completions.create(\n", + " model=\"gpt-4o-mini\",\n", + " messages=messages\n", + ")\n", + "\n", + "priority = openai_response.choices[0].message.content\n", + "# print(f\"### Generated Ticket:\\n{ticket}\")\n", + "display(Markdown(f\"### Generated priority:\\n{priority}\"))" + ] + }, + { + "cell_type": "markdown", + "id": "ed235e15", + "metadata": {}, + "source": [ + "## Now Gemini should determine the appropriate response" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "5c2304b7", + "metadata": {}, + "outputs": [], + "source": [ + "# gemini client\n", + "gemini_client = OpenAI(api_key=GEMINI_API_KEY, base_url=\"https://generativelanguage.googleapis.com/v1beta/openai/\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "9b123501", + "metadata": {}, + "outputs": [], + "source": [ + "# messages list\n", + "\n", + "message = \"You are to determine an appropriate response to the following customer support ticket. \"\n", + "message += \"The ticket is as follows: \"+ ticket + \" \"\n", + "message += \"The priority level of this ticket is: \" + priority + \" \"\n", + "message += \"Please provide a response that addresses the customer's issue in a short and concise manner. 
\"\n", + " \n", + "messages = [{\"role\": \"user\", \"content\": message}]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "07a1928f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "### Generated Response:\n", + "Hello,\n", + "\n", + "We understand you're unable to log into your account despite entering the correct credentials, and we're treating this with high priority.\n", + "\n", + "To help us investigate and resolve this immediately, please reply with your registered email address or username.\n", + "\n", + "We will look into this right away and get back to you within 30 minutes with an update or resolution." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# response\n", + "\n", + "gemini_response = gemini_client.chat.completions.create(\n", + " model=\"gemini-2.5-flash\",\n", + " messages=messages\n", + ")\n", + "\n", + "response = gemini_response.choices[0].message.content\n", + "display(Markdown(f\"### Generated Response:\\n{response}\"))" + ] + }, + { + "cell_type": "markdown", + "id": "be180d46", + "metadata": {}, + "source": [ + "
\n", + " Your Challenge:\n", + "
    \n", + "
  • Recreate the customer support ticket workflow using an evaluator-optimizer agentic workflow pattern instead of prompt chaining.
  • \n", + "
  • Your evaluator agent should assess the quality and completeness of each ticket and suggest improvements.
  • \n", + "
  • Your optimizer agent should revise the tickets based on evaluator feedback, aiming for clarity and actionable details.
  • \n", + "
  • Try to implement this using at least two LLM calls (one for evaluation, one for optimization) and display the before/after results.
  • \n", + "
  • Share your work in the community-contributions folder by creating a folder with your name. Eg. shaheer-airaj.
  • \n", + " \n", + "
    💪
    \n", + "
    " + ] + }, + { + "cell_type": "markdown", + "id": "b0c5f7d2", + "metadata": {}, + "source": [ + "# Now we need to evaluate each step and get LLMs to improve the ticket and responce" + ] + }, + { + "cell_type": "markdown", + "id": "e4053954", + "metadata": {}, + "source": [ + "## We will use gemini to evaluate OpenAI work and OpenAI to evaluate gemini work" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "8520fbf1", + "metadata": {}, + "outputs": [], + "source": [ + "# Evaluate ticket created by OpenAI using Gemini \n", + "evaluation_message = f\"\"\"\n", + "You are evaluating the quality of a customer support ticket generated by another model.\n", + "\n", + "Original task:\n", + "Generate a customer support ticket for a 3rd-party reseller.\n", + "\n", + "Requirements:\n", + "I want you to generate customer support ticket for a 3rd party re-seller. \n", + "The ticket should be a single sentence describing a common issue a customer might face with their product or service. \n", + "Please ensure the ticket is varied and covers different types of problems. \n", + "Do not give any subjects, only the body of the ticket.\n", + "\n", + "Generated ticket:\n", + "{ticket}\n", + "\n", + "Evaluate the ticket for:\n", + "1. Clarity\n", + "2. Completeness\n", + "3. Accuracy against the task requirements\n", + "\n", + "Be concise and direct.\n", + "If improvements are needed, list them.\n", + "If the ticket is good, say so briefly.\n", + "\"\"\"\n", + "\n", + "\n", + "evaluation_messages = [{\"role\": \"user\", \"content\": evaluation_message}]" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "05ac8e55", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "### Generated Response:\n", + "**1. Clarity:** Good. The issue is clearly stated, along with relevant context (correct credentials, multiple attempts).\n", + "**2. Completeness:** Good. 
For a single sentence, it provides sufficient detail to understand the core problem.\n", + "**3. Accuracy against the task requirements:** Good.\n", + " * It is a single sentence.\n", + " * It describes a common issue.\n", + " * It does not include a subject.\n", + " * The issue described is generic enough to apply to a product or service offered by a 3rd-party reseller.\n", + "\n", + "The ticket is good." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# response\n", + "\n", + "gemini_response = gemini_client.chat.completions.create(\n", + " model=\"gemini-2.5-flash\",\n", + " messages=evaluation_messages\n", + ")\n", + "\n", + "response = gemini_response.choices[0].message.content\n", + "display(Markdown(f\"### Generated Response:\\n{response}\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "c7ea30f0", + "metadata": {}, + "outputs": [], + "source": [ + "# Optimize ticket created by OpenAI using Gemini \n", + "optimization_message = f\"\"\"\n", + "You are the optimizer in an evaluator-optimizer workflow.\n", + "\n", + "Original task:\n", + "Generate a customer support ticket for a 3rd-party reseller.\n", + "\n", + "Requirements:\n", + "I want you to generate customer support ticket for a 3rd party re-seller. \n", + "The ticket should be a single sentence describing a common issue a customer might face with their product or service. \n", + "Please ensure the ticket is varied and covers different types of problems. 
\n", + "Do not give any subjects, only the body of the ticket.\n", + "\n", + "Generated ticket:\n", + "{ticket}\n", + "You need to base your optimization on the evaluation of the ticket comleted by another model.\n", + "Evaluation response:\n", + "{response}\n", + "\n", + "Your job:\n", + "Assess whether the generated ticket needs improvement based on the evaluator feedback.\n", + "\n", + "If improvement is needed, rewrite the ticket so it better satisfies the original task.\n", + "If no improvement is needed, return the original ticket unchanged.\n", + "\n", + "Return only the final ticket text.\n", + "Do not explain your reasoning.\n", + "Do not include labels such as \"Optimized ticket:\".\n", + "\"\"\"\n", + "\n", + "optimization_messages = [{\"role\": \"user\", \"content\": optimization_message}]" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "b81208d8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "### Generated Response:\n", + "I am unable to log into my account despite entering the correct credentials multiple times." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# response\n", + "\n", + "gemini_response = gemini_client.chat.completions.create(\n", + " model=\"gemini-2.5-flash\",\n", + " messages=optimization_messages\n", + ")\n", + "\n", + "updated_ticket = gemini_response.choices[0].message.content\n", + "display(Markdown(f\"### Generated Response:\\n{updated_ticket}\"))" + ] + }, + { + "cell_type": "markdown", + "id": "15f50f1e", + "metadata": {}, + "source": [ + "## In my example the ticket did not change. 
Lets see if the responce to the ticket will change\n", + "here we will evaluate gemini work using OpenAI " + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "4f87ea83", + "metadata": {}, + "outputs": [], + "source": [ + "# Evaluate ticket created by OpenAI using Gemini \n", + "response_evaluation_message = f\"\"\"\n", + "You are evaluating the quality of a customer support ticket responce generated by another model.\n", + "\n", + "Original task:\n", + "You are to determine an appropriate response to the following customer support ticket.\n", + "The ticket is as follows: {ticket}\n", + "The priority level of this ticket is: {priority}\n", + "Please provide a response that addresses the customer's issue in a short and concise manner.\n", + "\n", + "Generated response:\n", + "{response}\n", + "\n", + "\n", + "Evaluate the ticket response for:\n", + "1. Clarity\n", + "2. Completeness\n", + "3. Accuracy against the task requirements\n", + "\n", + "Be concise and direct.\n", + "If improvements are needed, list them.\n", + "If the ticket response is good, say so briefly.\n", + "\"\"\"\n", + "\n", + "response_evaluation_messages = [{\"role\": \"user\", \"content\": response_evaluation_message}]" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "25f83000", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "### Generated priority:\n", + "The ticket response is good. \n", + "\n", + "1. **Clarity:** Good. The issue is clearly articulated.\n", + "2. **Completeness:** Good. Sufficient detail is provided within a concise format.\n", + "3. **Accuracy against the task requirements:** Good. It aligns well with the task requirements. \n", + "\n", + "No improvements are needed." 
+ ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# response\n", + "\n", + "openai_response = openai_client.chat.completions.create(\n", + " model=\"gpt-4o-mini\",\n", + " messages=response_evaluation_messages\n", + ")\n", + "\n", + "response_evaluation = openai_response.choices[0].message.content\n", + "# print(f\"### Generated Ticket:\\n{ticket}\")\n", + "display(Markdown(f\"### Generated priority:\\n{response_evaluation}\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "a35ee634", + "metadata": {}, + "outputs": [], + "source": [ + "# Optimize ticket response created by Gemini using OpenAI \n", + "response_optimization_message = f\"\"\"\n", + "You are the optimizer in an evaluator-optimizer workflow.\n", + "\n", + "Original task:\n", + "You are to determine an appropriate response to the following customer support ticket.\n", + "The ticket is as follows: {ticket}\n", + "The priority level of this ticket is: {priority}\n", + "Please provide a response that addresses the customer's issue in a short and concise manner.\n", + "\n", + "Generated response:\n", + "{response}\n", + "You need to base your optimization on the evaluation of the ticket comleted by another model.\n", + "Evaluation response:\n", + "{response_evaluation}\n", + "\n", + "Your job:\n", + "Assess whether the generated ticket response needs improvement based on the evaluator feedback.\n", + "\n", + "If improvement is needed, rewrite the ticket response so it better satisfies the original task.\n", + "If no improvement is needed, return the original ticket response unchanged.\n", + "\n", + "Return only the final ticket response.\n", + "Do not explain your reasoning.\n", + "Do not include labels such as \"Optimized ticket:\".\n", + "\"\"\"\n", + "\n", + "response_optimization_messages = [{\"role\": \"user\", \"content\": response_optimization_message}]" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + 
"id": "a9ac40bd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "### Optimized Response:\n", + "Thank you for reaching out. We're sorry to hear you're having trouble logging into your account. Please try resetting your password using the \"Forgot Password\" link on the login page. If the issue persists, let us know, and we'll assist you further." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# response\n", + "\n", + "openai_response = openai_client.chat.completions.create(\n", + " model=\"gpt-4o-mini\",\n", + " messages=response_optimization_messages\n", + ")\n", + "\n", + "response_optimization = openai_response.choices[0].message.content\n", + "# print(f\"### Generated Ticket:\\n{ticket}\")\n", + "display(Markdown(f\"### Optimized Response:\\n{response_optimization}\"))" + ] + }, + { + "cell_type": "markdown", + "id": "79c076f4", + "metadata": {}, + "source": [ + "This model decided to change the responce, even so the evaluator said that the original is good.\n", + "I personaly prefer original to the new one" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From d0465eaa69b0174e7e0f899038702c2d86cc3b5d Mon Sep 17 00:00:00 2001 From: Tatiana <74679787+ailinnesse@users.noreply.github.com> Date: Sat, 16 May 2026 21:16:07 +0100 Subject: [PATCH 2/4] Tatiana's Lab 2 Challenge --- .../tatiana_patrusheva/lab2.ipynb | 358 ++++++++++++++++++ 1 file changed, 358 insertions(+) create mode 100644 part1-fundementals/community-contributions/tatiana_patrusheva/lab2.ipynb diff --git 
a/part1-fundementals/community-contributions/tatiana_patrusheva/lab2.ipynb b/part1-fundementals/community-contributions/tatiana_patrusheva/lab2.ipynb new file mode 100644 index 0000000..d3db0a2 --- /dev/null +++ b/part1-fundementals/community-contributions/tatiana_patrusheva/lab2.ipynb @@ -0,0 +1,358 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "76aa0469", + "metadata": {}, + "source": [ + "# Structured Outputs\n", + "\n", + "LLMs regurgitate out text and that is great for so many applications. But in order to build strong, robust systems and applications, we need to make sense of the chaos sometimes by receiving a pre-determined structured output everytime an LLM is called." + ] + }, + { + "cell_type": "markdown", + "id": "4a54393e", + "metadata": {}, + "source": [ + "## As always, libraries first!" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "07683da2", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from openai import OpenAI\n", + "from dotenv import load_dotenv\n", + "from IPython.display import display, Markdown\n", + "\n", + "\n", + "load_dotenv()\n", + "\n", + "OPENAI_API_KEY = os.getenv(\"OPENAI_API_KEY\")\n", + "GEMINI_API_KEY = os.getenv(\"GEMINI_API_KEY\")\n", + "ANTHROPIC_API_KEY = os.getenv(\"ANTHROPIC_API_KEY\")\n", + "\n", + "# check if API keys are set\n", + "if not OPENAI_API_KEY:\n", + " raise ValueError(\"Missing OpenAI API key\")\n", + "if not GEMINI_API_KEY:\n", + " raise ValueError(\"Missing Gemini API key\")\n", + "if not ANTHROPIC_API_KEY:\n", + " raise ValueError(\"Missing Anthropic API key\")" + ] + }, + { + "cell_type": "markdown", + "id": "e567598a", + "metadata": {}, + "source": [ + "## The Workflow" + ] + }, + { + "cell_type": "markdown", + "id": "043a7425", + "metadata": {}, + "source": [ + "```mermaid\n", + "graph LR\n", + " A[Generate Ticket] --> B[Respond to Ticket]\n", + " B --> C[Evaluate Response]\n", + " C --> B\n", + " C --> D[Final Output]\n", + "```" + ] + }, + { + 
"cell_type": "markdown", + "id": "92b1446b", + "metadata": {}, + "source": [ + "## Creating Classes for LLMs responses" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "d92a7c30", + "metadata": {}, + "outputs": [], + "source": [ + "# classes\n", + "from pydantic import BaseModel\n", + "\n", + "class CustomerReview(BaseModel):\n", + " review: str\n", + " rating: int\n", + "\n", + "class ReviewResponse(BaseModel):\n", + " response: str\n", + " \n", + "\n", + "class ResponseEvaluation(BaseModel):\n", + " passed: bool\n", + " feedback: str" + ] + }, + { + "cell_type": "markdown", + "id": "eec10603", + "metadata": {}, + "source": [ + "## Calling Gemini to generate review tickets" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "9639d8bf", + "metadata": {}, + "outputs": [], + "source": [ + "# client\n", + "client = OpenAI()" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "509f3c0f", + "metadata": {}, + "outputs": [], + "source": [ + "# messages list\n", + "user_message = \"I want you to generate a review for the home decore website. \"\n", + "user_message += \"The review should be a single sentence describing any product you can find in home decor. \"\n", + "user_message += \"Please ensure review has a rating - an int from 1 to 5 and it matches the tone of the review (positive review - 4 or 5, negative review - 1 or 2, neutral review - 3).\"\n", + "\n", + "messages = [{\"role\": \"user\", \"content\": user_message}]" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "fccede81", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "### Structured Review:\n", + "review='The artisan-crafted ceramic vase adds a charming touch to my living room decor, and I absolutely love its vibrant colors.' 
rating=5" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "\n", + "structured_review = client.chat.completions.parse(\n", + " model=\"gpt-4.1-nano\",\n", + " messages=messages,\n", + " response_format=CustomerReview\n", + ")\n", + "\n", + "structured_review = structured_review.choices[0].message.parsed\n", + "display(Markdown(f\"### Structured Review:\\n{structured_review}\"))" + ] + }, + { + "cell_type": "markdown", + "id": "9e222b10", + "metadata": {}, + "source": [ + "## Responding to the review" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "10d734a5", + "metadata": {}, + "outputs": [], + "source": [ + "# messages list\n", + "message = \"You are to answer the following customer review. Make sure to check the rating and answer accordingly (positive review - 4 or 5, negative review - 1 or 2, neutral review - 3).\\n\\n\"\n", + "message += f\"Review: {structured_review.review}\\n\"\n", + "message += f\"Rating: {structured_review.rating}\\n\\n\"\n", + "\n", + "messages = [{\"role\": \"user\", \"content\": message}]" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "bc87ad25", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "### Response:\n", + "Thank you for your wonderful feedback! We're delighted to hear that you love the artisan-crafted ceramic vase and that it adds a charming touch to your living room. Your satisfaction means a lot to us. If you ever need more decor ideas or assistance, feel free to reach out!" 
+ ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# structured response\n", + "review_response = client.chat.completions.parse(\n", + " model=\"gpt-4.1-nano\",\n", + " messages=messages,\n", + " response_format=ReviewResponse\n", + ")\n", + "\n", + "review_response = review_response.choices[0].message.parsed\n", + "display(Markdown(f\"### Response:\\n{review_response.response}\"))\n" + ] + }, + { + "cell_type": "markdown", + "id": "26c5907b", + "metadata": {}, + "source": [ + "## Lets evaluate our response" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "2db24768", + "metadata": {}, + "outputs": [], + "source": [ + "# messages list\n", + "message = \"You are to evaluate the response for the following customer review. \"\n", + "message += \"You will determine if the proposed response is appropriate for the review and rating. \"\n", + "message += f\"Review: {structured_review.review}\\n\"\n", + "message += f\"Rating: {structured_review.rating}\\n\\n\"\n", + "message += f\"Proposed Response: {review_response.response}\\n\"\n", + "\n", + "messages = [{\"role\": \"user\", \"content\": message}]" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "02b53969", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "### Passed:\n", + "True" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### Feedback:\n", + "The response appropriately acknowledges the customer's positive review, expresses gratitude, and reinforces their satisfaction. It also offers further assistance, which is good customer service. Overall, the response is suitable for a 5-star review." 
+ ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# evaluate response\n", + "evaluator_response = client.chat.completions.parse(\n", + " model=\"gpt-4.1-nano\",\n", + " messages=messages,\n", + " response_format=ResponseEvaluation\n", + ")\n", + "\n", + "evaluator_response = evaluator_response.choices[0].message.parsed\n", + "display(Markdown(f\"### Passed:\\n{evaluator_response.passed}\"))\n", + "display(Markdown(f\"### Feedback:\\n{evaluator_response.feedback}\"))" + ] + }, + { + "cell_type": "markdown", + "id": "45b589c6", + "metadata": {}, + "source": [ + "
    \n", + " Your Challenge:\n", + "
      \n", + "
    • Hey everyone! Ready to flex those agentic muscles? 🎉 Build a workflow just like the ticket system above, but for product reviews!
    • \n", + "
    • Your workflow should:\n", + "
        \n", + "
      • Generate a product review (think: electronics, books, or your favorite kitchen gadget)
      • \n", + "
      • Respond to the review (company reply, moderation, or a witty bot response)
      • \n", + "
      • Evaluate the response (is it helpful, polite, and on point?)
      • \n", + "
      \n", + "
    • \n", + "
    • Use structured outputs and Pydantic models for each step, just like we did above.
    • \n", + "
    • Include an evaluator step to assess the quality of the response.
    • \n", + "
    • Here’s a suggested workflow to get your creative gears turning:
    • \n", + "
    \n", + "
    💪
    \n", + "
    " + ] + }, + { + "cell_type": "markdown", + "id": "b609d884", + "metadata": {}, + "source": [ + "### Suggested Workflow\n", + "\n", + "```mermaid\n", + "graph LR\n", + " A[Generate Review] --> B[Respond to Review]\n", + " B --> C[Evaluate Response]\n", + " C --> B\n", + " C --> D[Final Output]\n", + "```\n", + "\n", + "Try to use structured outputs and Pydantic models for each step, just like in the notebook above. Include an evaluator step to assess the quality of the response." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From fc0c3d28b3cdb86cf1eeb2207ba998958e67069f Mon Sep 17 00:00:00 2001 From: Tatiana <74679787+ailinnesse@users.noreply.github.com> Date: Sun, 17 May 2026 10:01:45 +0100 Subject: [PATCH 3/4] Lab 3 Challenge: Subreddit Summarization --- .../tatiana_patrusheva/lab3.ipynb | 778 ++++++++++++++++++ 1 file changed, 778 insertions(+) create mode 100644 part1-fundementals/community-contributions/tatiana_patrusheva/lab3.ipynb diff --git a/part1-fundementals/community-contributions/tatiana_patrusheva/lab3.ipynb b/part1-fundementals/community-contributions/tatiana_patrusheva/lab3.ipynb new file mode 100644 index 0000000..5163d51 --- /dev/null +++ b/part1-fundementals/community-contributions/tatiana_patrusheva/lab3.ipynb @@ -0,0 +1,778 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1a650a62", + "metadata": {}, + "source": [ + "# Summarizing Yesterday's Reddit Subreddit Posts\n", + "\n", + "This project fetches yesterday's posts from a selected Reddit subreddit and uses OpenAI to generate a concise summary of the main topics, themes, and notable discussions." 
+ ] + }, + { + "cell_type": "markdown", + "id": "3aa669de", + "metadata": {}, + "source": [ + "## Step 1: Import libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "6bffb0b8", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from openai import OpenAI\n", + "from dotenv import load_dotenv\n", + "from IPython.display import Markdown, display\n", + "\n", + "load_dotenv()\n", + "\n", + "OPENAI_API_KEY = os.getenv(\"OPENAI_API_KEY\")\n", + "\n", + "if OPENAI_API_KEY is None:\n", + " raise Exception(\"API key is missing\")" + ] + }, + { + "cell_type": "markdown", + "id": "9d3d86cc", + "metadata": {}, + "source": [ + "## Step 2: Define a tool - Subreddit reader" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3997d169", + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "from datetime import datetime, timedelta, timezone\n", + "\n", + "# This function retrieves posts published yesterday from a specified Reddit subreddit. 
It currently checks the latest 100 posts by default, but you can change the `limit` parameter to search through more or fewer posts.\n", + "def get_yesterday_reddit_posts(subreddit_name, limit=100):\n", + " url = f\"https://www.reddit.com/r/{subreddit_name}/new.json?limit={limit}\"\n", + "\n", + " headers = {\n", + " \"User-Agent\": \"agentic-ai-lab-by-tatiana/0.1\"\n", + " }\n", + "\n", + " response = requests.get(url, headers=headers)\n", + " response.raise_for_status()\n", + "\n", + " data = response.json()\n", + "\n", + " now = datetime.now(timezone.utc)\n", + "\n", + " yesterday_start = datetime(\n", + " year=now.year,\n", + " month=now.month,\n", + " day=now.day,\n", + " tzinfo=timezone.utc\n", + " ) - timedelta(days=1)\n", + "\n", + " yesterday_end = yesterday_start + timedelta(days=1)\n", + "\n", + " posts = []\n", + "\n", + " for item in data[\"data\"][\"children\"]:\n", + " post = item[\"data\"]\n", + " post_time = datetime.fromtimestamp(post[\"created_utc\"], tz=timezone.utc)\n", + "\n", + " if yesterday_start <= post_time < yesterday_end:\n", + " posts.append({\n", + " \"title\": post.get(\"title\", \"\"),\n", + " \"text\": post.get(\"selftext\", \"\"),\n", + " \"score\": post.get(\"score\", 0),\n", + " \"comments\": post.get(\"num_comments\", 0)\n", + " })\n", + "\n", + " return posts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "831ae344", + "metadata": {}, + "outputs": [], + "source": [ + "# Get the posts from MicrosoftFabric subreddit from yesterday\n", + "posts_fabric = get_yesterday_reddit_posts(\"MicrosoftFabric\", limit=100)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dfddda44", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'title': '95 GB data warehouse - Azure SQL DB in Fabric make sense.',\n", + " 'text': \"We're migrating to Fabric from a current setup that is a SQL Server Data warehouse, largely fed by a Synapse Datalake containing Delta-Parquet, Parquet, and 
CSV files. \\n\\nFor the Datawarehouse part, given the relatively low data volume, sticking with a RDMS is appealing vs. Delta Parquet storage. \\n\\nThat said, thoughts/ feedback?\",\n", + " 'score': 8,\n", + " 'comments': 13},\n", + " {'title': 'Starting our new Fabric environment - help steer with Fabric Link',\n", + " 'text': 'Hi all,\\n\\nGreat community in here and so good to see Microsoft employees engaging so often with threads.\\n\\nAnyway, I am basically the project lead on landing our new Fabric environment - for context our main requirement is analytics from D365 F&O but we have quite a range of data sources at the moment so will be good to bring everything into the Fabric capacity. We currently use BYOD into Azure SQL Database. I’m a SQL DBA but more than happy to put on many different hats and enjoying this so far especially as it’s effectively from scratch so can implement best practices.\\n\\nI was keen on Fabric Link from the low code aspect, I’m not an admin on the power platform side and have been on screen shares with the admin there trying to sort Fabric Link to no success.\\n\\nOn looking at what was there already, I was concerned that there were already almost 1000 tables in Dataverse without us even considering adding the F&O tables we would need.\\n\\nI was attempting to use the workspace identity for the connection (have added this to system administrator on power platform) but it didn’t show up as an option, only a service principal or organizational account.\\n\\nThe main issue was that the link couldn’t be created due to an unknown error. 
Our Fabric capacity is in UKS but Dataverse is UKWS - I understand that this was previously a restriction but no longer?\\n\\nI’m hoping to start again on Monday with fresh eyes but should I seriously consider Synapse Link instead?\\n\\nThe documentation around this can be patchy and because it’s a fast moving product, what I have been reading may well be out of date!\\n\\nWould appreciate some guidance or pointers, thanks in advance',\n", + " 'score': 3,\n", + " 'comments': 4},\n", + " {'title': 'How to lock a file in #onelake',\n", + " 'text': \"How do you lock a file in OneLake? Turns out it's straightforward \\n \\n OneLake exposes the ADLS Gen2 Lease API, so you get the same acquire/renew/release semantics as any Azure Storage blob. \\n \\nThe API is genuinely pleasant to work with : 60-second leases, infinite leases, conditional headers, all the primitives you'd want. \\n \\nPlenty of other use cases too: any time you need mutual exclusion over a file in the lake without standing up an external lock service. \\n \\nI have used it to harden a ducklake catalog database hosted in the Files section: acquire an exclusive lease before opening the DB for write, renew while the job runs, release on exit. \",\n", + " 'score': 14,\n", + " 'comments': 0},\n", + " {'title': 'May update Power BI gone?',\n", + " 'text': \"Has the May update for Power BI Desktop been pulled? I can't seem to find the blog anymore and the old link doesn't work anymore.\",\n", + " 'score': 6,\n", + " 'comments': 7},\n", + " {'title': 'Is there an alpha testing environments for developers',\n", + " 'text': \"For some time now, I have been taking part in Fabric User Interviews. Just wondering if there's a website where I can register for alpha/beta testing, get access to all the newest features, and give the Microsoft team feedback. 
As my reasoning is testing and learning, I am unable to ask my tenant admin to change the settings whenever a new preview feature is introduced, and in most cases, he may deny my request. Additionally, this makes space for alpha or unreleased versions so that developers can get a hands-on experience.\\n\\n\",\n", + " 'score': 6,\n", + " 'comments': 8}]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Display the posts retrieved\n", + "posts_fabric" + ] + }, + { + "cell_type": "markdown", + "id": "cc4b3e24", + "metadata": {}, + "source": [ + "## Step 3: Compare Function Calling with a Standard Chat Model\n", + "\n", + "This step is included only for comparison. It demonstrates the difference between using a function call and asking a standard chat model directly." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "53cc1597", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "I'm sorry, but I can't access or retrieve real-time content from external websites like Reddit. 
However, if you can provide the text of the posts, I can help summarize them for you.\n" + ] + } + ], + "source": [ + "client = OpenAI()\n", + "\n", + "response = client.responses.create(\n", + " model=\"gpt-4.1-nano\",\n", + " input=[\n", + " {\"role\": \"user\", \"content\": \"Summarize the Reddit posts from MicrosoftFabric subreddit posted yesterday.\"}\n", + " ]\n", + ")\n", + "\n", + "print(response.output_text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ac7d64a0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'id': 'resp_02c80578bb486175006a097db3f8cc8196ab1334b0434a175e',\n", + " 'created_at': 1779006899.0,\n", + " 'error': None,\n", + " 'incomplete_details': None,\n", + " 'instructions': None,\n", + " 'metadata': {},\n", + " 'model': 'gpt-4.1-nano-2025-04-14',\n", + " 'object': 'response',\n", + " 'output': [ResponseOutputMessage(id='msg_02c80578bb486175006a097db5c1788196aa9f7600738e5d6a', content=[ResponseOutputText(annotations=[], text=\"I'm sorry, but I can't access or retrieve real-time content from external websites like Reddit. 
However, if you can provide the text of the posts, I can help summarize them for you.\", type='output_text', logprobs=[])], role='assistant', status='completed', type='message')],\n", + " 'parallel_tool_calls': True,\n", + " 'temperature': 1.0,\n", + " 'tool_choice': 'auto',\n", + " 'tools': [],\n", + " 'top_p': 1.0,\n", + " 'background': False,\n", + " 'conversation': None,\n", + " 'max_output_tokens': None,\n", + " 'max_tool_calls': None,\n", + " 'previous_response_id': None,\n", + " 'prompt': None,\n", + " 'prompt_cache_key': None,\n", + " 'prompt_cache_retention': 'in_memory',\n", + " 'reasoning': Reasoning(effort=None, generate_summary=None, summary=None),\n", + " 'safety_identifier': None,\n", + " 'service_tier': 'default',\n", + " 'status': 'completed',\n", + " 'text': ResponseTextConfig(format=ResponseFormatText(type='text'), verbosity='medium'),\n", + " 'top_logprobs': 0,\n", + " 'truncation': 'disabled',\n", + " 'usage': ResponseUsage(input_tokens=20, input_tokens_details=InputTokensDetails(cached_tokens=0), output_tokens=39, output_tokens_details=OutputTokensDetails(reasoning_tokens=0), total_tokens=59),\n", + " 'user': None,\n", + " '_request_id': 'req_b1e711be5cbc480599d2d468dc9ffac7'}" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 'output': [ResponseOutputMessage\n", + "response.__dict__" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "3c75fa00", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'id': 'msg_02c80578bb486175006a097db5c1788196aa9f7600738e5d6a',\n", + " 'content': [ResponseOutputText(annotations=[], text=\"I'm sorry, but I can't access or retrieve real-time content from external websites like Reddit. 
However, if you can provide the text of the posts, I can help summarize them for you.\", type='output_text', logprobs=[])],\n", + " 'role': 'assistant',\n", + " 'status': 'completed',\n", + " 'type': 'message'}" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "response.output[0].__dict__" + ] + }, + { + "cell_type": "markdown", + "id": "c45ab349", + "metadata": {}, + "source": [ + "## Step 4: Define the Input Schema for the Reddit Tool\n", + "\n", + "This step describes the `get_yesterday_reddit_posts` function as a tool, including the input parameters OpenAI can use when deciding whether to call it." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "e9ead465", + "metadata": {}, + "outputs": [], + "source": [ + "tools = [{\n", + " 'type': 'function',\n", + " 'name': 'get_yesterday_reddit_posts',\n", + " 'description': 'Fetch yesterday\\'s Reddit posts from a specific subreddit',\n", + " 'parameters': {\n", + " 'type': 'object',\n", + " 'properties': {\n", + " 'subreddit_name': {'type': 'string'},\n", + " 'limit': {'type': 'integer', 'default': 100}\n", + " },\n", + " 'required': ['subreddit_name', 'limit'],\n", + " 'additionalProperties': False\n", + " },\n", + " 'strict': True\n", + "}]" + ] + }, + { + "cell_type": "markdown", + "id": "98a9f2e8", + "metadata": {}, + "source": [ + "## Step 5: Pass the tool schema over to the model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ce5e10f3", + "metadata": {}, + "outputs": [], + "source": [ + "# this is a message we want LLM to process to get the arguments for the tool call\n", + "input_messages = [{\"role\": \"user\", \"content\": \"Summarize the Reddit posts from MicrosoftFabric subreddit posted yesterday.\"}]" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "3ecd57bb", + "metadata": {}, + "outputs": [], + "source": [ + "response = client.responses.create(\n", + " 
model=\"gpt-4.1-nano\",\n", + " input=input_messages,\n", + " tools=tools\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "651e424c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "# LLM does not provide the text output at this point, it only provides the arguments for the tool call\n", + "print(response.output_text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3fe27b30", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'id': 'resp_0d029abacfe00c8d006a097fc02d3c81959152bd6377e06e42',\n", + " 'created_at': 1779007424.0,\n", + " 'error': None,\n", + " 'incomplete_details': None,\n", + " 'instructions': None,\n", + " 'metadata': {},\n", + " 'model': 'gpt-4.1-nano-2025-04-14',\n", + " 'object': 'response',\n", + " 'output': [ResponseFunctionToolCall(arguments='{\"subreddit_name\":\"MicrosoftFabric\",\"limit\":100}', call_id='call_I18we0h8mrgEstyFyba3D33l', name='get_yesterday_reddit_posts', type='function_call', id='fc_0d029abacfe00c8d006a097fc1d6048195b1890d15350c947e', status='completed')],\n", + " 'parallel_tool_calls': True,\n", + " 'temperature': 1.0,\n", + " 'tool_choice': 'auto',\n", + " 'tools': [FunctionTool(name='get_yesterday_reddit_posts', parameters={'type': 'object', 'properties': {'subreddit_name': {'type': 'string'}, 'limit': {'type': 'integer', 'default': 100}}, 'required': ['subreddit_name', 'limit'], 'additionalProperties': False}, strict=True, type='function', description=\"Fetch yesterday's Reddit posts from a specific subreddit\")],\n", + " 'top_p': 1.0,\n", + " 'background': False,\n", + " 'conversation': None,\n", + " 'max_output_tokens': None,\n", + " 'max_tool_calls': None,\n", + " 'previous_response_id': None,\n", + " 'prompt': None,\n", + " 'prompt_cache_key': None,\n", + " 'prompt_cache_retention': 'in_memory',\n", + " 'reasoning': Reasoning(effort=None, 
generate_summary=None, summary=None),\n", + " 'safety_identifier': None,\n", + " 'service_tier': 'default',\n", + " 'status': 'completed',\n", + " 'text': ResponseTextConfig(format=ResponseFormatText(type='text'), verbosity='medium'),\n", + " 'top_logprobs': 0,\n", + " 'truncation': 'disabled',\n", + " 'usage': ResponseUsage(input_tokens=71, input_tokens_details=InputTokensDetails(cached_tokens=0), output_tokens=42, output_tokens_details=OutputTokensDetails(reasoning_tokens=0), total_tokens=113),\n", + " 'user': None,\n", + " '_request_id': 'req_366afcfae45b4a3e806a0c9ad457afc7'}" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#[ResponseFunctionToolCall\n", + "response.__dict__" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "49ba891f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'arguments': '{\"subreddit_name\":\"MicrosoftFabric\",\"limit\":100}',\n", + " 'call_id': 'call_I18we0h8mrgEstyFyba3D33l',\n", + " 'name': 'get_yesterday_reddit_posts',\n", + " 'type': 'function_call',\n", + " 'id': 'fc_0d029abacfe00c8d006a097fc1d6048195b1890d15350c947e',\n", + " 'status': 'completed'}" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# This is a function call to get_yesterday_reddit_posts with the parameters specified \n", + "response.output[0].__dict__" + ] + }, + { + "cell_type": "markdown", + "id": "c4fea85f", + "metadata": {}, + "source": [ + "## Step 6: Format the tool call response from the LLM" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "f4619e76", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "\n", + "tool_call = response.output[0]\n", + "args = json.loads(tool_call.arguments)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "fbeef1c9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", 
+ "text": [ + "ResponseFunctionToolCall(arguments='{\"subreddit_name\":\"MicrosoftFabric\",\"limit\":100}', call_id='call_I18we0h8mrgEstyFyba3D33l', name='get_yesterday_reddit_posts', type='function_call', id='fc_0d029abacfe00c8d006a097fc1d6048195b1890d15350c947e', status='completed')\n" + ] + } + ], + "source": [ + "print(tool_call)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "68f003b2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'subreddit_name': 'MicrosoftFabric', 'limit': 100}\n" + ] + } + ], + "source": [ + "# LLM parsed the request and decided to call the tool with the following arguments\n", + "print(args)" + ] + }, + { + "cell_type": "markdown", + "id": "df9ff324", + "metadata": {}, + "source": [ + "## Step 7: Pass on the tool call arguments to our tool/python function\n", + "\n", + "We now need to pass the arguments received from the model on to our Python function (the tool)." + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "dec267d4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'title': '95 GB data warehouse - Azure SQL DB in Fabric make sense.',\n", + " 'text': \"We're migrating to Fabric from a current setup that is a SQL Server Data warehouse, largely fed by a Synapse Datalake containing Delta-Parquet, Parquet, and CSV files. \\n\\nFor the Datawarehouse part, given the relatively low data volume, sticking with a RDMS is appealing vs. Delta Parquet storage.
\\n\\nThat said, thoughts/ feedback?\",\n", + " 'score': 8,\n", + " 'comments': 13},\n", + " {'title': 'Starting our new Fabric environment - help steer with Fabric Link',\n", + " 'text': 'Hi all,\\n\\nGreat community in here and so good to see Microsoft employees engaging so often with threads.\\n\\nAnyway, I am basically the project lead on landing our new Fabric environment - for context our main requirement is analytics from D365 F&O but we have quite a range of data sources at the moment so will be good to bring everything into the Fabric capacity. We currently use BYOD into Azure SQL Database. I’m a SQL DBA but more than happy to put on many different hats and enjoying this so far especially as it’s effectively from scratch so can implement best practices.\\n\\nI was keen on Fabric Link from the low code aspect, I’m not an admin on the power platform side and have been on screen shares with the admin there trying to sort Fabric Link to no success.\\n\\nOn looking at what was there already, I was concerned that there were already almost 1000 tables in Dataverse without us even considering adding the F&O tables we would need.\\n\\nI was attempting to use the workspace identity for the connection (have added this to system administrator on power platform) but it didn’t show up as an option, only a service principal or organizational account.\\n\\nThe main issue was that the link couldn’t be created due to an unknown error. 
Our Fabric capacity is in UKS but Dataverse is UKWS - I understand that this was previously a restriction but no longer?\\n\\nI’m hoping to start again on Monday with fresh eyes but should I seriously consider Synapse Link instead?\\n\\nThe documentation around this can be patchy and because it’s a fast moving product, what I have been reading may well be out of date!\\n\\nWould appreciate some guidance or pointers, thanks in advance',\n", + " 'score': 3,\n", + " 'comments': 4},\n", + " {'title': 'How to lock a file in #onelake',\n", + " 'text': \"How do you lock a file in OneLake? Turns out it's straightforward \\n \\n OneLake exposes the ADLS Gen2 Lease API, so you get the same acquire/renew/release semantics as any Azure Storage blob. \\n \\nThe API is genuinely pleasant to work with : 60-second leases, infinite leases, conditional headers, all the primitives you'd want. \\n \\nPlenty of other use cases too: any time you need mutual exclusion over a file in the lake without standing up an external lock service. \\n \\nI have used it to harden a ducklake catalog database hosted in the Files section: acquire an exclusive lease before opening the DB for write, renew while the job runs, release on exit. \",\n", + " 'score': 14,\n", + " 'comments': 1},\n", + " {'title': 'May update Power BI gone?',\n", + " 'text': \"Has the May update for Power BI Desktop been pulled? I can't seem to find the blog anymore and the old link doesn't work anymore.\",\n", + " 'score': 6,\n", + " 'comments': 7},\n", + " {'title': 'Is there an alpha testing environments for developers',\n", + " 'text': \"For some time now, I have been taking part in Fabric User Interviews. Just wondering if there's a website where I can register for alpha/beta testing, get access to all the newest features, and give the Microsoft team feedback. 
As my reasoning is testing and learning, I am unable to ask my tenant admin to change the settings whenever a new preview feature is introduced, and in most cases, he may deny my request. Additionally, this makes space for alpha or unreleased versions so that developers can get a hands-on experience.\\n\\n\",\n", + " 'score': 6,\n", + " 'comments': 8}]" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result = get_yesterday_reddit_posts(args['subreddit_name'], args['limit'])\n", + "result" + ] + }, + { + "cell_type": "markdown", + "id": "e1829435", + "metadata": {}, + "source": [ + "## Step 8: Append the response of the tool into the message list" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "cffcaa12", + "metadata": {}, + "outputs": [], + "source": [ + "input_messages.append(tool_call)\n", + "\n", + "input_messages.append({\n", + " \"type\": \"function_call_output\",\n", + " \"call_id\": tool_call.call_id,\n", + " \"output\": str(result)\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1298efe2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[{'content': 'Summarize the Reddit posts from MicrosoftFabric subreddit posted '\n", + " 'yesterday.',\n", + " 'role': 'user'},\n", + " ResponseFunctionToolCall(arguments='{\"subreddit_name\":\"MicrosoftFabric\",\"limit\":100}', call_id='call_I18we0h8mrgEstyFyba3D33l', name='get_yesterday_reddit_posts', type='function_call', id='fc_0d029abacfe00c8d006a097fc1d6048195b1890d15350c947e', status='completed'),\n", + " {'call_id': 'call_I18we0h8mrgEstyFyba3D33l',\n", + " 'output': \"[{'title': '95 GB data warehouse - Azure SQL DB in Fabric make \"\n", + " 'sense.\\', \\'text\\': \"We\\'re migrating to Fabric from a current '\n", + " 'setup that is a SQL Server Data warehouse, largely fed by a '\n", + " 'Synapse Datalake containing Delta-Parquet, Parquet, and CSV 
'\n", + " 'files. \\\\n\\\\nFor the Datawarehouse part, given the relatively low '\n", + " 'data volume, sticking with a RDMS is appealing vs. Delta Parquet '\n", + " 'storage. \\\\n\\\\nThat said, thoughts/ feedback?\", \\'score\\': 8, '\n", + " \"'comments': 13}, {'title': 'Starting our new Fabric environment - \"\n", + " \"help steer with Fabric Link', 'text': 'Hi all,\\\\n\\\\nGreat \"\n", + " 'community in here and so good to see Microsoft employees engaging '\n", + " 'so often with threads.\\\\n\\\\nAnyway, I am basically the project '\n", + " 'lead on landing our new Fabric environment - for context our main '\n", + " 'requirement is analytics from D365 F&O but we have quite a '\n", + " 'range of data sources at the moment so will be good to bring '\n", + " 'everything into the Fabric capacity. We currently use BYOD into '\n", + " 'Azure SQL Database. I’m a SQL DBA but more than happy to put on '\n", + " 'many different hats and enjoying this so far especially as it’s '\n", + " 'effectively from scratch so can implement best practices.\\\\n\\\\nI '\n", + " 'was keen on Fabric Link from the low code aspect, I’m not an '\n", + " 'admin on the power platform side and have been on screen shares '\n", + " 'with the admin there trying to sort Fabric Link to no '\n", + " 'success.\\\\n\\\\nOn looking at what was there already, I was '\n", + " 'concerned that there were already almost 1000 tables in Dataverse '\n", + " 'without us even considering adding the F&O tables we would '\n", + " 'need.\\\\n\\\\nI was attempting to use the workspace identity for the '\n", + " 'connection (have added this to system administrator on power '\n", + " 'platform) but it didn’t show up as an option, only a service '\n", + " 'principal or organizational account.\\\\n\\\\nThe main issue was that '\n", + " 'the link couldn’t be created due to an unknown error. 
Our Fabric '\n", + " 'capacity is in UKS but Dataverse is UKWS - I understand that this '\n", + " 'was previously a restriction but no longer?\\\\n\\\\nI’m hoping to '\n", + " 'start again on Monday with fresh eyes but should I seriously '\n", + " 'consider Synapse Link instead?\\\\n\\\\nThe documentation around this '\n", + " 'can be patchy and because it’s a fast moving product, what I have '\n", + " 'been reading may well be out of date!\\\\n\\\\nWould appreciate some '\n", + " \"guidance or pointers, thanks in advance', 'score': 3, 'comments': \"\n", + " \"4}, {'title': 'How to lock a file in #onelake', 'text': \"\n", + " '\"How do you lock a file in OneLake? Turns out it\\'s '\n", + " 'straightforward \\\\n \\\\n OneLake exposes the ADLS Gen2 Lease '\n", + " 'API, so you get the same acquire/renew/release semantics as any '\n", + " 'Azure Storage blob. \\\\n \\\\nThe API is genuinely pleasant to '\n", + " 'work with : 60-second leases, infinite leases, conditional '\n", + " \"headers, all the primitives you'd want. \\\\n \\\\nPlenty of other \"\n", + " 'use cases too: any time you need mutual exclusion over a file in '\n", + " 'the lake without standing up an external lock service. \\\\n \\\\nI '\n", + " 'have used it to harden a ducklake catalog database hosted in the '\n", + " 'Files section: acquire an exclusive lease before opening the DB '\n", + " 'for write, renew while the job runs, release on exit. \", '\n", + " \"'score': 14, 'comments': 1}, {'title': 'May update Power BI \"\n", + " 'gone?\\', \\'text\\': \"Has the May update for Power BI Desktop been '\n", + " \"pulled? I can't seem to find the blog anymore and the old link \"\n", + " 'doesn\\'t work anymore.\", \\'score\\': 6, \\'comments\\': 7}, '\n", + " \"{'title': 'Is there an alpha testing environments for \"\n", + " 'developers\\', \\'text\\': \"For some time now, I have been taking '\n", + " \"part in Fabric User Interviews. 
Just wondering if there's a \"\n", + " 'website where I can register for alpha/beta testing, get access '\n", + " 'to all the newest features, and give the Microsoft team feedback. '\n", + " 'As my reasoning is testing and learning, I am unable to ask my '\n", + " 'tenant admin to change the settings whenever a new preview '\n", + " 'feature is introduced, and in most cases, he may deny my request. '\n", + " 'Additionally, this makes space for alpha or unreleased versions '\n", + " 'so that developers can get a hands-on experience.\\\\n\\\\n\", '\n", + " \"'score': 6, 'comments': 8}]\",\n", + " 'type': 'function_call_output'}]\n" + ] + } + ], + "source": [ + "# Pretty-print the messages to see the input messages\n", + "from pprint import pprint\n", + "pprint(input_messages)" + ] + }, + { + "cell_type": "markdown", + "id": "09a76b0c", + "metadata": {}, + "source": [ + "## Step 9: Pass the message list into the model" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "2444c531", + "metadata": {}, + "outputs": [], + "source": [ + "response_2 = client.responses.create(\n", + " model=\"gpt-4.1-nano\",\n", + " input=input_messages,\n", + " tools=tools\n", + ") " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2fcba22f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Here is a summary of the Reddit posts from the MicrosoftFabric subreddit posted yesterday:\n", + "\n", + "1. A discussion about the data size and storage options in Fabric, specifically regarding a 95 GB data warehouse and whether to use Azure SQL DB or Delta Parquet storage.\n", + "2. A post from a project lead sharing their experience and challenges in setting up a new Fabric environment, especially related to Fabric Link and Dataverse integration.\n", + "3.
An informational post explaining how to lock files in OneLake using the ADLS Gen2 Lease API, emphasizing its usefulness for mutual exclusion over files.\n", + "4. A query about the status of the May update for Power BI Desktop, with concerns about whether it was pulled.\n", + "5. A question from a developer about availability of alpha/beta testing environments for new Fabric features, seeking ways to access unreleased versions for testing and feedback.\n", + "\n", + "Would you like a more detailed summary or insights into any specific post?\n" + ] + } + ], + "source": [ + "# Here we expect LLM to provide the summary of the posts retrieved from the subreddit specified in the user message\n", + "print(response_2.output_text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3f2b735c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'id': 'resp_0d029abacfe00c8d006a0980970f3081958159e2da4269c850',\n", + " 'created_at': 1779007639.0,\n", + " 'error': None,\n", + " 'incomplete_details': None,\n", + " 'instructions': None,\n", + " 'metadata': {},\n", + " 'model': 'gpt-4.1-nano-2025-04-14',\n", + " 'object': 'response',\n", + " 'output': [ResponseOutputMessage(id='msg_0d029abacfe00c8d006a09809755148195b5fb00dc86a268e7', content=[ResponseOutputText(annotations=[], text='Here is a summary of the Reddit posts from the MicrosoftFabric subreddit posted yesterday:\\n\\n1. A discussion about the data size and storage options in Fabric, specifically regarding a 95 GB data warehouse and whether to use Azure SQL DB or Delta Parquet storage.\\n2. A post from a project lead sharing their experience and challenges in setting up a new Fabric environment, especially related to Fabric Link and Dataverse integration.\\n3. An informational post explaining how to lock files in OneLake using the ADLS Gen2 Lease API, emphasizing its usefulness for mutual exclusion over files.\\n4. 
A query about the status of the May update for Power BI Desktop, with concerns about whether it was pulled.\\n5. A question from a developer about availability of alpha/beta testing environments for new Fabric features, seeking ways to access unreleased versions for testing and feedback.\\n\\nWould you like a more detailed summary or insights into any specific post?', type='output_text', logprobs=[])], role='assistant', status='completed', type='message')],\n", + " 'parallel_tool_calls': True,\n", + " 'temperature': 1.0,\n", + " 'tool_choice': 'auto',\n", + " 'tools': [FunctionTool(name='get_yesterday_reddit_posts', parameters={'type': 'object', 'properties': {'subreddit_name': {'type': 'string'}, 'limit': {'type': 'integer', 'default': 100}}, 'required': ['subreddit_name', 'limit'], 'additionalProperties': False}, strict=True, type='function', description=\"Fetch yesterday's Reddit posts from a specific subreddit\")],\n", + " 'top_p': 1.0,\n", + " 'background': False,\n", + " 'conversation': None,\n", + " 'max_output_tokens': None,\n", + " 'max_tool_calls': None,\n", + " 'previous_response_id': None,\n", + " 'prompt': None,\n", + " 'prompt_cache_key': None,\n", + " 'prompt_cache_retention': 'in_memory',\n", + " 'reasoning': Reasoning(effort=None, generate_summary=None, summary=None),\n", + " 'safety_identifier': None,\n", + " 'service_tier': 'default',\n", + " 'status': 'completed',\n", + " 'text': ResponseTextConfig(format=ResponseFormatText(type='text'), verbosity='medium'),\n", + " 'top_logprobs': 0,\n", + " 'truncation': 'disabled',\n", + " 'usage': ResponseUsage(input_tokens=989, input_tokens_details=InputTokensDetails(cached_tokens=0), output_tokens=185, output_tokens_details=OutputTokensDetails(reasoning_tokens=0), total_tokens=1174),\n", + " 'user': None,\n", + " '_request_id': 'req_28b2333c430b411db4f692b9c39f1b68'}" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#'output': 
[ResponseOutputMessage\n", + "response_2.__dict__" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 3f7a3623a7422c18ebe7dfbf25f381682e1b11fb Mon Sep 17 00:00:00 2001 From: Tatiana <74679787+ailinnesse@users.noreply.github.com> Date: Mon, 18 May 2026 22:31:14 +0100 Subject: [PATCH 4/4] My Reddit summarization as an app for the Lab4 and deployment to Hugging Face --- .../tatiana_patrusheva/app.py | 540 ++++++++ .../tatiana_patrusheva/lab4.ipynb | 1233 +++++++++++++++++ .../tatiana_patrusheva/requirements.txt | 4 + .../tatiana_patrusheva/utils.py | 289 ++++ 4 files changed, 2066 insertions(+) create mode 100644 part1-fundementals/community-contributions/tatiana_patrusheva/app.py create mode 100644 part1-fundementals/community-contributions/tatiana_patrusheva/lab4.ipynb create mode 100644 part1-fundementals/community-contributions/tatiana_patrusheva/requirements.txt create mode 100644 part1-fundementals/community-contributions/tatiana_patrusheva/utils.py diff --git a/part1-fundementals/community-contributions/tatiana_patrusheva/app.py b/part1-fundementals/community-contributions/tatiana_patrusheva/app.py new file mode 100644 index 0000000..5281d0f --- /dev/null +++ b/part1-fundementals/community-contributions/tatiana_patrusheva/app.py @@ -0,0 +1,540 @@ +import os +from datetime import datetime, timedelta, timezone + +import requests +import gradio as gr +from dotenv import load_dotenv +from openai import OpenAI + + +# ------------------------------------------------------------ +# Environment setup +# ------------------------------------------------------------ + +load_dotenv() + 
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") + +if not OPENAI_API_KEY: + raise ValueError( + "OPENAI_API_KEY is not set. Add it to your .env file or deployment secrets." + ) + +openai_client = OpenAI(api_key=OPENAI_API_KEY) + + +# ------------------------------------------------------------ +# Reddit post retrieval +# ------------------------------------------------------------ + +def subreddit_search( + subreddit: str, + date: str = "yesterday", + limit: int = 100 +) -> str: + """ + Fetches Reddit posts from a specific subreddit and returns posts from the requested date, + including title, text, score, number of comments, creation time, and URL. + + Args: + subreddit (str): Subreddit name without r/, for example "MicrosoftFabric". + date (str): Date to search for. Supports "yesterday" or YYYY-MM-DD. + limit (int): Number of latest posts to check. Uses pagination if limit > 100. + + Returns: + str: A formatted string containing matching Reddit posts. + """ + + subreddit = subreddit.replace("r/", "").strip() + + if not subreddit: + return "No subreddit was provided." + + # Reddit returns timestamps in UTC, so we compare dates in UTC. + now = datetime.now(timezone.utc) + + if date.lower().strip() == "yesterday": + target_start = datetime( + year=now.year, + month=now.month, + day=now.day, + tzinfo=timezone.utc + ) - timedelta(days=1) + else: + target_start = datetime.strptime(date.strip(), "%Y-%m-%d").replace( + tzinfo=timezone.utc + ) + + target_end = target_start + timedelta(days=1) + + headers = { + # User-Agent helps Reddit identify your script. + # It is not authentication, but Reddit may reject requests without it. + "User-Agent": "agentic-ai-lab-subreddit-summarizer/0.1 by Tatiana" + } + + matching_posts = [] + fetched_count = 0 + after = None + + # Reddit usually returns max 100 posts per request. + # This loop allows the app to check more than 100 posts if the user increases the limit. 
+ while fetched_count < limit: + batch_size = min(100, limit - fetched_count) + + url = f"https://www.reddit.com/r/{subreddit}/new.json?limit={batch_size}" + + if after: + url += f"&after={after}" + + response = requests.get(url, headers=headers, timeout=20) + + # Give a clearer message for common Reddit/API issues. + if response.status_code == 404: + return f"Subreddit r/{subreddit} was not found." + if response.status_code == 403: + return f"Access to r/{subreddit} is forbidden. The subreddit may be private or restricted." + if response.status_code == 429: + return "Reddit rate limit reached. Please wait a bit and try again." + + response.raise_for_status() + + data = response.json() + children = data.get("data", {}).get("children", []) + + if not children: + break + + for item in children: + post = item.get("data", {}) + + post_time = datetime.fromtimestamp( + post.get("created_utc", 0), + tz=timezone.utc + ) + + if target_start <= post_time < target_end: + matching_posts.append({ + "title": post.get("title", ""), + "text": post.get("selftext", ""), + "score": post.get("score", 0), + "num_comments": post.get("num_comments", 0), + "url": "https://www.reddit.com" + post.get("permalink", ""), + "created_utc": post_time.strftime("%Y-%m-%d %H:%M UTC") + }) + + fetched_count += len(children) + after = data.get("data", {}).get("after") + + if not after: + break + + if not matching_posts: + return f"No posts found in r/{subreddit} for {date}." + + # Sort by discussion level first, then by score. + matching_posts = sorted( + matching_posts, + key=lambda post: (post["num_comments"], post["score"]), + reverse=True + ) + + formatted_posts = [] + + for i, post in enumerate(matching_posts, start=1): + # Truncate very long post text to avoid sending too much text to OpenAI. + text = post["text"] + + if len(text) > 1000: + text = text[:1000] + "... 
[truncated]" + + formatted_posts.append( + f""" +Post {i} +Title: {post['title']} +Text: {text} +Score: {post['score']} +Comments: {post['num_comments']} +Created: {post['created_utc']} +URL: {post['url']} +""" + ) + + return "\n\n".join(formatted_posts) + + +# ------------------------------------------------------------ +# OpenAI summarization +# ------------------------------------------------------------ + +def summarize_subreddit_posts( + subreddit: str, + date: str = "yesterday", + preferences: str = "", + limit: int = 100 +) -> str: + """ + Retrieves Reddit posts from a subreddit and summarizes them using OpenAI. + """ + + posts = subreddit_search( + subreddit=subreddit, + date=date, + limit=limit + ) + + if ( + posts.startswith("No posts found") + or posts.startswith("Subreddit") + or posts.startswith("Access") + or posts.startswith("Reddit rate limit") + or posts.startswith("No subreddit") + ): + return posts + + # Avoid sending extremely large text to the model. + max_chars = 30000 + + if len(posts) > max_chars: + posts = posts[:max_chars] + "\n\n[Additional posts were truncated to keep the summary within token limits.]" + + system_message = """ +You are a Reddit subreddit summarization assistant. + +Your job is to summarize Reddit posts retrieved from a subreddit. + +Rules: +- Summarize only the information provided in the retrieved posts. +- Do not invent posts, comments, links, opinions, or trends. +- If the available posts are limited, mention that clearly. +- Keep the summary concise, structured, and easy to read. +- Use the user's preferences only to decide what to emphasize in the summary. +- Give more attention to posts with higher comment counts and higher scores. +""" + + user_message = f""" +Subreddit: r/{subreddit} +Date: {date} +User preferences for the summary: {preferences} + +Retrieved posts: +{posts} + +Please provide: +1. A short overall summary +2. Main topics discussed +3. Most discussed posts +4. Posts with notable scores +5. 
Repeated issues, questions, or themes +6. A brief takeaway + +When relevant, emphasize the user's preferences: +{preferences} + +Be concise and clear. +Only return the final subreddit summary. +Do not explain the tool call process. +""" + + openai_response = openai_client.chat.completions.create( + model="gpt-4o-mini", + messages=[ + {"role": "system", "content": system_message}, + {"role": "user", "content": user_message} + ] + ) + + return openai_response.choices[0].message.content + + +# ------------------------------------------------------------ +# Gradio styling +# ------------------------------------------------------------ + +custom_css = """ +:root { + --reddit-orange: #ff4500; + --reddit-orange-dark: #d93a00; + --reddit-bg: #fff7f3; + --reddit-card: #ffffff; + --reddit-text: #1c1c1c; + --reddit-muted: #6b7280; +} + +.gradio-container { + background: linear-gradient(135deg, #fff7f3 0%, #fff1eb 45%, #ffffff 100%) !important; + font-family: Inter, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif; +} + +#app-container { + max-width: 980px; + margin: 0 auto; +} + +#hero { + background: linear-gradient(135deg, #ff4500 0%, #ff7a1a 100%); + color: white; + padding: 32px; + border-radius: 24px; + box-shadow: 0 18px 40px rgba(255, 69, 0, 0.22); + margin-bottom: 24px; +} + +#hero h1 { + font-size: 38px; + margin-bottom: 8px; +} + +#hero p { + font-size: 17px; + opacity: 0.95; + margin-bottom: 0; +} + +.input-card, .output-card { + background: white; + border-radius: 22px; + padding: 22px; + box-shadow: 0 12px 30px rgba(17, 24, 39, 0.08); + border: 1px solid rgba(255, 69, 0, 0.12); +} + +#submit-btn { + background: linear-gradient(135deg, #ff4500 0%, #ff7a1a 100%) !important; + color: white !important; + border: none !important; + border-radius: 16px !important; + font-weight: 700 !important; + font-size: 16px !important; + padding: 12px 18px !important; + box-shadow: 0 10px 22px rgba(255, 69, 0, 0.28) !important; +} + +#submit-btn:hover { + 
background: linear-gradient(135deg, #d93a00 0%, #ff6500 100%) !important; + transform: translateY(-1px); +} + +#clear-btn { + border-radius: 16px !important; +} + +.gr-textbox textarea, +.gr-textbox input { + border-radius: 14px !important; +} + +.gr-slider { + border-radius: 14px !important; +} + +#tips { + background: #fff1eb; + border-left: 5px solid #ff4500; + padding: 14px 18px; + border-radius: 16px; + color: #3a1d12; + margin-top: 12px; +} + +footer { + visibility: hidden; +} +""" + + +# ------------------------------------------------------------ +# Gradio wrapper +# ------------------------------------------------------------ + +def gradio_summarize_subreddit(subreddit, date, preferences, limit): + """ + Wrapper function used by the Gradio interface. + Handles user input validation and returns the final summary. + """ + + try: + if not subreddit or not subreddit.strip(): + return "Please enter a subreddit name, for example `MicrosoftFabric`." + + if not date or not date.strip(): + date = "yesterday" + + return summarize_subreddit_posts( + subreddit=subreddit, + date=date, + preferences=preferences, + limit=int(limit) + ) + + except ValueError as e: + return f""" +### Date format issue + +Please use either: + +- `yesterday` +- `YYYY-MM-DD`, for example `2026-05-18` + +Error details: `{str(e)}` +""" + + except Exception as e: + return f""" +### Something went wrong + +Error details: + +`{str(e)}` +""" + + +# ------------------------------------------------------------ +# Gradio app +# ------------------------------------------------------------ + +with gr.Blocks( + css=custom_css, + title="Reddit Subreddit Summarizer", + theme=gr.themes.Soft( + primary_hue="orange", + secondary_hue="red", + neutral_hue="slate" + ) +) as demo: + + with gr.Column(elem_id="app-container"): + + gr.HTML( + """ +
    +

    🔥 Reddit Subreddit Summarizer

    +

    + Pick a subreddit, choose a date, and get a concise AI-powered summary of the most discussed posts. +

    +
    + """ + ) + + with gr.Row(): + + with gr.Column(scale=1, elem_classes="input-card"): + gr.Markdown("## Search settings") + + subreddit_input = gr.Textbox( + label="Subreddit", + value="MicrosoftFabric", + placeholder="Example: MicrosoftFabric, PowerBI, datascience", + info="Enter the subreddit name without r/" + ) + + date_input = gr.Textbox( + label="Date", + value="yesterday", + placeholder="yesterday or 2026-05-18", + info="Use 'yesterday' or a date in YYYY-MM-DD format" + ) + + preferences_input = gr.Textbox( + label="Summary preferences", + value="Focus on Power BI, Fabric, semantic models, data engineering, common issues, and highly discussed posts.", + placeholder="Example: focus on technical issues, questions, complaints, tutorials, or highly discussed posts", + lines=5, + info="These preferences are used only by OpenAI when creating the summary" + ) + + limit_input = gr.Slider( + label="Number of latest posts to check", + minimum=10, + maximum=500, + value=100, + step=10, + info="Higher values check more posts but may take longer" + ) + + with gr.Row(): + submit_btn = gr.Button( + "Summarize Subreddit 🚀", + elem_id="submit-btn", + scale=2 + ) + + clear_btn = gr.ClearButton( + components=[ + subreddit_input, + date_input, + preferences_input + ], + value="Clear", + elem_id="clear-btn", + scale=1 + ) + + gr.HTML( + """ +
    + Tip: If the subreddit is very active, increase the post limit to 300–500. + If it is quiet, 100 is usually enough. +
    + """ + ) + + with gr.Column(scale=2, elem_classes="output-card"): + gr.Markdown("## Summary") + + output = gr.Markdown( + value="Your subreddit summary will appear here.", + label="Subreddit Summary" + ) + + gr.Examples( + examples=[ + [ + "MicrosoftFabric", + "yesterday", + "Focus on Power BI, Fabric, semantic models, data engineering, common issues, and highly discussed posts.", + 100 + ], + [ + "PowerBI", + "yesterday", + "Focus on user problems, dashboard performance, DAX, semantic models, and practical tips.", + 200 + ], + [ + "datascience", + "yesterday", + "Focus on career advice, project ideas, machine learning, and beginner questions.", + 200 + ] + ], + inputs=[ + subreddit_input, + date_input, + preferences_input, + limit_input + ], + label="Try an example" + ) + + submit_btn.click( + fn=gradio_summarize_subreddit, + inputs=[ + subreddit_input, + date_input, + preferences_input, + limit_input + ], + outputs=output, + show_progress="full" + ) + + +# Queue enables better request handling and visible loading/progress behaviour. 
+demo.queue() + +if __name__ == "__main__": + demo.launch( + server_name="0.0.0.0", + server_port=int(os.getenv("PORT", 7860)) + ) \ No newline at end of file diff --git a/part1-fundementals/community-contributions/tatiana_patrusheva/lab4.ipynb b/part1-fundementals/community-contributions/tatiana_patrusheva/lab4.ipynb new file mode 100644 index 0000000..7d0e235 --- /dev/null +++ b/part1-fundementals/community-contributions/tatiana_patrusheva/lab4.ipynb @@ -0,0 +1,1233 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "8696980e", + "metadata": {}, + "source": [ + "# # Summarizing Reddit Subreddit Posts\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "d6a81668", + "metadata": {}, + "source": [ + "## Step 1: Import libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "6933b082", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from tavily import TavilyClient\n", + "from dotenv import load_dotenv\n", + "import json\n", + "from openai import OpenAI\n", + "from utils import function_to_tool\n", + "from IPython.display import display, Markdown\n", + "import gradio as gr\n", + "from datetime import datetime, timedelta, timezone\n", + "import requests\n", + "\n", + "load_dotenv()\n", + "\n", + "TAVILY_API_KEY = os.getenv(\"TAVILY_API_KEY\")\n", + "if not TAVILY_API_KEY:\n", + " raise ValueError(\"TAVILY_API_KEY is not set in the environment variables.\")\n", + "\n", + "OPENAI_API_KEY = os.getenv(\"OPENAI_API_KEY\")\n", + "if not OPENAI_API_KEY:\n", + " raise ValueError(\"OPENAI_API_KEY is not set in the environment variables.\")\n", + "\n", + "tavily_client = TavilyClient()\n", + "openai_client = OpenAI()" + ] + }, + { + "cell_type": "markdown", + "id": "eb4f6b09", + "metadata": {}, + "source": [ + "You can setup your API key here: **[Tavily API Key](https://app.tavily.com/home)**" + ] + }, + { + "cell_type": "markdown", + "id": "3d619380", + "metadata": {}, + "source": [ + "## Step 2: Define our tools\n", + 
"\n", + "**The tools to define:**\n", + "\n", + "1. Flight Search Tool\n", + "2. Hotel Search Tool" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "a6fafc54", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"query\": \"Summarize the MicrosoftFabric subreddit posts from yesterday.\",\n", + " \"follow_up_questions\": null,\n", + " \"answer\": null,\n", + " \"images\": [],\n", + " \"results\": [\n", + " {\n", + " \"url\": \"https://www.reddit.com/r/dataengineering/comments/1jvwwcy/tried_to_roll_out_microsoft_fabric_ended_up/\",\n", + " \"title\": \"Tried to roll out Microsoft Fabric\\u2026 ended up rolling straight into a ...\",\n", + " \"content\": \"Yesterday morning, all capacity in a Microsoft Fabric production environment was completely drained \\u2014 and it's only April. What happened?\",\n", + " \"score\": 0.6519982,\n", + " \"raw_content\": null\n", + " },\n", + " {\n", + " \"url\": \"https://www.reddit.com/r/MicrosoftFabric/comments/1nib2ov/fabric_september_2025_feature_summary_microsoft/\",\n", + " \"title\": \"Fabric September 2025 Feature Summary | Microsoft Fabric Blog\",\n", + " \"content\": \"Great great updates! A lot of things to dig into. Also the new tabbed experience is perfect. Also happy to see schema in lakehouse makes it\",\n", + " \"score\": 0.6301622,\n", + " \"raw_content\": null\n", + " },\n", + " {\n", + " \"url\": \"https://www.reddit.com/r/MicrosoftFabric/comments/1kswg0n/breaking_changes_in_fabric_microsoft_what_did_you/\",\n", + " \"title\": \"Breaking changes in Fabric - Microsoft what did you ship this week?\",\n", + " \"content\": \"I'm drowning this week in issues in our Fabric production environment on F64 this week. 
They started yesterday.\",\n", + " \"score\": 0.6097339,\n", + " \"raw_content\": null\n", + " },\n", + " {\n", + " \"url\": \"https://www.reddit.com/r/MicrosoftFabric/comments/1ml0o6i/fabric_capacity_metrics_multi_metric_ribbon_chart/\",\n", + " \"title\": \"Fabric Capacity Metrics - Multi metric ribbon chart Not Showing Today\",\n", + " \"content\": \"Aren't all the charts showing up to yesterday ... r/MicrosoftFabric - Fabric March 2026 Feature Summary | Microsoft Fabric Blog | Microsoft Fabric.\",\n", + " \"score\": 0.59401023,\n", + " \"raw_content\": null\n", + " },\n", + " {\n", + " \"url\": \"https://www.reddit.com/r/MicrosoftFabric/comments/1jormwt/fabric_keynote_initial_thoughts_and_followup_blogs/\",\n", + " \"title\": \"Fabric keynote initial thoughts and followup blogs : r/MicrosoftFabric\",\n", + " \"content\": \"As part of a series this week, here's my initial thoughts on the main announcements from yesterday's keynote. I'll be following up with more\",\n", + " \"score\": 0.56690645,\n", + " \"raw_content\": null\n", + " }\n", + " ],\n", + " \"response_time\": 0.86,\n", + " \"request_id\": \"90209090-89bf-4f14-99a0-8b602f8459c9\"\n", + "}\n" + ] + } + ], + "source": [ + "query = \"Summarize the MicrosoftFabric subreddit posts from yesterday.\"\n", + "\n", + "response = tavily_client.search(\n", + " query=query,\n", + " include_domains=[\"reddit.com\"]\n", + ")\n", + "\n", + "print(json.dumps(response, indent=2))" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "4cd7550a", + "metadata": {}, + "outputs": [], + "source": [ + "def subreddit_search(query: str) -> str:\n", + " \"\"\"\n", + " Searches for posts in a Reddit subreddit based on the provided query\n", + "\n", + " Args:\n", + " query (str): The search query for subreddit posts\n", + "\n", + " Returns:\n", + " str: A formatted string containing the search results\n", + " \n", + " \"\"\"\n", + "\n", + " response = tavily_client.search(\n", + " query=query,\n", + " 
include_domains=[\"reddit.com\"]\n", + " )\n", + "\n", + " results = response.get(\"results\", [])\n", + "\n", + " # extract items\n", + " contents = [item.get(\"content\", \"\") for item in results]\n", + "\n", + " # Format as a numbered list\n", + " formatted_contents = \"\\n\".join(f\"{i + 1}. {content}\" for i, content in enumerate(contents) if content)\n", + "\n", + " return formatted_contents" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "561ac014", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1. I keep seeing multiple posts here and in r/PowerBI asking the same things about PL-300 and DP-600/DP-700 that can be summarized \"how did you\n", + "2. Anyone that works with data knows one thing - whats important, is reliability. That's it. If something does not work - thats completely fine.\n", + "3. New post where I want to encourage others to think about their Microsoft Fabric Continuous Integration maturity levels.\n", + "4. The new Fabric community blogs are now available. Which you can access by clicking the link below: Fabric community blogs - Microsoft Fabric Community.\n", + "5. Welcome u/aleonard763 !!! 
love getting more and more of our Data Factory community experts jumping into the forums :).\n" + ] + } + ], + "source": [ + "print(subreddit_search(query))" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "a945868d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "query = \"What is the most commented post in the MicrosoftFabric subreddit from yesterday and what are the comments about?\"\n", + "\n", + "print(subreddit_search(query))" + ] + }, + { + "cell_type": "markdown", + "id": "456d3f02", + "metadata": {}, + "source": [ + "## Step 3: Define our tools schema" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "6b083bf0", + "metadata": {}, + "outputs": [], + "source": [ + "subreddit_tool_schema = function_to_tool(subreddit_search)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "4a0f013b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'type': 'function',\n", + " 'name': 'subreddit_search',\n", + " 'description': 'Searches for posts in a Reddit subreddit based on the provided query',\n", + " 'parameters': {'type': 'object',\n", + " 'properties': {'query': {'type': 'string',\n", + " 'description': 'The search query for subreddit posts'}},\n", + " 'required': ['query']}}" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "subreddit_tool_schema" + ] + }, + { + "cell_type": "markdown", + "id": "463f36cc", + "metadata": {}, + "source": [ + "## Step 4: Define a Prompt Template\n", + "\n", + "Prompt templates are discussed in our [Prompt Engineering course](https://github.com/SuperDataScience-Community/prompt-engineering) specifically in the notebook for [multi-shot prompting](https://github.com/SuperDataScience-Community/prompt-engineering/blob/main/prompt-engineering-techniques/multi-shot-prompting.ipynb)" + ] + }, + { + "cell_type": "code", + 
"execution_count": 9, + "id": "9a2020c3", + "metadata": {}, + "outputs": [], + "source": [ + "# Class to define prompt template\n", + "class PromptTemplate:\n", + " def __init__(self, template: str, input_variables: list[str]):\n", + " self.template = template\n", + " self.input_variables = input_variables\n", + "\n", + " def generate(self, **kwargs) -> str:\n", + " return self.template.format(**{k: kwargs[k] for k in self.input_variables})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "56cdfc38", + "metadata": {}, + "outputs": [], + "source": [ + "prompt = PromptTemplate(\n", + " template=\"I want to know about posts in {subreddit} from {date}. the kinds of posts I prefer are {preferences}\",\n", + " input_variables=[\"subreddit\", \"date\", \"preferences\"]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "27e67bde", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'I want to know about posts in MicrosoftFabric from 2026-05-18. the kinds of posts I prefer are most commented'" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "prompt.generate(subreddit=\"MicrosoftFabric\", date=\"2026-05-18\", preferences=\"most commented\")" + ] + }, + { + "cell_type": "markdown", + "id": "d68b134f", + "metadata": {}, + "source": [ + "## Step 5: Call the OpenAI Responses API" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "702abd0b", + "metadata": {}, + "outputs": [], + "source": [ + "system_message = \"\"\"\n", + "You are a Reddit subreddit summarization assistant.\n", + "\n", + "The user will ask for a summary of posts from a specific Reddit subreddit, usually for a specific date such as yesterday.\n", + "\n", + "Your job is to:\n", + "\n", + "1. Use the subreddit search tool to retrieve Reddit posts related to the user's request.\n", + "2. Summarize only the information returned by the tool.\n", + "3. 
Do not invent posts, comments, links, trends, or opinions that are not present in the tool results.\n", + "4. If the tool returns little or no useful information, clearly say that the available results were limited.\n", + "5. Keep the summary concise, structured, and easy to read.\n", + "6. Focus on the user's stated preferences if they provide any.\n", + "7. Do not ask follow-up questions. Use the information given.\n", + "\n", + "Your final response should include:\n", + "\n", + "- A short overall summary\n", + "- Main topics discussed\n", + "- Notable posts or themes\n", + "- Repeated questions, issues, or complaints\n", + "- A brief takeaway\n", + "\n", + "Only return the final subreddit summary. Do not explain the tool call process.\n", + "Always call the subreddit search tool before answering.\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "db0dd7f5", + "metadata": {}, + "outputs": [], + "source": [ + "# user prompt\n", + "\n", + "user_prompt = prompt.generate(\n", + " subreddit=\"MicrosoftFabric\",\n", + " date=\"2026-05-18\",\n", + " preferences=\"most commented\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "2371f99e", + "metadata": {}, + "outputs": [], + "source": [ + "# input list\n", + "input_list = [\n", + " {\"role\": \"system\", \"content\": system_message},\n", + " {\"role\": \"user\", \"content\": user_prompt}\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "c3484bf0", + "metadata": {}, + "outputs": [], + "source": [ + "# response\n", + "\n", + "response = openai_client.responses.create(\n", + " model=\"gpt-4.1-nano\",\n", + " input=input_list,\n", + " tools=[subreddit_tool_schema],\n", + " tool_choice=\"auto\",\n", + " parallel_tool_calls=False\n", + ")\n", + "\n", + "if response.output[0].type == \"message\":\n", + " input_list.append({\"role\": \"assistant\", \"content\": response.output_text})\n", + "if response.output[0].type == 
\"function_call\":\n", + " input_list += response.output\n" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "3d228233", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'role': 'system',\n", + " 'content': \"\\nYou are a Reddit subreddit summarization assistant.\\n\\nThe user will ask for a summary of posts from a specific Reddit subreddit, usually for a specific date such as yesterday.\\n\\nYour job is to:\\n\\n1. Use the subreddit search tool to retrieve Reddit posts related to the user's request.\\n2. Summarize only the information returned by the tool.\\n3. Do not invent posts, comments, links, trends, or opinions that are not present in the tool results.\\n4. If the tool returns little or no useful information, clearly say that the available results were limited.\\n5. Keep the summary concise, structured, and easy to read.\\n6. Focus on the user's stated preferences if they provide any.\\n7. Do not ask follow-up questions. Use the information given.\\n\\nYour final response should include:\\n\\n- A short overall summary\\n- Main topics discussed\\n- Notable posts or themes\\n- Repeated questions, issues, or complaints\\n- A brief takeaway\\n\\nOnly return the final subreddit summary. Do not explain the tool call process.\\nAlways call the subreddit search tool before answering.\\n\"},\n", + " {'role': 'user',\n", + " 'content': 'I want to know about posts in MicrosoftFabric from 2026-05-18. 
the kinds of posts I prefer are most commented'},\n", + " ResponseFunctionToolCall(arguments='{\"query\":\"from:2026-05-18\"}', call_id='call_oklaHkN0AZPWuAgKSrEWnWn5', name='subreddit_search', type='function_call', id='fc_02755207282872f3006a0b7ad8279c819689e05e91ac55b135', status='completed')]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# print response\n", + "input_list" + ] + }, + { + "cell_type": "markdown", + "id": "50fe86ea", + "metadata": {}, + "source": [ + "## Step 6: Handle tools calls" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "b02e4cc0", + "metadata": {}, + "outputs": [], + "source": [ + "def call_function(name, args):\n", + " if name == \"subreddit_search\":\n", + " return subreddit_search(**args)\n", + " else:\n", + " raise ValueError(f\"Unknown function: {name}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "258f3ee7", + "metadata": {}, + "outputs": [], + "source": [ + "name = response.output[0].name\n", + "args = json.loads(response.output[0].arguments)\n", + "\n", + "result = call_function(name, args)\n", + "\n", + "input_list.append({\n", + " 'type': 'function_call_output',\n", + " 'call_id': response.output[0].call_id,\n", + " 'output': str(result)\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "bc80fc4b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'role': 'system',\n", + " 'content': \"\\nYou are a Reddit subreddit summarization assistant.\\n\\nThe user will ask for a summary of posts from a specific Reddit subreddit, usually for a specific date such as yesterday.\\n\\nYour job is to:\\n\\n1. Use the subreddit search tool to retrieve Reddit posts related to the user's request.\\n2. Summarize only the information returned by the tool.\\n3. Do not invent posts, comments, links, trends, or opinions that are not present in the tool results.\\n4. 
If the tool returns little or no useful information, clearly say that the available results were limited.\\n5. Keep the summary concise, structured, and easy to read.\\n6. Focus on the user's stated preferences if they provide any.\\n7. Do not ask follow-up questions. Use the information given.\\n\\nYour final response should include:\\n\\n- A short overall summary\\n- Main topics discussed\\n- Notable posts or themes\\n- Repeated questions, issues, or complaints\\n- A brief takeaway\\n\\nOnly return the final subreddit summary. Do not explain the tool call process.\\nAlways call the subreddit search tool before answering.\\n\"},\n", + " {'role': 'user',\n", + " 'content': 'I want to know about posts in MicrosoftFabric from 2026-05-18. the kinds of posts I prefer are most commented'},\n", + " ResponseFunctionToolCall(arguments='{\"query\":\"from:2026-05-18\"}', call_id='call_oklaHkN0AZPWuAgKSrEWnWn5', name='subreddit_search', type='function_call', id='fc_02755207282872f3006a0b7ad8279c819689e05e91ac55b135', status='completed'),\n", + " {'type': 'function_call_output',\n", + " 'call_id': 'call_oklaHkN0AZPWuAgKSrEWnWn5',\n", + " 'output': \"1. May 18th, 2026 is the 10th Anniversary Day for BAND-MAID's Major Debut. Our maids are posting/celebrating on X/Twitter about the anniversary\\n2. 2026-05-18 (The Infinite Hotel). Self-Promotion. Today's little play, making much use of generative plugins, source over here : https://github\\n3. https://craftword.game May 18, 2026 Round 1: gyms → gems______ → megs______ → mews (-1 ) Round 2: tapping → tupping___ → pupping___\\n4. This thread is for any and all basic gameplay questions and technical issues you may have in order to prevent the subreddit from being\\n5. Hello everyone, welcome to the No Stupid Questions thread. 
The only stupid questions are the ones left unasked.\"}]" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "input_list" + ] + }, + { + "cell_type": "markdown", + "id": "b3fabc6c", + "metadata": {}, + "source": [ + "## Step 7: The App Logic" + ] + }, + { + "cell_type": "markdown", + "id": "137cf049", + "metadata": {}, + "source": [ + "
    \n", + " Info:\n", + " \n", + " In the video lectures, we saw that our Agent, Atlas was not following the example itinerary properly. This is probably due to the smaller sized model we are using. Try replacing that model with gpt-4o, gpt-5 or if you want to continue with cheaper models, gpt-4o-mini all of which are great at tool use (with gpt-5 obviously outperforming all models in agentic use cases).\n", + " \n", + "
    💡
    \n", + "
    " + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "7741b87a", + "metadata": {}, + "outputs": [], + "source": [ + "subreddit_tool_schema = {\n", + " \"type\": \"function\",\n", + " \"function\": {\n", + " \"name\": \"subreddit_search\",\n", + " \"description\": \"Searches for Reddit posts from a specific subreddit and date using Tavily.\",\n", + " \"parameters\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"subreddit\": {\n", + " \"type\": \"string\",\n", + " \"description\": \"The subreddit name without r/, for example MicrosoftFabric\"\n", + " },\n", + " \"date\": {\n", + " \"type\": \"string\",\n", + " \"description\": \"The date or time period to search for, for example yesterday or 2026-05-18\"\n", + " },\n", + " \"preferences\": {\n", + " \"type\": \"string\",\n", + " \"description\": \"Optional user preferences for the kinds of posts to focus on\"\n", + " }\n", + " },\n", + " \"required\": [\"subreddit\", \"date\", \"preferences\"],\n", + " \"additionalProperties\": False\n", + " }\n", + " }\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "253a008a", + "metadata": {}, + "outputs": [], + "source": [ + "def get_response(input_list):\n", + " response = openai_client.responses.create(\n", + " model=\"gpt-4.1-nano\",\n", + " input=input_list,\n", + " tools=[subreddit_tool_schema],\n", + " tool_choice=\"auto\",\n", + " parallel_tool_calls=False\n", + " )\n", + " return response" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "aea57614", + "metadata": {}, + "outputs": [], + "source": [ + "def subreddit_search(\n", + " subreddit: str,\n", + " date: str = \"yesterday\",\n", + " limit: int = 100\n", + ") -> str:\n", + " \"\"\"\n", + " Fetches Reddit posts from a specific subreddit and returns posts from the requested date,\n", + " including title, text, score, number of comments, and URL.\n", + "\n", + " Args:\n", + " subreddit (str): The subreddit name without r/, for 
example MicrosoftFabric.\n", + " date (str): The date to search for. Supports \"yesterday\" or YYYY-MM-DD.\n", + " limit (int): Number of latest posts to check.\n", + "\n", + " Returns:\n", + " str: A formatted string containing matching Reddit posts.\n", + " \"\"\"\n", + "\n", + " subreddit = subreddit.replace(\"r/\", \"\").strip()\n", + "\n", + " url = f\"https://www.reddit.com/r/{subreddit}/new.json?limit={limit}\"\n", + "\n", + " headers = {\n", + " \"User-Agent\": \"agentic-ai-lab-by-tatiana/0.1\"\n", + " }\n", + "\n", + " response = requests.get(url, headers=headers, timeout=20)\n", + " response.raise_for_status()\n", + "\n", + " data = response.json()\n", + "\n", + " now = datetime.now(timezone.utc)\n", + "\n", + " if date.lower().strip() == \"yesterday\":\n", + " target_start = datetime(\n", + " year=now.year,\n", + " month=now.month,\n", + " day=now.day,\n", + " tzinfo=timezone.utc\n", + " ) - timedelta(days=1)\n", + " else:\n", + " target_start = datetime.strptime(date.strip(), \"%Y-%m-%d\").replace(\n", + " tzinfo=timezone.utc\n", + " )\n", + "\n", + " target_end = target_start + timedelta(days=1)\n", + "\n", + " matching_posts = []\n", + "\n", + " for item in data.get(\"data\", {}).get(\"children\", []):\n", + " post = item.get(\"data\", {})\n", + "\n", + " post_time = datetime.fromtimestamp(\n", + " post.get(\"created_utc\", 0),\n", + " tz=timezone.utc\n", + " )\n", + "\n", + " if not (target_start <= post_time < target_end):\n", + " continue\n", + "\n", + " matching_posts.append({\n", + " \"title\": post.get(\"title\", \"\"),\n", + " \"text\": post.get(\"selftext\", \"\"),\n", + " \"score\": post.get(\"score\", 0),\n", + " \"num_comments\": post.get(\"num_comments\", 0),\n", + " \"url\": \"https://www.reddit.com\" + post.get(\"permalink\", \"\"),\n", + " \"created_utc\": post_time.strftime(\"%Y-%m-%d %H:%M UTC\")\n", + " })\n", + "\n", + " if not matching_posts:\n", + " return f\"No posts found in r/{subreddit} for {date}.\"\n", + "\n", + " 
matching_posts = sorted(\n", + " matching_posts,\n", + " key=lambda post: (post[\"num_comments\"], post[\"score\"]),\n", + " reverse=True\n", + " )\n", + "\n", + " formatted_posts = []\n", + "\n", + " for i, post in enumerate(matching_posts, start=1):\n", + " formatted_posts.append(\n", + " f\"\"\"\n", + "Post {i}\n", + "Title: {post['title']}\n", + "Text: {post['text']}\n", + "Score: {post['score']}\n", + "Comments: {post['num_comments']}\n", + "Created: {post['created_utc']}\n", + "URL: {post['url']}\n", + "\"\"\"\n", + " )\n", + "\n", + " return \"\\n\\n\".join(formatted_posts)" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "816a05fa", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'\\nPost 1\\nTitle: Service entirely down\\nText: After some slowness all day UK South the service now seems to be completely down. Any news as to when it will be back?\\nScore: 51\\nComments: 105\\nCreated: 2026-05-18 15:19 UTC\\nURL: https://www.reddit.com/r/MicrosoftFabric/comments/1tgps74/service_entirely_down/\\n\\n\\n\\nPost 2\\nTitle: How to code first Batch ELT in Fabric?\\nText: Jumping on the back of posts like these: \\n[Materialized Lake Views: It was too good to be true...](https://www.reddit.com/r/MicrosoftFabric/comments/1tdvxtp/materialized_lake_views_it_was_too_good_to_be_true/) \\n[What\\'s the preferred tool to use for medallion architecture in Fabric Lakehouse?](https://www.reddit.com/r/MicrosoftFabric/comments/1tfw3ct/may_2026_whats_the_preferred_tool_to_use_for/) \\n[Warehouse workflow, what works?](https://www.reddit.com/r/MicrosoftFabric/comments/1sau6u2/warehouse_workflow_what_works/) (My post)\\n\\nI think there is still a void in Fabric when it comes to batch ELT using medallion architecture. \\n\\nI would argue this pattern is the most common pattern for analytics engineering. 
It is definitely the one being pushed the most by Microsoft in the Fabric documentation.\\n\\nWhat I would like to see:\\n\\n* Ability to develop in your IDE of choice.\\n* Ability to expose the codebase to your AI of choice.\\n* CI/CD and ease of deployment.\\n\\nNow let\\'s look at some of the current ways of doing code first batch ELT in Microsoft Fabric: \\n \\nPySpark notebooks seems like the no. 1. contender at the moment. The Fabric Data Engineering VS Code extension allows local development and remote runtime \\\\*chefs kiss\\\\*, and the [fabric ci/cd package hit v.1.0.0 about a month ago](https://www.reddit.com/r/MicrosoftFabric/comments/1sr2fnq/fabriccicd_v100_is_here_a_major_milestone_with/). However, for true local development the workflow goes like this: Commit local copy to feature workspace -> sync workspace to feature branch -> merge feature branch to main branch -> sync main workspace to main branch. There are so many syncs, and each of them holds the possibility for sync issues and diff resolutions.\\n\\nu/dbrownems, among others, have teased using Github Copilot CLI for developing notebooks, and it does offer a truly local development experience. The same workflow can also be done with any other AI/CLI tool. However I don\\'t see how this can be implemented in a multi-developer ci/cd workflow. \\n \\nThe last month or so I have dipped my toes into using dbt in Fabric, especially after reading the gospel of u/raki_rahman. Warehouse dbt seemed straight forward until the [docs recommended running the project in an Airflow Job](https://learn.microsoft.com/en-us/fabric/data-factory/apache-airflow-jobs-dbt-fabric). Now suddenly this turned into a click-ops job, and the same with dbt job (preview).\\n\\nHow about the spark dbt adapter? 
I might be convinced, but I\\'m not sure I have the conviction to convince others on what seems like an [all or nothing endeavor](https://www.rakirahman.me/dbt-fabric-spark/).\\n\\nMLVs are little bit of this and a little bit of that, but I do applaud it as the solid middle ground alternative that I feel it is.\\n\\nSo, in conclusion: Are we all gritting our teeth at the curveballs in Fabric and hacking our way through things, either by external tooling or by making amalgamations of the multitude of different tools that do exist in Fabric? \\nIf not, please post your batch ELT workflow.\\n\\n\\n\\nI also want to give a shout out to incremental refresh for being simultaneously simple (just .merge() on your keys) and impossible (apparently mathematically so) at the same time, and truly putting me in the valley of despair of the Dunning-Kruger curve.\\n\\n\\n\\n\\nScore: 8\\nComments: 8\\nCreated: 2026-05-18 11:11 UTC\\nURL: https://www.reddit.com/r/MicrosoftFabric/comments/1tgjd7d/how_to_code_first_batch_elt_in_fabric/\\n\\n\\n\\nPost 3\\nTitle: Azure Analysis Services on Fabric Lakehouse\\nText: Hi,\\n\\nI have a fabric lakehouse and i need to load that data in my semantic model hosted in Azure Analysis services. But i can\\'t get the authentication done.\\n\\nHow do i need to setup my datasource in my semantic model so i can authenticate to the sql endpoint of the lakehouse?\\nScore: 2\\nComments: 4\\nCreated: 2026-05-18 13:39 UTC\\nURL: https://www.reddit.com/r/MicrosoftFabric/comments/1tgmynl/azure_analysis_services_on_fabric_lakehouse/\\n\\n\\n\\nPost 4\\nTitle: DP 700 Exam\\nText: Hi everyone,\\n\\nI\\'m planning to book my DP-700 (Fabric Data Engineer Associate) exam and wanted to get some guidance from those who have already cleared it.\\n\\nSpecifically, I\\'d like to know:\\n\\n• Which Fabric components are most heavily tested? 
(Lakehouses, Warehouses, Pipelines, Spark notebooks?)\\n\\n• How deep does the exam go into Delta Lake and OneLake architecture?\\n\\n• Any specific hands-on experience with Fabric that helped you prepare?\\n\\n• Recommended Microsoft Learn paths or third-party resources?\\n\\nThis is my first Microsoft certification exam, so any tips on exam structure or time management would also be appreciated.\\n\\nThanks in advance!\\nScore: 2\\nComments: 4\\nCreated: 2026-05-18 12:07 UTC\\nURL: https://www.reddit.com/r/MicrosoftFabric/comments/1tgkm1j/dp_700_exam/\\n\\n\\n\\nPost 5\\nTitle: Fabric Monday 113: Notebook Connection\\nText: ⊕ Your Fabric notebook shouldn\\'t know your password.\\n\\n \\nVideo: [https://www.youtube.com/watch?v=qjUa\\\\_gyBAeM](https://www.youtube.com/watch?v=qjUa_gyBAeM)\\n\\n\\n\\nHardcoding credentials in notebook code is a risk hiding in plain sight.\\n\\nAnyone with access to the notebook can read them.\\n\\n\\n\\nConnection Objects solve this. ✓\\n\\n\\n\\nAuthentication lives inside the Connection Object — not in your code.\\n\\nThe notebook uses the connection. It never sees the credentials.\\n\\n\\n\\nIn this week\\'s Fabric Monday, I show how to wire notebooks to Connection Objects\\n\\nand why this is the right way to handle authentication in Fabric. ▶\\n\\n\\n\\nVideo: [https://www.youtube.com/watch?v=qjUa\\\\_gyBAeM](https://www.youtube.com/watch?v=qjUa_gyBAeM)\\n\\n\\nScore: 11\\nComments: 2\\nCreated: 2026-05-18 08:39 UTC\\nURL: https://www.reddit.com/r/MicrosoftFabric/comments/1tgghui/fabric_monday_113_notebook_connection/\\n\\n\\n\\nPost 6\\nTitle: Semantic models not refreshing - UK South\\nText: Hi\\n\\nAnyone else seeing CORS errors on schedules sm refreshes? All our sm\\'s are unable to refresh due to the 401 issue. Region is UK South. 
def summarize_subreddit_posts(
    subreddit: str,
    date: str = "yesterday",
    preferences: str = "",
    limit: int = 100
) -> str:
    """
    Retrieves Reddit posts from a subreddit and summarizes them using OpenAI.

    Args:
        subreddit (str): Subreddit name, forwarded to ``subreddit_search``.
        date (str): "yesterday" or YYYY-MM-DD, forwarded to ``subreddit_search``.
        preferences (str): Free-text hints about what the summary should
            emphasize. They only influence the prompt, not which posts are fetched.
        limit (int): Number of latest posts to check.

    Returns:
        str: The model-written summary, or the "No posts found ..." message
        from ``subreddit_search`` when there is nothing to summarize.
    """

    posts = subreddit_search(
        subreddit=subreddit,
        date=date,
        limit=limit
    )

    # subreddit_search reports an empty result with this message prefix; skip
    # the model call when there is nothing to summarize.
    if posts.startswith("No posts found"):
        return posts

    # The UI can hand over None / blank text; give the model an explicit
    # statement instead of a dangling label.
    preferences = (preferences or "").strip() or "No specific preferences provided."

    system_message = """
You are a Reddit subreddit summarization assistant.

Your job is to summarize Reddit posts retrieved from a subreddit.

Rules:
- Summarize only the information provided in the retrieved posts.
- Do not invent posts, comments, links, opinions, or trends.
- If the available posts are limited, mention that clearly.
- Keep the summary concise, structured, and easy to read.
- Use the user's preferences only to decide what to emphasize in the summary.
- Give more attention to posts with higher comment counts and higher scores.
"""

    # The posts are already inlined below and no tools are passed to this
    # request, so the prompt must not tell the model to call a search tool.
    user_message = f"""
Subreddit: r/{subreddit}
Date: {date}
User preferences for the summary: {preferences}

Retrieved posts:
{posts}

Please provide:
1. A short overall summary
2. Main topics discussed
3. Most discussed posts
4. Posts with notable scores
5. Repeated issues, questions, or themes
6. A brief takeaway

When relevant, emphasize the user's preferences:
{preferences}

Be concise and clear.
Only return the final subreddit summary.
"""

    openai_response = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message}
        ]
    )

    # content can be None (for example on a refusal); keep the declared str return type.
    return openai_response.choices[0].message.content or ""
**Posts with Notable Scores**:\n", + " - **Service entirely down**: Score of 49.\n", + " - **How to code first Batch ELT in Fabric?**: Score of 8.\n", + " - Posts about the DP-700 exam and deployment issues had scores of 2.\n", + "\n", + "5. **Repeated Issues, Questions, or Themes**:\n", + " - Ongoing service disruptions and CORS errors impacting refresh schedules.\n", + " - Challenges with Git integration for notebook resources.\n", + " - Requests for guidance on exam preparations and usage of features within Microsoft Fabric.\n", + "\n", + "6. **Takeaway**: Users are mainly concerned with service reliability and effective practices in utilizing Microsoft Fabric, highlighting a significant need for support and community engagement during outages and technical challenges." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(Markdown(summarize_subreddit_posts(\"MicrosoftFabric\", \"2026-05-18\", \"more comments\")))" + ] + }, + { + "cell_type": "markdown", + "id": "25ca8ee2", + "metadata": {}, + "source": [ + "## Step 8: Gradio UI" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "be775fa1", + "metadata": {}, + "outputs": [], + "source": [ + "custom_css = \"\"\"\n", + ":root {\n", + " --reddit-orange: #ff4500;\n", + " --reddit-orange-dark: #d93a00;\n", + " --reddit-bg: #fff7f3;\n", + " --reddit-card: #ffffff;\n", + " --reddit-text: #1c1c1c;\n", + " --reddit-muted: #6b7280;\n", + "}\n", + "\n", + ".gradio-container {\n", + " background: linear-gradient(135deg, #fff7f3 0%, #fff1eb 45%, #ffffff 100%) !important;\n", + " font-family: Inter, system-ui, -apple-system, BlinkMacSystemFont, \"Segoe UI\", sans-serif;\n", + "}\n", + "\n", + "#app-container {\n", + " max-width: 980px;\n", + " margin: 0 auto;\n", + "}\n", + "\n", + "#hero {\n", + " background: linear-gradient(135deg, #ff4500 0%, #ff7a1a 100%);\n", + " color: white;\n", + " padding: 32px;\n", + " border-radius: 24px;\n", + " 
def gradio_summarize_subreddit(subreddit, date, preferences, limit):
    """Gradio callback: validate the form values and return Markdown to display.

    Args:
        subreddit: Subreddit name typed by the user (may be None or blank).
        date: "yesterday" or YYYY-MM-DD; blank falls back to "yesterday".
        preferences: Free-text summary preferences (may be None after "Clear").
        limit: Slider value for the number of latest posts to check.

    Returns:
        str: The summary, or a Markdown-formatted message describing what went
        wrong. This function never raises, so the UI always has text to show.
    """
    from datetime import datetime  # local import keeps this cell self-contained

    if not subreddit or not subreddit.strip():
        return "Please enter a subreddit name, for example `MicrosoftFabric`."

    date = (date or "").strip() or "yesterday"

    # Validate the date here instead of catching ValueError around the whole
    # call: other failures (a bad limit, an invalid JSON response) are also
    # ValueErrors and must not be reported as a date problem.
    if date.lower() != "yesterday":
        try:
            datetime.strptime(date, "%Y-%m-%d")
        except ValueError as e:
            return f"""
### Date format issue

Please use either:

- `yesterday`
- `YYYY-MM-DD`, for example `2026-05-18`

Error details: `{str(e)}`
"""

    try:
        return summarize_subreddit_posts(
            subreddit=subreddit.strip(),
            date=date,
            preferences=(preferences or "").strip(),
            limit=int(limit)
        )
    except Exception as e:
        # UI boundary: surface any failure as text rather than crashing the app.
        return f"""
### Something went wrong

Error details:

`{str(e)}`
"""
    " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "with gr.Blocks(\n", + " css=custom_css,\n", + " title=\"Reddit Subreddit Summarizer\",\n", + " theme=gr.themes.Soft(\n", + " primary_hue=\"orange\",\n", + " secondary_hue=\"red\",\n", + " neutral_hue=\"slate\"\n", + " )\n", + ") as demo:\n", + "\n", + " with gr.Column(elem_id=\"app-container\"):\n", + "\n", + " gr.HTML(\n", + " \"\"\"\n", + "
    \n", + "

    🔥 Reddit Subreddit Summarizer

    \n", + "

    \n", + " Pick a subreddit, choose a date, and get a concise AI-powered summary of the most discussed posts.\n", + "

    \n", + "
    \n", + " \"\"\"\n", + " )\n", + "\n", + " with gr.Row():\n", + "\n", + " with gr.Column(scale=1, elem_classes=\"input-card\"):\n", + " gr.Markdown(\"## Search settings\")\n", + "\n", + " subreddit_input = gr.Textbox(\n", + " label=\"Subreddit\",\n", + " value=\"MicrosoftFabric\",\n", + " placeholder=\"Example: MicrosoftFabric, PowerBI, datascience\",\n", + " info=\"Enter the subreddit name without r/\"\n", + " )\n", + "\n", + " date_input = gr.Textbox(\n", + " label=\"Date\",\n", + " value=\"yesterday\",\n", + " placeholder=\"yesterday or 2026-05-18\",\n", + " info=\"Use 'yesterday' or a date in YYYY-MM-DD format\"\n", + " )\n", + "\n", + " preferences_input = gr.Textbox(\n", + " label=\"Summary preferences\",\n", + " value=\"Focus on Power BI, Fabric, semantic models, data engineering, common issues, and highly discussed posts.\",\n", + " placeholder=\"Example: focus on technical issues, questions, complaints, tutorials, or highly discussed posts\",\n", + " lines=5,\n", + " info=\"These preferences are used only by OpenAI when creating the summary\"\n", + " )\n", + "\n", + " limit_input = gr.Slider(\n", + " label=\"Number of latest posts to check\",\n", + " minimum=10,\n", + " maximum=500,\n", + " value=100,\n", + " step=10,\n", + " info=\"Higher values check more posts but may take longer\"\n", + " )\n", + "\n", + " with gr.Row():\n", + " submit_btn = gr.Button(\n", + " \"Summarize Subreddit 🚀\",\n", + " elem_id=\"submit-btn\",\n", + " scale=2\n", + " )\n", + "\n", + " clear_btn = gr.ClearButton(\n", + " components=[\n", + " subreddit_input,\n", + " date_input,\n", + " preferences_input\n", + " ],\n", + " value=\"Clear\",\n", + " elem_id=\"clear-btn\",\n", + " scale=1\n", + " )\n", + "\n", + " gr.HTML(\n", + " \"\"\"\n", + "
    \n", + " Tip: If the subreddit is very active, increase the post limit to 300–500.\n", + " If it is quiet, 100 is usually enough.\n", + "
    \n", + " \"\"\"\n", + " )\n", + "\n", + " with gr.Column(scale=2, elem_classes=\"output-card\"):\n", + " gr.Markdown(\"## Summary\")\n", + "\n", + " output = gr.Markdown(\n", + " value=\"Your subreddit summary will appear here.\",\n", + " label=\"Subreddit Summary\"\n", + " )\n", + "\n", + " gr.Examples(\n", + " examples=[\n", + " [\n", + " \"MicrosoftFabric\",\n", + " \"yesterday\",\n", + " \"Focus on Power BI, Fabric, semantic models, data engineering, common issues, and highly discussed posts.\",\n", + " 100\n", + " ],\n", + " [\n", + " \"PowerBI\",\n", + " \"yesterday\",\n", + " \"Focus on user problems, dashboard performance, DAX, semantic models, and practical tips.\",\n", + " 200\n", + " ],\n", + " [\n", + " \"datascience\",\n", + " \"yesterday\",\n", + " \"Focus on career advice, project ideas, machine learning, and beginner questions.\",\n", + " 200\n", + " ]\n", + " ],\n", + " inputs=[\n", + " subreddit_input,\n", + " date_input,\n", + " preferences_input,\n", + " limit_input\n", + " ],\n", + " label=\"Try an example\"\n", + " )\n", + "\n", + " submit_btn.click(\n", + " fn=gradio_summarize_subreddit,\n", + " inputs=[\n", + " subreddit_input,\n", + " date_input,\n", + " preferences_input,\n", + " limit_input\n", + " ],\n", + " outputs=output,\n", + " show_progress=\"full\"\n", + " )\n", + "\n", + "demo.queue()\n", + "demo.launch()" + ] + }, + { + "cell_type": "markdown", + "id": "07914c24", + "metadata": {}, + "source": [ + "## Step 9: Deployment\n", + "\n", + "You should first get setup on huggingface by making an account:\n", + "\n", + "1. Visit the [huggingface](https://huggingface.co/) website and create an account.\n", + "2. Create an [API Key](https://huggingface.co/settings/tokens)\n", + "3. Copy the app logic into a `app.py` file and include a `requirements.txt` file inside of `community-contributions/your-name`\n", + "4. Open a new terminal\n", + "5. 
Activate our virtual environment\n", + " - Windows command: `.venv\\Scripts\\activate`\n", + " - Mac/Linux command: `source .venv/bin/activate`\n", + "6. cd into part1-fundementals using the command `cd part1-fundementals`\n", + "7. Run the command `gradio deploy`\n", + "8. Open the link to your deployed app" + ] + }, + { + "cell_type": "markdown", + "id": "eea1334e", + "metadata": {}, + "source": [ + "
    \n", + " Warning:\n", + " \n", + " You must add your app.py, requirements.txt, and a copy of utils.py inside the community-contributions/your-name folder.
    \n", + " For example: community-contributions/your-name/app.py
    \n", + " Replace your-name with your actual name.\n", + "
    \n", + "
    ⚠️
    \n", + "
    " + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/part1-fundementals/community-contributions/tatiana_patrusheva/requirements.txt b/part1-fundementals/community-contributions/tatiana_patrusheva/requirements.txt new file mode 100644 index 0000000..d63926f --- /dev/null +++ b/part1-fundementals/community-contributions/tatiana_patrusheva/requirements.txt @@ -0,0 +1,4 @@ +openai +gradio +python-dotenv +requests \ No newline at end of file diff --git a/part1-fundementals/community-contributions/tatiana_patrusheva/utils.py b/part1-fundementals/community-contributions/tatiana_patrusheva/utils.py new file mode 100644 index 0000000..20a6ad4 --- /dev/null +++ b/part1-fundementals/community-contributions/tatiana_patrusheva/utils.py @@ -0,0 +1,289 @@ +import inspect +import sys +import typing +from typing import get_origin, get_args, Literal, Union, Optional + +try: + from typing import TypedDict # py3.8+ +except ImportError: + TypedDict = None + +def _is_typeddict(t): + try: + return isinstance(t, type) and TypedDict is not None and issubclass(t, TypedDict) + except TypeError: + return False + +def _is_dataclass(t): + try: + import dataclasses + return dataclasses.is_dataclass(t) + except Exception: + return False + +def _docstring_split_sections(doc: str): + """Very small parser to extract: + - short summary (first non-empty line) + - param descriptions from sections like 'Args:', 'Parameters:', ':param x:'. 
+ """ + if not doc: + return "", {} + + lines = [l.rstrip() for l in doc.strip().splitlines()] + # Summary = first nonempty line + summary = next((l for l in lines if l.strip()), "") + params_desc = {} + + # Gather “Args/Parameters/Arguments” blocks (Google/Numpy style) + markers = {"args:", "parameters:", "arguments:"} + i = 0 + while i < len(lines): + line = lines[i].strip().lower() + if line in markers: + i += 1 + while i < len(lines): + raw = lines[i] + if raw.strip() == "" or raw.startswith(" "): + # keep reading indented or blank continuation lines + # detect "name (type): desc" or "name: desc" + stripped = raw.strip() + if stripped: + # Try common patterns + if ":" in stripped: + name, desc = stripped.split(":", 1) + name = name.strip().split()[0].split("(")[0] + params_desc.setdefault(name, desc.strip()) + else: + # continuation line: append to last desc if any + if params_desc: + last = list(params_desc.keys())[-1] + params_desc[last] += " " + stripped + i += 1 + else: + break + continue + i += 1 + + # Sphinx-style ":param name: desc" + for l in lines: + ls = l.strip() + if ls.lower().startswith(":param "): + try: + rest = ls[len(":param "):] + name, desc = rest.split(":", 1) + name = name.strip().split()[0] + params_desc[name] = desc.strip() + except ValueError: + pass + + return summary, params_desc + + +def _json_type_for_python(t): + """Return a JSON Schema fragment for python/typing type t.""" + origin = get_origin(t) + args = get_args(t) + + # NoneType + if t is type(None): + return {"type": "null"} + + # Builtins + if t is str: + return {"type": "string"} + if t is int: + return {"type": "integer"} + if t is float: + return {"type": "number"} + if t is bool: + return {"type": "boolean"} + + # datetime-like + try: + import datetime as _dt + if t in (_dt.datetime,): + return {"type": "string", "format": "date-time"} + if t in (_dt.date,): + return {"type": "string", "format": "date"} + if t in (_dt.time,): + return {"type": "string", "format": "time"} 
+ if t in (_dt.timedelta,): + # no standard JSON Schema, fallback to string + return {"type": "string", "description": "Duration (ISO 8601 or human-readable)."} + except Exception: + pass + + # Enum + import enum + if isinstance(t, type) and issubclass(t, enum.Enum): + values = [e.value for e in t] + # infer primitive type of the enum values + if all(isinstance(v, str) for v in values): + return {"type": "string", "enum": values} + if all(isinstance(v, int) for v in values): + return {"type": "integer", "enum": values} + # mixed types + return {"enum": values} + + # TypedDict + if _is_typeddict(t): + props = {} + required = [] + # __annotations__ holds fields + ann = t.__annotations__ + total = getattr(t, "__total__", True) + for k, v in ann.items(): + props[k] = _json_type_for_python(v) + # In total=True, all are required unless Optional/Union[..., None] + if total: + if not _is_optional(v): + required.append(k) + else: + # total=False => all optional + pass + schema = {"type": "object", "properties": props} + if required: + schema["required"] = required + return schema + + # dataclass + if _is_dataclass(t): + import dataclasses + props = {} + required = [] + for f in dataclasses.fields(t): + props[f.name] = _json_type_for_python(f.type) + has_default = f.default is not dataclasses.MISSING or f.default_factory is not dataclasses.MISSING + if not has_default and not _is_optional(f.type): + required.append(f.name) + schema = {"type": "object", "properties": props} + if required: + schema["required"] = required + return schema + + # Literal + if origin is Literal: + vals = list(args) + # infer a base type if uniform + if all(isinstance(v, str) for v in vals): + return {"type": "string", "enum": vals} + if all(isinstance(v, int) for v in vals): + return {"type": "integer", "enum": vals} + if all(isinstance(v, (int, float)) for v in vals): + # number enum + return {"type": "number", "enum": vals} + return {"enum": vals} + + # Optional[T] == Union[T, None] + if 
_is_optional(t): + # caller should handle required vs optional; here return the underlying schema + non_none = [a for a in args if a is not type(None)] + if len(non_none) == 1: + return _json_type_for_python(non_none[0]) + # Optional of Union[…, None] falls through to anyOf + return {"anyOf": [_json_type_for_python(a) for a in non_none] + [{"type": "null"}]} + + # Union + if origin is Union: + return {"anyOf": [_json_type_for_python(a) for a in args]} + + # List/Tuple/Set + if origin in (list, tuple, set, typing.Sequence, typing.MutableSequence): + item_t = args[0] if args else typing.Any + return {"type": "array", "items": _json_type_for_python(item_t)} + + # Dict / Mapping + if origin in (dict, typing.Mapping, typing.MutableMapping): + key_t, val_t = (args + (typing.Any, typing.Any))[:2] + # JSON keys must be strings; if key_t != str, we note it in description + schema = {"type": "object", "additionalProperties": _json_type_for_python(val_t)} + if key_t is not str: + schema["description"] = (schema.get("description", "") + " Keys will be stringified.").strip() + return schema + + # Fallbacks + if t is typing.Any or t is None: + return {} # unconstrained + # Unknown type => treat as string with note + return {"type": "string", "description": f"Serialized {getattr(t, '__name__', str(t))}."} + + +def _is_optional(t): + origin = get_origin(t) + if origin is Union: + args = get_args(t) + return any(a is type(None) for a in args) + return False + + +def function_to_tool( + func, + *, + name: str | None = None, + description: str | None = None, + param_overrides: dict | None = None, +) -> dict: + """ + Build an OpenAI-style 'tool' schema from a Python function. + + - `name`: override function name. + - `description`: override function description (otherwise from docstring summary). + - `param_overrides`: dict of per-param overrides, e.g. 
+ { + "city": {"description": "City name", "enum": ["Dubai", "Abu Dhabi"]}, + "units": {"default": "metric"} # note: default isn't used by schema; make param optional instead + } + """ + sig = inspect.signature(func) + hints = typing.get_type_hints(func, include_extras=True) + doc = inspect.getdoc(func) or "" + summary, param_descs = _docstring_split_sections(doc) + + tool_name = name or func.__name__ + tool_desc = description or summary or f"Callable function `{tool_name}`." + + properties = {} + required = [] + + for pname, param in sig.parameters.items(): + if pname == "self": + continue + + ann = hints.get(pname, typing.Any) + schema = _json_type_for_python(ann) + + # base description from docstring, if any + if param_descs.get(pname): + schema["description"] = param_descs[pname] + + # overrides + if param_overrides and pname in param_overrides: + schema.update(param_overrides[pname]) + + # required vs optional + is_required = ( + param.default is inspect._empty + and not _is_optional(ann) + ) + if is_required: + required.append(pname) + + # ensure at least a basic type if none inferred + if not schema: + schema = {"type": "string"} + + properties[pname] = schema + + parameters = { + "type": "object", + "properties": properties, + } + if required: + parameters["required"] = required + + return { + "type": "function", + "name": tool_name, + "description": tool_desc, + "parameters": parameters, + }