update notebook for generating unit tests

Rangeet Pan · Rangeet Pan · commit f01075d5356d · 2024-08-27T11:15:36.000-04:00
diff --git a/docs/examples/java/code_summarization.ipynb b/docs/examples/java/code_summarization.ipynb
@@ -29,7 +29,6 @@
    "execution_count": null,
    "outputs": [],
    "source": [
-    "import os\n",
     "from pathlib import Path\n",
     "import ollama\n",
     "from cldk import CLDK\n",
@@ -122,9 +121,8 @@
    "execution_count": null,
    "outputs": [],
    "source": [
-    "if __name__ == \"__main__\":\n",
-    "    # Create a new instance of the CLDK class\n",
-    "    cldk = CLDK(language=\"java\")"
+    "# Instantiate CLDK for Java; the cells below call into this `cldk` object\n",
+    "cldk = CLDK(language=\"java\")"
    ],
    "metadata": {
     "collapsed": false
@@ -149,8 +147,8 @@
    "execution_count": null,
    "outputs": [],
    "source": [
-    "    # Create an analysis object over the java application\n",
-    "    analysis = cldk.analysis(project_path=\"JAVA_APP_PATH\", analysis_level=AnalysisLevel.symbol_table)"
+    "# Create an analysis object over the java application. Provide the application path in place of JAVA_APP_PATH\n",
+    "analysis = cldk.analysis(project_path=\"JAVA_APP_PATH\", analysis_level=AnalysisLevel.symbol_table)"
    ],
    "metadata": {
     "collapsed": false
@@ -194,40 +192,39 @@
    "execution_count": null,
    "outputs": [],
    "source": [
-    "\n",
-    "    # Iterate over all the files in the project\n",
-    "    for file_path, class_file in analysis.get_symbol_table().items():\n",
-    "        class_file_path = Path(file_path).absolute().resolve()\n",
-    "        # Iterate over all the classes in the file\n",
-    "        for type_name, type_declaration in class_file.type_declarations.items():\n",
-    "            # Iterate over all the methods in the class\n",
-    "            for method in type_declaration.callable_declarations.values():\n",
-    "                # Get code body of the method\n",
-    "                code_body = class_file_path.read_text()\n",
-    "\n",
-    "                # Initialize the treesitter utils for the class file content\n",
-    "                tree_sitter_utils = cldk.tree_sitter_utils(source_code=code_body)\n",
-    "\n",
-    "                # Sanitize the class for analysis\n",
-    "                sanitized_class = tree_sitter_utils.sanitize_focal_class(method.declaration)\n",
-    "\n",
-    "                # Format the instruction for the given focal method and class\n",
-    "                instruction = format_inst(\n",
-    "                    code=sanitized_class,\n",
-    "                    focal_method=method.declaration,\n",
-    "                    focal_class=type_name,\n",
-    "                    language=\"java\"\n",
-    "                )\n",
-    "\n",
-    "                # Prompt the local model on Ollama\n",
-    "                llm_output = prompt_ollama(\n",
-    "                    message=instruction,\n",
-    "                    model_id=\"granite-code:20b-instruct\",\n",
-    "                )\n",
-    "\n",
-    "                # Print the instruction and LLM output\n",
-    "                print(f\"Instruction:\\n{instruction}\")\n",
-    "                print(f\"LLM Output:\\n{llm_output}\")"
+    "# Iterate over all the files in the project\n",
+    "for file_path, class_file in analysis.get_symbol_table().items():\n",
+    "    class_file_path = Path(file_path).absolute().resolve()\n",
+    "    # Read the file's full source once here: it is the same for every method in the file,\n",
+    "    # so there is no need to re-read it from disk inside the method loop\n",
+    "    code_body = class_file_path.read_text()\n",
+    "    # Iterate over all the classes in the file\n",
+    "    for type_name, type_declaration in class_file.type_declarations.items():\n",
+    "        # Iterate over all the methods in the class\n",
+    "        for method in type_declaration.callable_declarations.values():\n",
+    "            # Initialize the treesitter utils for the class file content\n",
+    "            tree_sitter_utils = cldk.tree_sitter_utils(source_code=code_body)\n",
+    "\n",
+    "            # Sanitize the class for analysis\n",
+    "            sanitized_class = tree_sitter_utils.sanitize_focal_class(method.declaration)\n",
+    "\n",
+    "            # Format the instruction for the given focal method and class\n",
+    "            instruction = format_inst(\n",
+    "                code=sanitized_class,\n",
+    "                focal_method=method.declaration,\n",
+    "                focal_class=type_name,\n",
+    "                language=\"java\"\n",
+    "            )\n",
+    "\n",
+    "            # Prompt the local model on Ollama\n",
+    "            llm_output = prompt_ollama(\n",
+    "                message=instruction,\n",
+    "                model_id=\"granite-code:20b-instruct\",\n",
+    "            )\n",
+    "\n",
+    "            # Print the instruction and LLM output\n",
+    "            print(f\"Instruction:\\n{instruction}\")\n",
+    "            print(f\"LLM Output:\\n{llm_output}\")"
    ],
    "metadata": {
     "collapsed": false
diff --git a/docs/examples/java/generate_unit_tests.ipynb b/docs/examples/java/generate_unit_tests.ipynb
@@ -1,14 +1,155 @@
 {
  "cells": [
+  {
+   "cell_type": "markdown",
+   "source": [
+    "Generating unit tests for code is a tedious task, and writing good test cases often takes significant effort from developers. Various tools are available for automated test generation, such as EvoSuite, which uses an evolutionary algorithm to generate test cases. However, the test cases these tools generate are often not natural, and developers frequently prefer not to add them to their test suites. Large Language Models (LLMs), in contrast, are trained on developer-written code and therefore have a better affinity for generating more natural code--code that is more readable and maintainable. In this exercise, we will show how we can leverage LLMs to generate test cases with the help of CLDK.\n",
+    "\n",
+    "For simplicity, we will cover only certain aspects of test generation and provide some context information to the LLM to improve the quality of the test cases. In this exercise, we will generate unit tests for the non-private methods of a Java class, providing the focal method body and the signatures of all the constructors of the class so that the LLM can understand how to create an object of the focal class during the setup phase of the tests. Also, we will ask the LLM to generate ```N``` test cases, where ```N``` is the cyclomatic complexity of the focal method. The intuition is that one test may not be sufficient to cover a fairly complex method, and the cyclomatic complexity score can provide some guidance on how many tests are needed.\n",
+    "\n",
+    "(Step 1) First, we will import all the necessary libraries."
+   ],
+   "metadata": {
+    "collapsed": false
+   },
+   "id": "5856baff4aa64ed7"
+  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "initial_id",
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "import ollama\n",
+    "from cldk import CLDK\n",
+    "from cldk.analysis import AnalysisLevel"
+   ],
    "metadata": {
-    "collapsed": true
+    "collapsed": false
    },
+   "id": "b3d2498ae092fcc"
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "(Step 2) Second, we will form the prompt for the model, which will include all the constructor signatures and the body of the focal method."
+   ],
+   "metadata": {
+    "collapsed": false
+   },
+   "id": "67eb24b29826d730"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
    "outputs": [],
-   "source": []
+   "source": [
+    "def format_inst(focal_method_body, focal_method, focal_class, constructor_signatures, cyclomatic_complexity, language=\"java\"):\n",
+    "    \"\"\"Build the LLM prompt asking for unit tests of one focal method.\n",
+    "\n",
+    "    The class skeleton (constructor signatures plus the focal method body) is emitted inside a\n",
+    "    single fenced code block tagged with `language`, followed by an `Answer:` cue for the model.\n",
+    "    \"\"\"\n",
+    "    inst = f\"Question: Can you generate {cyclomatic_complexity} unit tests for the method `{focal_method}` in the class `{focal_class}` below?\\n\"\n",
+    "    inst += \"\\n\"\n",
+    "    inst += f\"```{language}\\n\"\n",
+    "    inst += f\"public class {focal_class} {{\\n\"\n",
+    "    inst += f\"<|constructors|>\\n{constructor_signatures}\\n<|constructors|>\\n\"\n",
+    "    inst += f\"<|focal method|>\\n {focal_method_body} \\n <|focal method|>\\n\"\n",
+    "    inst += \"}\\n\"\n",
+    "    inst += \"```\\n\"\n",
+    "    inst += \"Answer:\\n\"\n",
+    "    return inst"
+   ],
+   "metadata": {
+    "collapsed": false
+   },
+   "id": "d7bc9bbaa917df24"
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "(Step 3) Third, we will use Ollama to call the LLM (Granite 8b by default)."
+   ],
+   "metadata": {
+    "collapsed": false
+   },
+   "id": "ae9ceb150f5efa92"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [
+    "def prompt_ollama(message: str, model_id: str = \"granite-code:8b-instruct\") -> str:\n",
+    "    \"\"\"Run `message` through the Ollama model `model_id` and hand back its reply.\"\"\"\n",
+    "    # The reply is read from the \"response\" entry of what ollama.generate returns\n",
+    "    return ollama.generate(model=model_id, prompt=message)[\"response\"]"
+   ],
+   "metadata": {
+    "collapsed": false
+   },
+   "id": "52634feae7374599"
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "(Step 4) Fourth, collect all the information needed for each method and prompt the model."
+   ],
+   "metadata": {
+    "collapsed": false
+   },
+   "id": "308c3325116b87d4"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [
+    "# Create a new instance of the CLDK class\n",
+    "cldk = CLDK(language=\"java\")\n",
+    "# Create an analysis object over the java application. Provide the application path using JAVA_APP_PATH\n",
+    "analysis = cldk.analysis(project_path=\"JAVA_APP_PATH\", analysis_level=AnalysisLevel.symbol_table)\n",
+    "# Go through all the classes in the application\n",
+    "for class_name in analysis.get_classes():\n",
+    "    class_details = analysis.get_class(qualified_class_name=class_name)\n",
+    "    # Generate test cases for non-interface and non-abstract classes\n",
+    "    if not class_details.is_interface and 'abstract' not in class_details.modifiers:\n",
+    "        # Get all constructor signatures\n",
+    "        constructor_signatures = ''\n",
+    "        for method in analysis.get_methods_in_class(qualified_class_name=class_name):\n",
+    "            method_details = analysis.get_method(qualified_class_name=class_name, qualified_method_name=method)\n",
+    "            if method_details.is_constructor:\n",
+    "                constructor_signatures += method_details.signature + '\\n'\n",
+    "        # If no constructor present, then add the signature of the default constructor\n",
+    "        if constructor_signatures=='':\n",
+    "            constructor_signatures = f'public {class_name} ()'\n",
+    "        # Go through all the methods in the class\n",
+    "        for method in analysis.get_methods_in_class(qualified_class_name=class_name):\n",
+    "            # Get the method details\n",
+    "            method_details = analysis.get_method(qualified_class_name=class_name, qualified_method_name=method)\n",
+    "            # Generate test cases for non-private methods\n",
+    "            if 'private' not in method_details.modifiers and not method_details.is_constructor:\n",
+    "                # Gather all the information needed for the prompt: focal method body, focal method name, focal class name, constructor signatures, cyclomatic complexity, and the language tag for the code fence\n",
+    "                prompt = format_inst(focal_method_body=method_details.code,\n",
+    "                                     focal_method=method,\n",
+    "                                     focal_class=class_name,\n",
+    "                                     constructor_signatures=constructor_signatures,\n",
+    "                                     cyclomatic_complexity=method_details.cyclomatic_complexity,\n",
+    "                                     language=\"java\")\n",
+    "                # Prompt the local model on Ollama\n",
+    "                llm_output = prompt_ollama(\n",
+    "                    message=prompt,\n",
+    "                    model_id=\"granite-code:20b-instruct\",\n",
+    "                )\n",
+    "                # Print the instruction and LLM output\n",
+    "                print(f\"Instruction:\\n{prompt}\")\n",
+    "                print(f\"LLM Output:\\n{llm_output}\")"
+   ],
+   "metadata": {
+    "collapsed": false
+   },
+   "id": "65c9558e4de65a52"
   }
  ],
  "metadata": {