From 12b7979d4914ca0a540ad758210f1027f26a2bb5 Mon Sep 17 00:00:00 2001 From: Ruben Alvarez Date: Wed, 12 Jun 2024 16:57:27 -0700 Subject: [PATCH 1/3] gpt-4 model is now supported in mapping table, raises ValueError if model not supported and lists supported models --- .gitignore | 1 + examples/notebooks/sdk_quick_tutorial.ipynb | 1055 ++++++------------- sidekick/query.py | 5 + sidekick/utils.py | 4 +- 4 files changed, 338 insertions(+), 727 deletions(-) diff --git a/.gitignore b/.gitignore index c757ddf..ca58397 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ examples/demo .sidekickvenv models/ +db/ # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/examples/notebooks/sdk_quick_tutorial.ipynb b/examples/notebooks/sdk_quick_tutorial.ipynb index de888d8..2f0f713 100644 --- a/examples/notebooks/sdk_quick_tutorial.ipynb +++ b/examples/notebooks/sdk_quick_tutorial.ipynb @@ -2,18 +2,18 @@ "cells": [ { "cell_type": "code", - "execution_count": 20, + "execution_count": 2, "id": "60080b7e-2e80-4154-aa35-87c13b6ab371", "metadata": {}, "outputs": [], "source": [ "# https://github.com/h2oai/sql-sidekick/releases\n", - "#!python3 -m pip install --force-reinstall sql_sidekick-0.2.2-py3-none-any.whl" + "# !python -m pip uninstall sql_sidekick-0.2.4-py3-none-any.whl -y" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 3, "id": "f480e37a-4327-48da-8c84-aba0ac1eef23", "metadata": {}, "outputs": [], @@ -27,7 +27,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "c91887ce-c74a-432b-a3f9-120c8abc0003", "metadata": {}, "outputs": [], @@ -39,7 +39,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "9fc212c8-dc73-4330-a07f-7394fd198395", "metadata": {}, "outputs": [], @@ -51,18 +51,18 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "6421a995-f846-4a1e-8292-374bd7500382", "metadata": {}, "outputs": [], "source": [ - "import pandas as pd\n", - "f = pd.read_csv(\"./sleep_health_and_lifestyle_dataset.csv\")" + "# import pandas as pd\n", + "# f = pd.read_csv(\"../demo/demo_data.csv\")" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "eac0fa65-bb06-415a-aa87-1185789f878d", "metadata": {}, "outputs": [], @@ -80,7 +80,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "62e23b39-caa8-4e2f-bf12-678dd586f0df", "metadata": {}, "outputs": [ @@ -98,8 +98,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m2024-01-27 20:35:06.568\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.db_config\u001b[0m:\u001b[36m_extract_schema_info\u001b[0m:\u001b[36m162\u001b[0m - \u001b[34m\u001b[1mUsing schema information from: .//var/lib/tmp/sleep_health_eda_table_info.jsonl\u001b[0m\n", - "\u001b[32m2024-01-27 20:35:06.572\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.db_config\u001b[0m:\u001b[36mcreate_table\u001b[0m:\u001b[36m186\u001b[0m - \u001b[34m\u001b[1mSchema info used for creating table:\n", + "\u001b[32m2024-06-12 16:44:00.837\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.db_config\u001b[0m:\u001b[36m_extract_schema_info\u001b[0m:\u001b[36m162\u001b[0m - \u001b[34m\u001b[1mUsing schema information from: .//var/lib/tmp/sleep_health_eda_table_info.jsonl\u001b[0m\n", + "\u001b[32m2024-06-12 16:44:00.838\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.db_config\u001b[0m:\u001b[36mcreate_table\u001b[0m:\u001b[36m186\u001b[0m - \u001b[34m\u001b[1mSchema info used for creating table:\n", " Person_ID NUMERIC,\n", "Gender TEXT COLLATE NOCASE,\n", "Age NUMERIC,\n", @@ -113,7 +113,7 @@ "Heart_Rate NUMERIC,\n", "Daily_Steps NUMERIC,\n", "Sleep_Disorder TEXT COLLATE NOCASE\u001b[0m\n", - "\u001b[32m2024-01-27 20:35:06.578\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.db_config\u001b[0m:\u001b[36mcreate_table\u001b[0m:\u001b[36m198\u001b[0m - \u001b[1mTable created: sleep_health_eda\u001b[0m\n" + "\u001b[32m2024-06-12 16:44:00.839\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.db_config\u001b[0m:\u001b[36mcreate_table\u001b[0m:\u001b[36m198\u001b[0m - \u001b[1mTable created: sleep_health_eda\u001b[0m\n" ] }, { @@ -127,17 +127,17 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m2024-01-27 20:35:06.586\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.db_config\u001b[0m:\u001b[36madd_samples\u001b[0m:\u001b[36m222\u001b[0m - \u001b[34m\u001b[1mAdding sample values to table: ./sleep_health_and_lifestyle_dataset.csv\u001b[0m\n", - "\u001b[32m2024-01-27 20:35:06.597\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.db_config\u001b[0m:\u001b[36madd_samples\u001b[0m:\u001b[36m228\u001b[0m - \u001b[34m\u001b[1mInserting chunk: 0\u001b[0m\n", - "\u001b[32m2024-01-27 20:35:06.755\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.db_config\u001b[0m:\u001b[36madd_samples\u001b[0m:\u001b[36m233\u001b[0m - \u001b[1mData inserted into table: sleep_health_eda\u001b[0m\n", - "\u001b[32m2024-01-27 20:35:06.759\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.db_config\u001b[0m:\u001b[36madd_samples\u001b[0m:\u001b[36m238\u001b[0m - \u001b[1mNumber of rows inserted: 2618\u001b[0m\n" + "\u001b[32m2024-06-12 16:44:00.841\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.db_config\u001b[0m:\u001b[36madd_samples\u001b[0m:\u001b[36m222\u001b[0m - \u001b[34m\u001b[1mAdding sample values to table: examples/demo/demo_data.csv\u001b[0m\n", + "\u001b[32m2024-06-12 16:44:00.843\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.db_config\u001b[0m:\u001b[36madd_samples\u001b[0m:\u001b[36m228\u001b[0m - \u001b[34m\u001b[1mInserting chunk: 0\u001b[0m\n", + "\u001b[32m2024-06-12 16:44:01.024\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.db_config\u001b[0m:\u001b[36madd_samples\u001b[0m:\u001b[36m233\u001b[0m - \u001b[1mData inserted into table: sleep_health_eda\u001b[0m\n", + "\u001b[32m2024-06-12 16:44:01.025\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.db_config\u001b[0m:\u001b[36madd_samples\u001b[0m:\u001b[36m238\u001b[0m - \u001b[1mNumber of rows inserted: 4114\u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Created a Database querydb. Inserted sample values from ./sleep_health_and_lifestyle_dataset.csv into table sleep_health_eda, please ask questions!\n" + "Created a Database querydb. Inserted sample values from examples/demo/demo_data.csv into table sleep_health_eda, please ask questions!\n" ] } ], @@ -151,7 +151,7 @@ "\n", "# Given .csv file, auto-generate schema\n", "# Download dataset --> https://www.kaggle.com/datasets/uom190346a/sleep-health-and-lifestyle-dataset\n", - "data_path = \"./sleep_health_and_lifestyle_dataset.csv\"\n", + "data_path = \"examples/demo/demo_data.csv\"\n", "table_name = \"sleep_health_eda\"\n", "\n", "r, table_info_path = generate_schema(data_path=data_path, output_path=f\"{cache_path}/{table_name}_table_info.jsonl\")\n", @@ -171,7 +171,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "id": "80dec22c-362e-41a0-8f34-0690465542e6", "metadata": {}, "outputs": [ @@ -182,14 +182,16 @@ " 'h2ogpt-sql-sqlcoder-34b-alpha-4bit',\n", " 'h2ogpt-sql-nsql-llama-2-7B-4bit',\n", " 'h2ogpt-sql-sqlcoder2',\n", + " 'h2ogpt-sql-sqlcoder-7b-2',\n", " 'h2ogpt-sql-sqlcoder-34b-alpha',\n", " 'h2ogpt-sql-nsql-llama-2-7B',\n", " 'gpt-3.5-turbo',\n", " 'gpt-4-8k',\n", - " 'gpt-4-1106-preview-128k']" + " 'gpt-4-1106-preview-128k',\n", + " 'gpt-4']" ] }, - "execution_count": 9, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -201,7 +203,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "id": "2b3db015-1d9e-46b0-ad58-2f5aac0c6e4c", "metadata": {}, "outputs": [], @@ -225,7 +227,7 @@ " sample_queries_path=sample_qna_path,\n", " table_name=table_name,\n", " is_command=False,\n", - " model_name=\"h2ogpt-sql-sqlcoder2-4bit\", #Other default model option: h2ogpt-sql-sqlcoder-34b-alpha\n", + " model_name=\"gpt-4o\", #Other default model option: h2ogpt-sql-sqlcoder-34b-alpha\n", " is_regenerate=regenerate,\n", " is_regen_with_options=regenerate_with_options,\n", " execute_query=False,\n", @@ -236,7 +238,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "id": "523f1a88-eea8-414c-89b1-b7a2b3126535", "metadata": {}, "outputs": [ @@ -244,280 +246,67 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m2024-01-27 20:35:33.226\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m500\u001b[0m - \u001b[1mTable in use: ['sleep_health_eda']\u001b[0m\n", - "\u001b[32m2024-01-27 20:35:33.229\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m501\u001b[0m - \u001b[1mSQL dialect for generation: sqlite\u001b[0m\n", - "\u001b[32m2024-01-27 20:35:33.231\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m534\u001b[0m - \u001b[1mSetting context...\u001b[0m\n", - "\u001b[32m2024-01-27 20:35:33.232\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m535\u001b[0m - \u001b[1mQuestion: What is the average sleep duration for each gender?\u001b[0m\n", - "\u001b[32m2024-01-27 20:35:33.234\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m553\u001b[0m - \u001b[34m\u001b[1mTable info path: .//var/lib/tmp/sleep_health_eda_table_info.jsonl\u001b[0m\n", - "\u001b[32m2024-01-27 20:35:33.235\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36mis_resource_low\u001b[0m:\u001b[36m355\u001b[0m - \u001b[1mNumber of GPUs: 1\u001b[0m\n", - "\u001b[32m2024-01-27 20:35:33.236\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36mis_resource_low\u001b[0m:\u001b[36m359\u001b[0m - \u001b[34m\u001b[1mInformation on device: 0\u001b[0m\n", - "\u001b[32m2024-01-27 20:35:35.049\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36mis_resource_low\u001b[0m:\u001b[36m362\u001b[0m - \u001b[1mTotal Memory: 23GB\u001b[0m\n", - "\u001b[32m2024-01-27 20:35:35.055\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36mis_resource_low\u001b[0m:\u001b[36m363\u001b[0m - \u001b[1mFree GPU memory: 20GB\u001b[0m\n", - "\u001b[32m2024-01-27 20:35:35.057\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.query\u001b[0m:\u001b[36m__new__\u001b[0m:\u001b[36m83\u001b[0m - \u001b[1mLoading local model: h2ogpt-sql-sqlcoder2-4bit\u001b[0m\n", - "\u001b[32m2024-01-27 20:35:35.058\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36mload_causal_lm_model\u001b[0m:\u001b[36m382\u001b[0m - \u001b[1mTotal GPUs: 1\u001b[0m\n", - "\u001b[32m2024-01-27 20:35:35.059\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36m_load_llm\u001b[0m:\u001b[36m390\u001b[0m - \u001b[1mFree GPU memory: 20GB\u001b[0m\n", - "\u001b[32m2024-01-27 20:35:35.060\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36m_load_llm\u001b[0m:\u001b[36m393\u001b[0m - \u001b[1mLoading model: defog/sqlcoder2 on device id: 0\u001b[0m\n", - "\u001b[32m2024-01-27 20:35:35.062\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36m_load_llm\u001b[0m:\u001b[36m394\u001b[0m - \u001b[34m\u001b[1mModel cache: .//models/\u001b[0m\n", - "\u001b[32m2024-01-27 20:35:35.063\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36m_load_llm\u001b[0m:\u001b[36m432\u001b[0m - \u001b[34m\u001b[1mLoading in 4 bit mode: True with device {'': 0}\u001b[0m\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "dec7435d27704941a96dcdb9951ed10e", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Loading checkpoint shards: 0%| | 0/4 [00:00> or ->\n", - "- Use prepared statements with parameterized queries to prevent SQL injection\n", "\n", - "\n", - "### Input:\n", - "For SQL TABLE 'sleep_health_eda' with sample question/answer pairs,\n", - "(), create a valid SQL (dialect:sqlite) query to answer the following question:\n", - "What is the average sleep duration for each gender?.\n", - "This query will run on a database whose schema is represented in this string:\n", - "CREATE TABLE 'sleep_health_eda' (['Person_ID NUMERIC, Gender TEXT, Age NUMERIC, Occupation TEXT, Sleep_Duration NUMERIC, Quality_of_Sleep NUMERIC, Physical_Activity_Level NUMERIC, Stress_Level NUMERIC, BMI_Category TEXT, Blood_Pressure TEXT, Heart_Rate NUMERIC, Daily_Steps NUMERIC, Sleep_Disorder TEXT,']\n", - ");\n", - "\n", - "-- Table 'sleep_health_eda', , has sample values ({'sleep_health_eda': [\"'Gender' contains values similar to Male,Female.\", \"'Occupation' contains values similar to Lawyer,Teacher,Doctor,Software Engineer,Scientist,Sales Representative,Accountant,Salesperson,Manager,Nurse.\", \"'BMI_Category' contains values similar to Overweight,Normal,Obese,Normal Weight.\", \"'Sleep_Disorder' contains values similar to None,Sleep Apnea,Insomnia.\"]})\n", - "\n", - "### Response:\n", - "Based on your instructions, here is the SELECT SQL query I have generated to answer the question `What is the average sleep duration for each gender?`:\n", - "```SELECT\u001b[0m\n", - "\u001b[32m2024-01-27 20:36:22.461\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.query\u001b[0m:\u001b[36mgenerate_sql\u001b[0m:\u001b[36m590\u001b[0m - \u001b[1mContext length: 743\u001b[0m\n", - "Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.\n", - "\u001b[32m2024-01-27 20:36:30.891\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m608\u001b[0m - \u001b[1mInput query: What is the average sleep duration for each gender?\u001b[0m\n", - "\u001b[32m2024-01-27 20:36:30.895\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m609\u001b[0m - \u001b[1mGenerated response:\n", - "\n", - "SELECT \"gender\", AVG(\"sleep_duration\") AS \"average_sleep_duration\" FROM \"sleep_health_eda\" GROUP BY \"gender\" LIMIT 100\u001b[0m\n", - "\u001b[32m2024-01-27 20:36:30.905\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m635\u001b[0m - \u001b[1mAlternate responses:\n", - "\n", - "[]\u001b[0m\n" + "\u001b[32m2024-06-12 16:44:02.338\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36m_check_file_info\u001b[0m:\u001b[36m472\u001b[0m - \u001b[1mUsing information info from path .//var/lib/tmp/sleep_health_eda_table_info.jsonl\u001b[0m\n", + "\u001b[32m2024-06-12 16:44:02.343\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.query\u001b[0m:\u001b[36mgenerate_tasks\u001b[0m:\u001b[36m354\u001b[0m - \u001b[1mNumber of context queries found: 0\u001b[0m\n", + "\u001b[32m2024-06-12 16:44:02.343\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.query\u001b[0m:\u001b[36mgenerate_tasks\u001b[0m:\u001b[36m373\u001b[0m - \u001b[1mNumber of possible contextual queries to question: 0\u001b[0m\n" ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "Exiting...\n" + "ename": "ValueError", + "evalue": "Invalid model name gpt-4o. Available models: dict_keys(['h2ogpt-sql-sqlcoder2-4bit', 'h2ogpt-sql-sqlcoder-34b-alpha-4bit', 'h2ogpt-sql-nsql-llama-2-7B-4bit', 'h2ogpt-sql-sqlcoder2', 'h2ogpt-sql-sqlcoder-7b-2', 'h2ogpt-sql-sqlcoder-34b-alpha', 'h2ogpt-sql-nsql-llama-2-7B', 'gpt-3.5-turbo', 'gpt-4-8k', 'gpt-4-1106-preview-128k', 'gpt-4'])", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[10], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[43mquery\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mWhat is the average sleep duration for each gender?\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtable_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43msleep_health_eda\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2\u001b[0m \u001b[43m \u001b[49m\u001b[43mtable_info_path\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtable_info_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msample_qna_path\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\n", + "Cell \u001b[0;32mIn[9], line 14\u001b[0m, in \u001b[0;36mquery\u001b[0;34m(question, table_name, table_info_path, sample_qna_path, regenerate, regenerate_with_options)\u001b[0m\n\u001b[1;32m 12\u001b[0m base_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m./\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;66;03m# self_correction is enabled by default, set to False if not needed.\u001b[39;00m\n\u001b[0;32m---> 14\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[43mask\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 15\u001b[0m \u001b[43m \u001b[49m\u001b[43mquestion\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mquestion\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 16\u001b[0m \u001b[43m \u001b[49m\u001b[43mtable_info_path\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtable_info_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 17\u001b[0m \u001b[43m \u001b[49m\u001b[43msample_queries_path\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msample_qna_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 18\u001b[0m \u001b[43m \u001b[49m\u001b[43mtable_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtable_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 19\u001b[0m \u001b[43m \u001b[49m\u001b[43mis_command\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 20\u001b[0m \u001b[43m \u001b[49m\u001b[43mmodel_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mgpt-4o\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m#Other default model option: h2ogpt-sql-sqlcoder-34b-alpha\u001b[39;49;00m\n\u001b[1;32m 21\u001b[0m \u001b[43m \u001b[49m\u001b[43mis_regenerate\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mregenerate\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 22\u001b[0m \u001b[43m \u001b[49m\u001b[43mis_regen_with_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mregenerate_with_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 23\u001b[0m \u001b[43m \u001b[49m\u001b[43mexecute_query\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 24\u001b[0m \u001b[43m \u001b[49m\u001b[43mlocal_base_path\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbase_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 25\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 26\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m res\n", + "File \u001b[0;32m~/projects/code/sql-sidekick/sidekick/prompter.py:578\u001b[0m, in \u001b[0;36mask\u001b[0;34m(question, table_info_path, sample_queries_path, table_name, model_name, db_dialect, execute_db_dialect, is_regenerate, is_regen_with_options, is_command, debug_mode, execute_query, guardrails, self_correction, local_base_path)\u001b[0m\n\u001b[1;32m 564\u001b[0m sql_g \u001b[38;5;241m=\u001b[39m SQLGenerator(\n\u001b[1;32m 565\u001b[0m db_url\u001b[38;5;241m=\u001b[39mdb_url,\n\u001b[1;32m 566\u001b[0m openai_key\u001b[38;5;241m=\u001b[39mapi_key,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 575\u001b[0m remote_model\u001b[38;5;241m=\u001b[39m_remote_model\n\u001b[1;32m 576\u001b[0m )\n\u001b[1;32m 577\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m model_name \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mh2ogpt-sql\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m model_name \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m _execute_sql(question):\n\u001b[0;32m--> 578\u001b[0m sql_g\u001b[38;5;241m.\u001b[39m_tasks \u001b[38;5;241m=\u001b[39m \u001b[43msql_g\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgenerate_tasks\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtable_names\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mquestion\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 579\u001b[0m results\u001b[38;5;241m.\u001b[39mextend([\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mI am thinking step by step: \u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m, sql_g\u001b[38;5;241m.\u001b[39m_tasks, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m])\n\u001b[1;32m 580\u001b[0m click\u001b[38;5;241m.\u001b[39mecho(sql_g\u001b[38;5;241m.\u001b[39m_tasks)\n", + "File \u001b[0;32m~/projects/code/sql-sidekick/sidekick/query.py:391\u001b[0m, in \u001b[0;36mSQLGenerator.generate_tasks\u001b[0;34m(self, table_names, input_question)\u001b[0m\n\u001b[1;32m 389\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m task_list\n\u001b[1;32m 390\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m se:\n\u001b[0;32m--> 391\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m se\n", + "File \u001b[0;32m~/projects/code/sql-sidekick/sidekick/query.py:386\u001b[0m, in \u001b[0;36mSQLGenerator.generate_tasks\u001b[0;34m(self, table_names, input_question)\u001b[0m\n\u001b[1;32m 384\u001b[0m data_info \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m+\u001b[39m json\u001b[38;5;241m.\u001b[39mdumps(data)\n\u001b[1;32m 385\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_data_info \u001b[38;5;241m=\u001b[39m data_info\n\u001b[0;32m--> 386\u001b[0m task_list \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_query_tasks\u001b[49m\u001b[43m(\u001b[49m\u001b[43minput_question\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdata_info\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m_queries\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtable_names\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 387\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mopen\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/var/lib/tmp/data/tasks.txt\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mw\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[1;32m 388\u001b[0m f\u001b[38;5;241m.\u001b[39mwrite(task_list)\n", + "File \u001b[0;32m~/projects/code/sql-sidekick/sidekick/query.py:239\u001b[0m, in \u001b[0;36mSQLGenerator._query_tasks\u001b[0;34m(self, question_str, data_info, sample_queries, table_name)\u001b[0m\n\u001b[1;32m 237\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m res\n\u001b[1;32m 238\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m ve:\n\u001b[0;32m--> 239\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m ve\n\u001b[1;32m 240\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m se:\n\u001b[1;32m 241\u001b[0m _, ex_value, _ \u001b[38;5;241m=\u001b[39m sys\u001b[38;5;241m.\u001b[39mexc_info()\n", + "File \u001b[0;32m~/projects/code/sql-sidekick/sidekick/query.py:227\u001b[0m, in \u001b[0;36mSQLGenerator._query_tasks\u001b[0;34m(self, question_str, data_info, sample_queries, table_name)\u001b[0m\n\u001b[1;32m 225\u001b[0m m_name \u001b[38;5;241m=\u001b[39m MODEL_CHOICE_MAP\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel_name)\n\u001b[1;32m 226\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m m_name \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 227\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mInvalid model name \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m. Available models: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mMODEL_CHOICE_MAP\u001b[38;5;241m.\u001b[39mkeys()\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 229\u001b[0m completion \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mopenai_client\u001b[38;5;241m.\u001b[39mchat\u001b[38;5;241m.\u001b[39mcompletions\u001b[38;5;241m.\u001b[39mcreate(\n\u001b[1;32m 230\u001b[0m model\u001b[38;5;241m=\u001b[39mm_name,\n\u001b[1;32m 231\u001b[0m messages\u001b[38;5;241m=\u001b[39mquery_txt,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 234\u001b[0m temperature\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.7\u001b[39m\n\u001b[1;32m 235\u001b[0m )\n\u001b[1;32m 236\u001b[0m res \u001b[38;5;241m=\u001b[39m completion\u001b[38;5;241m.\u001b[39mchoices[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;241m.\u001b[39mmessage\u001b[38;5;241m.\u001b[39mcontent\n", + "\u001b[0;31mValueError\u001b[0m: Invalid model name gpt-4o. Available models: dict_keys(['h2ogpt-sql-sqlcoder2-4bit', 'h2ogpt-sql-sqlcoder-34b-alpha-4bit', 'h2ogpt-sql-nsql-llama-2-7B-4bit', 'h2ogpt-sql-sqlcoder2', 'h2ogpt-sql-sqlcoder-7b-2', 'h2ogpt-sql-sqlcoder-34b-alpha', 'h2ogpt-sql-nsql-llama-2-7B', 'gpt-3.5-turbo', 'gpt-4-8k', 'gpt-4-1106-preview-128k', 'gpt-4'])" ] } ], "source": [ - "res = query(\"What is the average sleep duration for each gender?\", table_name=\"sleep_health_eda\", \n", + "res = query(\"What is the average sleep duration for each gender?\", table_name=\"sleep_health_eda\",\n", " table_info_path=table_info_path, sample_qna_path=None)" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "id": "b17e2b4f-8736-4d44-addc-db8d2be4ce51", "metadata": {}, "outputs": [ @@ -525,19 +314,19 @@ "name": "stdout", "output_type": "stream", "text": [ - "Question = **Generated response for question,**\n", - "What is the average sleep duration for each gender?\n", - "\n", - "----\n", - "Generated SQL = ``` sql\n", - "SELECT \"gender\",\n", - " AVG(\"sleep_duration\") AS \"average_sleep_duration\"\n", - "FROM \"sleep_health_eda\"\n", - "GROUP BY \"gender\"\n", - "LIMIT 100\n", - "```\n", - "\n", - "\n" + "Question = Something went wrong while generating response. Please check the supplied API Keys and try again.\n", + "----\n" + ] + }, + { + "ename": "IndexError", + "evalue": "list index out of range", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[11], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mQuestion = \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mres[\u001b[38;5;241m0\u001b[39m][\u001b[38;5;241m0\u001b[39m]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m----\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m----> 3\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mGenerated SQL = \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mres[\u001b[38;5;241m0\u001b[39m][\u001b[38;5;241m1\u001b[39m]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n", + "\u001b[0;31mIndexError\u001b[0m: list index out of range" ] } ], @@ -549,7 +338,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "id": "03c5dfc0-c6f0-4573-b36d-56dc7bcbe8bc", "metadata": {}, "outputs": [ @@ -557,116 +346,143 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m2024-01-27 20:39:50.016\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m500\u001b[0m - \u001b[1mTable in use: ['sleep_health_eda']\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:50.017\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m501\u001b[0m - \u001b[1mSQL dialect for generation: sqlite\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:50.018\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m534\u001b[0m - \u001b[1mSetting context...\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:50.019\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m535\u001b[0m - \u001b[1mQuestion: What are the most common occupations among individuals in the dataset?\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:50.020\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m553\u001b[0m - \u001b[34m\u001b[1mTable info path: .//var/lib/tmp/sleep_health_eda_table_info.jsonl\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:50.021\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36mis_resource_low\u001b[0m:\u001b[36m355\u001b[0m - \u001b[1mNumber of GPUs: 1\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:50.022\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36mis_resource_low\u001b[0m:\u001b[36m359\u001b[0m - \u001b[34m\u001b[1mInformation on device: 0\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:50.023\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36mis_resource_low\u001b[0m:\u001b[36m362\u001b[0m - \u001b[1mTotal Memory: 23GB\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:50.024\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36mis_resource_low\u001b[0m:\u001b[36m363\u001b[0m - \u001b[1mFree GPU memory: 8GB\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:50.038\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36m_check_file_info\u001b[0m:\u001b[36m469\u001b[0m - \u001b[1mUsing information info from path .//var/lib/tmp/sleep_health_eda_table_info.jsonl\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:50.039\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m603\u001b[0m - \u001b[1mComputing user request ...\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:50.043\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36msemantic_search\u001b[0m:\u001b[36m155\u001b[0m - \u001b[34m\u001b[1mInput questions: # query: what are the most common occupations among individuals in the dataset?\u001b[0m\n" + "\u001b[32m2024-06-12 16:27:02.898\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m504\u001b[0m - \u001b[1mTable in use: ['sleep_health_eda']\u001b[0m\n", + "\u001b[32m2024-06-12 16:27:02.899\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m505\u001b[0m - \u001b[1mSQL dialect for generation: sqlite\u001b[0m\n", + "\u001b[32m2024-06-12 16:27:02.899\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m534\u001b[0m - \u001b[1mOpenAI key found.\u001b[0m\n", + "\u001b[32m2024-06-12 16:27:02.900\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m538\u001b[0m - \u001b[1mSetting context...\u001b[0m\n", + "\u001b[32m2024-06-12 16:27:02.900\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m539\u001b[0m - \u001b[1mQuestion: What are the most common occupations among individuals in the dataset?\u001b[0m\n", + "\u001b[32m2024-06-12 16:27:02.901\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m557\u001b[0m - \u001b[34m\u001b[1mTable info path: .//var/lib/tmp/sleep_health_eda_table_info.jsonl\u001b[0m\n", + "\u001b[32m2024-06-12 16:27:02.901\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36mis_resource_low\u001b[0m:\u001b[36m358\u001b[0m - \u001b[1mNumber of GPUs: 1\u001b[0m\n", + "\u001b[32m2024-06-12 16:27:02.902\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36mis_resource_low\u001b[0m:\u001b[36m362\u001b[0m - \u001b[34m\u001b[1mInformation on device: 0\u001b[0m\n", + "\u001b[32m2024-06-12 16:27:02.902\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36mis_resource_low\u001b[0m:\u001b[36m365\u001b[0m - \u001b[1mTotal Memory: 7GB\u001b[0m\n", + "\u001b[32m2024-06-12 16:27:02.903\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36mis_resource_low\u001b[0m:\u001b[36m366\u001b[0m - \u001b[1mFree GPU memory: 6GB\u001b[0m\n", + "\u001b[32m2024-06-12 16:27:02.903\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36mload_embedding_model\u001b[0m:\u001b[36m103\u001b[0m - \u001b[34m\u001b[1mLoading embedding model from: .//models/sentence_transformers\u001b[0m\n", + "Fetching 19 files: 100%|██████████| 19/19 [00:00<00:00, 212511.40it/s]" ] }, { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "e4589b85d3514f2ea3c88a505f15698c", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Batches: 0%| | 0/1 [00:00>>>>>>>>>>>>>> gpt-4\n", + "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", + "HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", + "1. Scan through the sleep_health_eda table and identify the column name associated with occupations. From the Data section, we see that this column is named \"Occupation\".\n", + "2. Calculate the count of each unique occupation present in the \"Occupation\" column. This involves grouping the data by occupation and counting the number of entries for each.\n", + "3. Sort the calculated counts in descending order. This will allow us to see the occupations with the most counts at the top.\n", + "4. Return the occupations and their respective counts from the query. The return type inferred is a list of occupations and their corresponding counts.\n" + ] }, { "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m2024-01-27 20:39:50.139\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36msemantic_search\u001b[0m:\u001b[36m168\u001b[0m - \u001b[34m\u001b[1mSimilarity score for: if patterns like 'current time' or 'now' occurs in question: 0.8284876985286928\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:50.141\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36msemantic_search\u001b[0m:\u001b[36m168\u001b[0m - \u001b[34m\u001b[1mSimilarity score for: if patterns like 'total number', or 'list' occurs in question: 0.8591431101102107\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:50.143\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36msemantic_search\u001b[0m:\u001b[36m168\u001b[0m - \u001b[34m\u001b[1mSimilarity score for: detailed summary: 0.8650206131706182\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:50.146\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36msemantic_search\u001b[0m:\u001b[36m168\u001b[0m - \u001b[34m\u001b[1mSimilarity score for: summary: 0.8724867083448907\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:50.147\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36msemantic_search\u001b[0m:\u001b[36m184\u001b[0m - \u001b[34m\u001b[1mSorted context: []\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:50.148\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.query\u001b[0m:\u001b[36mgenerate_sql\u001b[0m:\u001b[36m486\u001b[0m - \u001b[34m\u001b[1mFilter Context: []\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:50.149\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.query\u001b[0m:\u001b[36mgenerate_sql\u001b[0m:\u001b[36m494\u001b[0m - \u001b[1mFiltering Question/Query pairs ...\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:50.150\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.query\u001b[0m:\u001b[36mgenerate_sql\u001b[0m:\u001b[36m496\u001b[0m - \u001b[1mNumber of context queries found: 0\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:50.151\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.query\u001b[0m:\u001b[36mgenerate_sql\u001b[0m:\u001b[36m516\u001b[0m - \u001b[1mNumber of possible contextual queries to question: 0\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:50.152\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36mre_rank\u001b[0m:\u001b[36m138\u001b[0m - \u001b[34m\u001b[1mQuestion length: 11\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:50.153\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.query\u001b[0m:\u001b[36mgenerate_sql\u001b[0m:\u001b[36m561\u001b[0m - \u001b[34m\u001b[1mRelevant sample column values: {'sleep_health_eda': [\"'Gender' contains values similar to Male,Female.\", \"'Occupation' contains values similar to Lawyer,Teacher,Doctor,Software Engineer,Scientist,Sales Representative,Accountant,Salesperson,Manager,Nurse.\", \"'BMI_Category' contains values similar to Overweight,Normal,Obese,Normal Weight.\", \"'Sleep_Disorder' contains values similar to None,Sleep Apnea,Insomnia.\"]}\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:50.155\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.query\u001b[0m:\u001b[36mgenerate_sql\u001b[0m:\u001b[36m578\u001b[0m - \u001b[34m\u001b[1mQuery Text:\n", + "\u001b[32m2024-06-12 16:27:09.104\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m607\u001b[0m - \u001b[1mComputing user request ...\u001b[0m\n", + "\u001b[32m2024-06-12 16:27:09.105\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.query\u001b[0m:\u001b[36mgenerate_sql\u001b[0m:\u001b[36m427\u001b[0m - \u001b[34m\u001b[1mQuery Text:\n", " \n", - "### Instructions:\n", - "Your task is convert a question into a valid sqlite syntax SQL query, given a sqlite database schema. Let's work this out step by step to be sure we have the right answer.\n", - "Only use the column names from the CREATE TABLE statement.\n", - "Adhere to these rules:\n", - "- **Deliberately go through the question and database schema word by word** to appropriately answer the question\n", - "- **Use Table Aliases** to prevent ambiguity. For example, `SELECT table1.col1, table2.col1 FROM table1 JOIN table2 ON table1.id = table2.id`.\n", - "- Only use supplied table names: **sleep_health_eda** for generation\n", - "- Only use column names from the CREATE TABLE statement: **['Person_ID NUMERIC, Gender TEXT, Age NUMERIC, Occupation TEXT, Sleep_Duration NUMERIC, Quality_of_Sleep NUMERIC, Physical_Activity_Level NUMERIC, Stress_Level NUMERIC, BMI_Category TEXT, Blood_Pressure TEXT, Heart_Rate NUMERIC, Daily_Steps NUMERIC, Sleep_Disorder TEXT,']** for generation. DO NOT USE any other column names outside of this.\n", - "- Avoid overly complex SQL queries, favor concise human readable SQL queries which are easy to understand and debug\n", - "- Avoid patterns that might be vulnerable to SQL injection, e.g. use proper sanitization and escaping for raw user input\n", - "- Always cast the numerator as float when computing ratios\n", - "- Always use COUNT(1) instead of COUNT(*)\n", - "- If the question is asking for a rate, use COUNT to compute percentage\n", - "- Avoid using the WITH statement\n", - "- DO NOT USE aggregate and window function together\n", - "- Prefer NOT EXISTS to LEFT JOIN ON null id\n", - "- When using DESC keep NULLs at the end\n", - "- If JSONB format found in Table schema, do pattern matching on keywords from the question and use SQL functions such as ->> or ->\n", - "- Use prepared statements with parameterized queries to prevent SQL injection\n", - "\n", - "\n", - "### Input:\n", - "For SQL TABLE 'sleep_health_eda' with sample question/answer pairs,\n", - "(), create a valid SQL (dialect:sqlite) query to answer the following question:\n", - "What are the most common occupations among individuals in the dataset?.\n", - "This query will run on a database whose schema is represented in this string:\n", - "CREATE TABLE 'sleep_health_eda' (['Person_ID NUMERIC, Gender TEXT, Age NUMERIC, Occupation TEXT, Sleep_Duration NUMERIC, Quality_of_Sleep NUMERIC, Physical_Activity_Level NUMERIC, Stress_Level NUMERIC, BMI_Category TEXT, Blood_Pressure TEXT, Heart_Rate NUMERIC, Daily_Steps NUMERIC, Sleep_Disorder TEXT,']\n", - ");\n", - "\n", - "-- Table 'sleep_health_eda', , has sample values ({'sleep_health_eda': [\"'Gender' contains values similar to Male,Female.\", \"'Occupation' contains values similar to Lawyer,Teacher,Doctor,Software Engineer,Scientist,Sales Representative,Accountant,Salesperson,Manager,Nurse.\", \"'BMI_Category' contains values similar to Overweight,Normal,Obese,Normal Weight.\", \"'Sleep_Disorder' contains values similar to None,Sleep Apnea,Insomnia.\"]})\n", - "\n", - "### Response:\n", - "Based on your instructions, here is the SELECT SQL query I have generated to answer the question `What are the most common occupations among individuals in the dataset?`:\n", - "```SELECT\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:50.161\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.query\u001b[0m:\u001b[36mgenerate_sql\u001b[0m:\u001b[36m590\u001b[0m - \u001b[1mContext length: 749\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:50.162\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.query\u001b[0m:\u001b[36mgenerate_sql\u001b[0m:\u001b[36m651\u001b[0m - \u001b[1mRegeneration requested on previous query ...\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:50.163\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.query\u001b[0m:\u001b[36mgenerate_sql\u001b[0m:\u001b[36m652\u001b[0m - \u001b[34m\u001b[1mSelected temperature for fast regeneration : 0.8\u001b[0m\n", - "Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.\n", - "\u001b[32m2024-01-27 20:39:52.499\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.query\u001b[0m:\u001b[36mgenerate_sql\u001b[0m:\u001b[36m675\u001b[0m - \u001b[34m\u001b[1mTemperature saved: 0.8\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:52.512\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m608\u001b[0m - \u001b[1mInput query: What are the most common occupations among individuals in the dataset?\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:52.513\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m609\u001b[0m - \u001b[1mGenerated response:\n", + " ### System: Act as a SQL Expert\n", + " # For table ['sleep_health_eda'], given an input *Question*, only generate syntactically correct sqlite SQL queries.\n", + " # Let's work it out in a detailed step by step way using the reasoning from *Tasks* section.\n", + " # Pick the SQL query which has the highest average log probability if more than one result is likely to answer the\n", + " candidate *Question*.\n", + " ### sqlite SQL tables\n", + " ### *Data:* \n", + "For table ['sleep_health_eda'] schema info is mentioned below,\n", + "\n", + "{\"Column Name\": \"Person_ID\", \"Column Type\": \"NUMERIC\"}\n", + "{\"Column Name\": \"Gender\", \"Column Type\": \"TEXT\", \"Sample Values\": [\"Male\", \"Female\"]}\n", + "{\"Column Name\": \"Age\", \"Column Type\": \"NUMERIC\"}\n", + "{\"Column Name\": \"Occupation\", \"Column Type\": \"TEXT\", \"Sample Values\": [\"Nurse\", \"Salesperson\", \"Manager\", \"Engineer\", \"Software Engineer\", \"Sales Representative\", \"Scientist\", \"Doctor\", \"Teacher\", \"Accountant\"]}\n", + "{\"Column Name\": \"Sleep_Duration\", \"Column Type\": \"NUMERIC\"}\n", + "{\"Column Name\": \"Quality_of_Sleep\", \"Column Type\": \"NUMERIC\"}\n", + "{\"Column Name\": \"Physical_Activity_Level\", \"Column Type\": \"NUMERIC\"}\n", + "{\"Column Name\": \"Stress_Level\", \"Column Type\": \"NUMERIC\"}\n", + "{\"Column Name\": \"BMI_Category\", \"Column Type\": \"TEXT\", \"Sample Values\": [\"Overweight\", \"Normal\", \"Obese\", \"Normal Weight\"]}\n", + "{\"Column Name\": \"Blood_Pressure\", \"Column Type\": \"TEXT\"}\n", + "{\"Column Name\": \"Heart_Rate\", \"Column Type\": \"NUMERIC\"}\n", + "{\"Column Name\": \"Daily_Steps\", \"Column Type\": \"NUMERIC\"}\n", + "{\"Column Name\": \"Sleep_Disorder\", \"Column Type\": \"TEXT\", \"Sample Values\": [\"None\", \"Sleep Apnea\", \"Insomnia\"]}\n", + " ### *History*:\n", + "\n", + " ### *Question*: For table ['sleep_health_eda'], What are the most common occupations among individuals in the dataset?\n", + " # SELECT 1\n", + " ### *Plan for table ['sleep_health_eda']*:\n", + "# 1. Scan through the sleep_health_eda table and identify the column name associated with occupations. From the Data section, we see that this column is named \"Occupation\".\n", + "# 2. Calculate the count of each unique occupation present in the \"Occupation\" column. This involves grouping the data by occupation and counting the number of entries for each.\n", + "# 3. Sort the calculated counts in descending order. This will allow us to see the occupations with the most counts at the top.\n", + "# 4. Return the occupations and their respective counts from the query. The return type inferred is a list of occupations and their corresponding counts.\n", + " ### *Policies for SQL generation*:\n", + " # Avoid overly complex SQL queries, favor concise human readable SQL queries which are easy to understand and debug\n", + " # Avoid patterns that might be vulnerable to SQL injection\n", + " # Use values and column names that are explicitly mentioned in the question or in the *Data* section.\n", + " # DO NOT query for columns that do not exist\n", + " # Validate column names with the table name when needed\n", + " # DO NOT USE aggregate and window function together\n", + " # Use COUNT(1) instead of COUNT(*)\n", + " # Return with LIMIT 100\n", + " # Prefer NOT EXISTS to LEFT JOIN ON null id\n", + " # Avoid using the WITH statement\n", + " # When using DESC keep NULLs at the end\n", + " # Always cast the numerator as float when computing ratios\n", + " # If JSONB format found in Table schema, do pattern matching on keywords from the question and use SQL functions such as ->> or ->\n", + " # Use prepared statements with parameterized queries to prevent SQL injection\n", + " # Add explanation and reasoning for each SQL query\n", + " \u001b[0m\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING:llama_index.service_context:chunk_size_limit is deprecated, please specify chunk_size instead\n", + "chunk_size_limit is deprecated, please specify chunk_size instead\n", + "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"\n", + "HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"\n", + "HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", + "HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", + "INFO:llama_index.indices.struct_store.sql_query:> Table desc str: Schema of table sleep_health_eda:\n", + "Table 'sleep_health_eda' has columns: Person_ID (NUMERIC), Gender (TEXT), Age (NUMERIC), Occupation (TEXT), Sleep_Duration (NUMERIC), Quality_of_Sleep (NUMERIC), Physical_Activity_Level (NUMERIC), Stress_Level (NUMERIC), BMI_Category (TEXT), Blood_Pressure (TEXT), Heart_Rate (NUMERIC), Daily_Steps (NUMERIC), Sleep_Disorder (TEXT), and foreign keys: .\n", + "\n", + "> Table desc str: Schema of table sleep_health_eda:\n", + "Table 'sleep_health_eda' has columns: Person_ID (NUMERIC), Gender (TEXT), Age (NUMERIC), Occupation (TEXT), Sleep_Duration (NUMERIC), Quality_of_Sleep (NUMERIC), Physical_Activity_Level (NUMERIC), Stress_Level (NUMERIC), BMI_Category (TEXT), Blood_Pressure (TEXT), Heart_Rate (NUMERIC), Daily_Steps (NUMERIC), Sleep_Disorder (TEXT), and foreign keys: .\n", + "\n", + "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", + "HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", + "HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[32m2024-06-12 16:27:16.212\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m612\u001b[0m - \u001b[1mInput query: What are the most common occupations among individuals in the dataset?\u001b[0m\n", + "\u001b[32m2024-06-12 16:27:16.213\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m613\u001b[0m - \u001b[1mGenerated response:\n", "\n", - "SELECT \"occupation\", COUNT(1) AS \"COUNT\" FROM \"sleep_health_eda\" GROUP BY \"occupation\" ORDER BY \"COUNT\" DESC LIMIT 100\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:52.516\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m635\u001b[0m - \u001b[1mAlternate responses:\n", + "SELECT \"Occupation\", COUNT(1) AS \"Count\" FROM \"sleep_health_eda\" GROUP BY \"Occupation\" ORDER BY \"Count\" DESC LIMIT 100\u001b[0m\n", + "\u001b[32m2024-06-12 16:27:16.214\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m639\u001b[0m - \u001b[1mAlternate responses:\n", "\n", "[]\u001b[0m\n" ] @@ -680,15 +496,15 @@ } ], "source": [ - "# On using re-generation flag we toggle the temperature values between 0 and 1 alternating between low \n", + "# On using re-generation flag we toggle the temperature values between 0 and 1 alternating between low\n", "# (focus/conservative generation and high values (random/creative generation)\n", - "res = query(\"What are the most common occupations among individuals in the dataset?\", table_name=\"sleep_health_eda\", \n", + "res = query(\"What are the most common occupations among individuals in the dataset?\", table_name=\"sleep_health_eda\",\n", " table_info_path=table_info_path, sample_qna_path=None, regenerate=True)" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "id": "cf2fc33d-ea21-4ab2-9019-329f5bc2051d", "metadata": {}, "outputs": [ @@ -696,20 +512,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "Question = **Generated response for question,**\n", - "What are the most common occupations among individuals in the dataset?\n", + "Question = I am thinking step by step: \n", "\n", "----\n", - "Generated SQL = ``` sql\n", - "SELECT \"occupation\",\n", - " COUNT(1) AS \"COUNT\"\n", - "FROM \"sleep_health_eda\"\n", - "GROUP BY \"occupation\"\n", - "ORDER BY \"COUNT\" DESC\n", - "LIMIT 100\n", - "```\n", - "\n", - "\n" + "Generated SQL = 1. Scan through the sleep_health_eda table and identify the column name associated with occupations. From the Data section, we see that this column is named \"Occupation\".\n", + "2. Calculate the count of each unique occupation present in the \"Occupation\" column. This involves grouping the data by occupation and counting the number of entries for each.\n", + "3. Sort the calculated counts in descending order. This will allow us to see the occupations with the most counts at the top.\n", + "4. Return the occupations and their respective counts from the query. The return type inferred is a list of occupations and their corresponding counts.\n" ] } ], @@ -721,7 +530,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "id": "b47bef8d-c991-4581-a7fc-23a056911c3f", "metadata": {}, "outputs": [ @@ -729,238 +538,145 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m2024-01-27 20:39:56.595\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m500\u001b[0m - \u001b[1mTable in use: ['sleep_health_eda']\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:56.597\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m501\u001b[0m - \u001b[1mSQL dialect for generation: sqlite\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:56.598\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m534\u001b[0m - \u001b[1mSetting context...\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:56.599\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m535\u001b[0m - \u001b[1mQuestion: What is the average sleep duration for each gender?\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:56.601\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m553\u001b[0m - \u001b[34m\u001b[1mTable info path: .//var/lib/tmp/sleep_health_eda_table_info.jsonl\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:56.602\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36mis_resource_low\u001b[0m:\u001b[36m355\u001b[0m - \u001b[1mNumber of GPUs: 1\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:56.604\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36mis_resource_low\u001b[0m:\u001b[36m359\u001b[0m - \u001b[34m\u001b[1mInformation on device: 0\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:56.605\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36mis_resource_low\u001b[0m:\u001b[36m362\u001b[0m - \u001b[1mTotal Memory: 23GB\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:56.607\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36mis_resource_low\u001b[0m:\u001b[36m363\u001b[0m - \u001b[1mFree GPU memory: 8GB\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:56.629\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36m_check_file_info\u001b[0m:\u001b[36m469\u001b[0m - \u001b[1mUsing information info from path .//var/lib/tmp/sleep_health_eda_table_info.jsonl\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:56.631\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m603\u001b[0m - \u001b[1mComputing user request ...\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:56.640\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36msemantic_search\u001b[0m:\u001b[36m155\u001b[0m - \u001b[34m\u001b[1mInput questions: # query: what is the average sleep duration for each gender?\u001b[0m\n" + "\u001b[32m2024-06-12 16:27:16.225\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m504\u001b[0m - \u001b[1mTable in use: ['sleep_health_eda']\u001b[0m\n", + "\u001b[32m2024-06-12 16:27:16.226\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m505\u001b[0m - \u001b[1mSQL dialect for generation: sqlite\u001b[0m\n", + "\u001b[32m2024-06-12 16:27:16.227\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m534\u001b[0m - \u001b[1mOpenAI key found.\u001b[0m\n", + "\u001b[32m2024-06-12 16:27:16.227\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m538\u001b[0m - \u001b[1mSetting context...\u001b[0m\n", + "\u001b[32m2024-06-12 16:27:16.228\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m539\u001b[0m - \u001b[1mQuestion: What is the average sleep duration for each gender?\u001b[0m\n", + "\u001b[32m2024-06-12 16:27:16.228\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m557\u001b[0m - \u001b[34m\u001b[1mTable info path: .//var/lib/tmp/sleep_health_eda_table_info.jsonl\u001b[0m\n", + "\u001b[32m2024-06-12 16:27:16.229\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36mis_resource_low\u001b[0m:\u001b[36m358\u001b[0m - \u001b[1mNumber of GPUs: 1\u001b[0m\n", + "\u001b[32m2024-06-12 16:27:16.229\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36mis_resource_low\u001b[0m:\u001b[36m362\u001b[0m - \u001b[34m\u001b[1mInformation on device: 0\u001b[0m\n", + "\u001b[32m2024-06-12 16:27:16.230\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36mis_resource_low\u001b[0m:\u001b[36m365\u001b[0m - \u001b[1mTotal Memory: 7GB\u001b[0m\n", + "\u001b[32m2024-06-12 16:27:16.231\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36mis_resource_low\u001b[0m:\u001b[36m366\u001b[0m - \u001b[1mFree GPU memory: 5GB\u001b[0m\n", + "\u001b[32m2024-06-12 16:27:16.232\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36mload_embedding_model\u001b[0m:\u001b[36m103\u001b[0m - \u001b[34m\u001b[1mLoading embedding model from: .//models/sentence_transformers\u001b[0m\n", + "Fetching 19 files: 100%|██████████| 19/19 [00:00<00:00, 189742.32it/s]" ] }, { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "5ed8c4c529c54952a30bfb4d99b7ec95", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Batches: 0%| | 0/1 [00:00>>>>>>>>>>>>>> gpt-4\n", + "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", + "HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", + "1. Identify the relevant columns in the question. In this case, we are interested in 'Gender' and 'Sleep_Duration'.\n", + "2. Aggregate the data by 'Gender' to split the data into different groups according to gender.\n", + "3. For each 'Gender' group, calculate the average 'Sleep_Duration'.\n", + "4. Return the 'Gender' along with the corresponding average 'Sleep_Duration' for each gender.\n" + ] }, { "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m2024-01-27 20:39:56.752\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36msemantic_search\u001b[0m:\u001b[36m168\u001b[0m - \u001b[34m\u001b[1mSimilarity score for: if patterns like 'current time' or 'now' occurs in question: 0.8459207869447033\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:56.755\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36msemantic_search\u001b[0m:\u001b[36m168\u001b[0m - \u001b[34m\u001b[1mSimilarity score for: if patterns like 'total number', or 'list' occurs in question: 0.8319947353454415\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:56.759\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36msemantic_search\u001b[0m:\u001b[36m168\u001b[0m - \u001b[34m\u001b[1mSimilarity score for: detailed summary: 0.8346069603076574\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:56.763\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36msemantic_search\u001b[0m:\u001b[36m168\u001b[0m - \u001b[34m\u001b[1mSimilarity score for: summary: 0.8394152180082535\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:56.764\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36msemantic_search\u001b[0m:\u001b[36m184\u001b[0m - \u001b[34m\u001b[1mSorted context: []\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:56.766\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.query\u001b[0m:\u001b[36mgenerate_sql\u001b[0m:\u001b[36m486\u001b[0m - \u001b[34m\u001b[1mFilter Context: []\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:56.767\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.query\u001b[0m:\u001b[36mgenerate_sql\u001b[0m:\u001b[36m494\u001b[0m - \u001b[1mFiltering Question/Query pairs ...\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:56.769\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.query\u001b[0m:\u001b[36mgenerate_sql\u001b[0m:\u001b[36m496\u001b[0m - \u001b[1mNumber of context queries found: 0\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:56.770\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.query\u001b[0m:\u001b[36mgenerate_sql\u001b[0m:\u001b[36m516\u001b[0m - \u001b[1mNumber of possible contextual queries to question: 0\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:56.775\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36mre_rank\u001b[0m:\u001b[36m138\u001b[0m - \u001b[34m\u001b[1mQuestion length: 9\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:56.776\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.query\u001b[0m:\u001b[36mgenerate_sql\u001b[0m:\u001b[36m561\u001b[0m - \u001b[34m\u001b[1mRelevant sample column values: {'sleep_health_eda': [\"'Gender' contains values similar to Male,Female.\", \"'Occupation' contains values similar to Lawyer,Teacher,Doctor,Software Engineer,Scientist,Sales Representative,Accountant,Salesperson,Manager,Nurse.\", \"'BMI_Category' contains values similar to Overweight,Normal,Obese,Normal Weight.\", \"'Sleep_Disorder' contains values similar to None,Sleep Apnea,Insomnia.\"]}\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:56.778\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.query\u001b[0m:\u001b[36mgenerate_sql\u001b[0m:\u001b[36m578\u001b[0m - \u001b[34m\u001b[1mQuery Text:\n", + "\u001b[32m2024-06-12 16:27:20.448\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m607\u001b[0m - \u001b[1mComputing user request ...\u001b[0m\n", + "\u001b[32m2024-06-12 16:27:20.450\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.query\u001b[0m:\u001b[36mgenerate_sql\u001b[0m:\u001b[36m427\u001b[0m - \u001b[34m\u001b[1mQuery Text:\n", " \n", - "### Instructions:\n", - "Your task is convert a question into a valid sqlite syntax SQL query, given a sqlite database schema. Let's work this out step by step to be sure we have the right answer.\n", - "Only use the column names from the CREATE TABLE statement.\n", - "Adhere to these rules:\n", - "- **Deliberately go through the question and database schema word by word** to appropriately answer the question\n", - "- **Use Table Aliases** to prevent ambiguity. For example, `SELECT table1.col1, table2.col1 FROM table1 JOIN table2 ON table1.id = table2.id`.\n", - "- Only use supplied table names: **sleep_health_eda** for generation\n", - "- Only use column names from the CREATE TABLE statement: **['Person_ID NUMERIC, Gender TEXT, Age NUMERIC, Occupation TEXT, Sleep_Duration NUMERIC, Quality_of_Sleep NUMERIC, Physical_Activity_Level NUMERIC, Stress_Level NUMERIC, BMI_Category TEXT, Blood_Pressure TEXT, Heart_Rate NUMERIC, Daily_Steps NUMERIC, Sleep_Disorder TEXT,']** for generation. DO NOT USE any other column names outside of this.\n", - "- Avoid overly complex SQL queries, favor concise human readable SQL queries which are easy to understand and debug\n", - "- Avoid patterns that might be vulnerable to SQL injection, e.g. use proper sanitization and escaping for raw user input\n", - "- Always cast the numerator as float when computing ratios\n", - "- Always use COUNT(1) instead of COUNT(*)\n", - "- If the question is asking for a rate, use COUNT to compute percentage\n", - "- Avoid using the WITH statement\n", - "- DO NOT USE aggregate and window function together\n", - "- Prefer NOT EXISTS to LEFT JOIN ON null id\n", - "- When using DESC keep NULLs at the end\n", - "- If JSONB format found in Table schema, do pattern matching on keywords from the question and use SQL functions such as ->> or ->\n", - "- Use prepared statements with parameterized queries to prevent SQL injection\n", - "\n", - "\n", - "### Input:\n", - "For SQL TABLE 'sleep_health_eda' with sample question/answer pairs,\n", - "(), create a valid SQL (dialect:sqlite) query to answer the following question:\n", - "What is the average sleep duration for each gender?.\n", - "This query will run on a database whose schema is represented in this string:\n", - "CREATE TABLE 'sleep_health_eda' (['Person_ID NUMERIC, Gender TEXT, Age NUMERIC, Occupation TEXT, Sleep_Duration NUMERIC, Quality_of_Sleep NUMERIC, Physical_Activity_Level NUMERIC, Stress_Level NUMERIC, BMI_Category TEXT, Blood_Pressure TEXT, Heart_Rate NUMERIC, Daily_Steps NUMERIC, Sleep_Disorder TEXT,']\n", - ");\n", - "\n", - "-- Table 'sleep_health_eda', , has sample values ({'sleep_health_eda': [\"'Gender' contains values similar to Male,Female.\", \"'Occupation' contains values similar to Lawyer,Teacher,Doctor,Software Engineer,Scientist,Sales Representative,Accountant,Salesperson,Manager,Nurse.\", \"'BMI_Category' contains values similar to Overweight,Normal,Obese,Normal Weight.\", \"'Sleep_Disorder' contains values similar to None,Sleep Apnea,Insomnia.\"]})\n", - "\n", - "### Response:\n", - "Based on your instructions, here is the SELECT SQL query I have generated to answer the question `What is the average sleep duration for each gender?`:\n", - "```SELECT\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:56.785\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.query\u001b[0m:\u001b[36mgenerate_sql\u001b[0m:\u001b[36m590\u001b[0m - \u001b[1mContext length: 743\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:56.787\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.query\u001b[0m:\u001b[36mgenerate_sql\u001b[0m:\u001b[36m677\u001b[0m - \u001b[1mRegeneration with options requested on previous query ...\u001b[0m\n", - "\u001b[32m2024-01-27 20:39:56.788\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.query\u001b[0m:\u001b[36mgenerate_sql\u001b[0m:\u001b[36m692\u001b[0m - \u001b[34m\u001b[1mSelected temperature for diverse beam search: 0.4\u001b[0m\n", - "/home/pramit/.jupyterven/lib/python3.8/site-packages/transformers/generation/configuration_utils.py:392: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `0.4` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.\n", - " warnings.warn(\n", - "/home/pramit/.jupyterven/lib/python3.8/site-packages/transformers/generation/configuration_utils.py:407: UserWarning: `do_sample` is set to `False`. However, `top_k` is set to `5` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_k`.\n", - " warnings.warn(\n", - "Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.\n", - "\u001b[32m2024-01-27 20:42:01.434\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.query\u001b[0m:\u001b[36mgenerate_sql\u001b[0m:\u001b[36m727\u001b[0m - \u001b[1mGenerated options:\n", - "\u001b[0m\n", - "\u001b[32m2024-01-27 20:42:01.440\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.query\u001b[0m:\u001b[36mgenerate_sql\u001b[0m:\u001b[36m748\u001b[0m - \u001b[1mAlternate options:\n", - "Option 1: (_probability_: 0.381034255027771)\n", - "``` sql\n", - "SELECT gender,\n", - " AVG(sleep_duration) AS average_sleep_duration\n", - "FROM sleep_health_eda\n", - "GROUP BY gender\n", - "ORDER BY average_sleep_duration DESC NULLS LAST\n", - "LIMIT 100;\n", - "```\n", - "\n", - "\n", - "\u001b[0m\n", - "\u001b[32m2024-01-27 20:42:01.444\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.query\u001b[0m:\u001b[36mgenerate_sql\u001b[0m:\u001b[36m748\u001b[0m - \u001b[1mAlternate options:\n", - "Option 2: (_probability_: 0.2624567449092865)\n", - "``` sql\n", - "SELECT AVG(sleep_duration) AS average_sleep_duration,\n", - " gender\n", - "FROM sleep_health_eda\n", - "GROUP BY gender\n", - "LIMIT 100;\n", - "```\n", - "\n", - "\n", - "\u001b[0m\n", - "\u001b[32m2024-01-27 20:42:01.446\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.query\u001b[0m:\u001b[36mgenerate_sql\u001b[0m:\u001b[36m748\u001b[0m - \u001b[1mAlternate options:\n", - "Option 3: (_probability_: 0.22498156130313873)\n", - "``` sql\n", - "SELECT Gender,\n", - " AVG(Sleep_Duration) AS average_duration\n", - "FROM sleep_health_eda\n", - "GROUP BY Gender\n", - "ORDER BY average_duration DESC NULLS LAST\n", - "LIMIT 100;\n", - "```\n", - "\n", - "\n", - "\u001b[0m\n", - "\u001b[32m2024-01-27 20:42:01.451\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.query\u001b[0m:\u001b[36mgenerate_sql\u001b[0m:\u001b[36m748\u001b[0m - \u001b[1mAlternate options:\n", - "Option 4: (_probability_: 0.13085876405239105)\n", - "``` sql\n", - "SELECT 'Gender',\n", - " AVG('Sleep_Duration') AS average_sleep_duration\n", - "FROM'sleep_health_eda'\n", - "GROUP BY 'Gender'\n", - "ORDER BY average_sleep_duration DESC NULLS LAST\n", - "LIMIT 100;\n", - "```\n", - "\n", - "\n", - "\u001b[0m\n", - "\u001b[32m2024-01-27 20:42:01.474\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.query\u001b[0m:\u001b[36mgenerate_sql\u001b[0m:\u001b[36m748\u001b[0m - \u001b[1mAlternate options:\n", - "Option 5: (_probability_: 0.0006686743581667542)\n", - "``` sql\n", - "SELECT sleep_health_eda.gender,\n", - " AVG(sleep_health_eda.sleep_duration) AS average_sleep_duration\n", - "FROM sleep_health_eda\n", - "GROUP BY sleep_health_eda.gender\n", - "ORDER BY average_sleep_duration DESC NULLS LAST;\n", - "\n", - "SELECT sleep_health_eda.gender,\n", - " AVG(sleep_health_eda.sleep_duration) AS average_sleep_duration\n", - "FROM sleep_health_eda\n", - "GROUP BY sleep_health_eda.gender\n", - "ORDER BY average_sleep_duration DESC NULLS LAST;\n", - "\n", - "SELECT sleep_health_eda.gender,\n", - " AVG(sleep_health_eda.sleep_duration) AS average_sleep_duration\n", - "FROM sleep_health_eda\n", - "GROUP BY sleep_health_eda.gender\n", - "ORDER BY average_sleep_duration DESC NULLS LAST;\n", - "\n", - "SELECT sleep_health_eda.gender,\n", - " AVG(sleep_health_eda.sleep_duration) AS average_sleep_duration\n", - "FROM sleep_health_eda\n", - "GROUP BY sleep_health_eda.gender\n", - "ORDER BY average_sleep_duration DESC NULLS LAST;\n", - "\n", - "SELECT sleep_health_eda.gender,\n", - " AVG(sleep_health_eda.sleep_duration) AS average_sleep_duration\n", - "FROM sleep_health_eda\n", - "GROUP BY sleep_health_eda.gender\n", - "ORDER BY average_sleep_duration DESC NULLS LAST;\n", - "\n", - "SELECT sleep_health_eda.gender,\n", - " AVG(sleep_health_eda.sleep_duration) AS average_sleep_duration\n", - "FROM sleep_health_eda\n", - "GROUP BY sleep_health_eda.gender\n", - "ORDER BY average_sleep_duration DESC NULLS LAST;\n", - "\n", - "SELECT sleep_health_eda.gender,\n", - " AVG(sleep_health_eda.sleep_duration) AS average_sleep_duration\n", - "FROM sleep_health_eda\n", - "GROUP BY sleep_health_eda.gender\n", - "ORDER BY average_sleep_duration DESC NULLS LAST;\n", - "\n", - "SELECT sleep_health_eda.gender,\n", - " AVG(sleep_health_eda.sleep_duration) AS average_sleep_duration\n", - "FROM sleep_health_eda\n", - "GROUP BY sleep_health_eda.gender\n", - "ORDER BY average_sleep_duration DESC NULLS LAST;\n", - "\n", - "SELECT sleep_health_eda.gender,\n", - " AVG(sleep_health_eda.sleep_duration) AS average_sleep_duration\n", - "FROM sleep_health_eda\n", - "GROUP BY sleep_health_eda.gender\n", - "ORDER BY average_sleep_duration DESC NULLS LAST;\n", - "\n", - "SELECT sleep_health_eda.gender,\n", - " AVG(sleep_health_eda\n", - "LIMIT 100;\n", - "```\n", - "\n", - "\n", - "\u001b[0m\n", - "\u001b[32m2024-01-27 20:42:01.488\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m608\u001b[0m - \u001b[1mInput query: What is the average sleep duration for each gender?\u001b[0m\n", - "\u001b[32m2024-01-27 20:42:01.489\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m609\u001b[0m - \u001b[1mGenerated response:\n", + " ### System: Act as a SQL Expert\n", + " # For table ['sleep_health_eda'], given an input *Question*, only generate syntactically correct sqlite SQL queries.\n", + " # Let's work it out in a detailed step by step way using the reasoning from *Tasks* section.\n", + " # Pick the SQL query which has the highest average log probability if more than one result is likely to answer the\n", + " candidate *Question*.\n", + " ### sqlite SQL tables\n", + " ### *Data:* \n", + "For table ['sleep_health_eda'] schema info is mentioned below,\n", + "\n", + "{\"Column Name\": \"Person_ID\", \"Column Type\": \"NUMERIC\"}\n", + "{\"Column Name\": \"Gender\", \"Column Type\": \"TEXT\", \"Sample Values\": [\"Male\", \"Female\"]}\n", + "{\"Column Name\": \"Age\", \"Column Type\": \"NUMERIC\"}\n", + "{\"Column Name\": \"Occupation\", \"Column Type\": \"TEXT\", \"Sample Values\": [\"Nurse\", \"Salesperson\", \"Manager\", \"Engineer\", \"Software Engineer\", \"Sales Representative\", \"Scientist\", \"Doctor\", \"Teacher\", \"Accountant\"]}\n", + "{\"Column Name\": \"Sleep_Duration\", \"Column Type\": \"NUMERIC\"}\n", + "{\"Column Name\": \"Quality_of_Sleep\", \"Column Type\": \"NUMERIC\"}\n", + "{\"Column Name\": \"Physical_Activity_Level\", \"Column Type\": \"NUMERIC\"}\n", + "{\"Column Name\": \"Stress_Level\", \"Column Type\": \"NUMERIC\"}\n", + "{\"Column Name\": \"BMI_Category\", \"Column Type\": \"TEXT\", \"Sample Values\": [\"Overweight\", \"Normal\", \"Obese\", \"Normal Weight\"]}\n", + "{\"Column Name\": \"Blood_Pressure\", \"Column Type\": \"TEXT\"}\n", + "{\"Column Name\": \"Heart_Rate\", \"Column Type\": \"NUMERIC\"}\n", + "{\"Column Name\": \"Daily_Steps\", \"Column Type\": \"NUMERIC\"}\n", + "{\"Column Name\": \"Sleep_Disorder\", \"Column Type\": \"TEXT\", \"Sample Values\": [\"None\", \"Sleep Apnea\", \"Insomnia\"]}\n", + " ### *History*:\n", + "\n", + " ### *Question*: For table ['sleep_health_eda'], What is the average sleep duration for each gender?\n", + " # SELECT 1\n", + " ### *Plan for table ['sleep_health_eda']*:\n", + "# 1. Identify the relevant columns in the question. In this case, we are interested in 'Gender' and 'Sleep_Duration'.\n", + "# 2. Aggregate the data by 'Gender' to split the data into different groups according to gender.\n", + "# 3. For each 'Gender' group, calculate the average 'Sleep_Duration'.\n", + "# 4. Return the 'Gender' along with the corresponding average 'Sleep_Duration' for each gender.\n", + " ### *Policies for SQL generation*:\n", + " # Avoid overly complex SQL queries, favor concise human readable SQL queries which are easy to understand and debug\n", + " # Avoid patterns that might be vulnerable to SQL injection\n", + " # Use values and column names that are explicitly mentioned in the question or in the *Data* section.\n", + " # DO NOT query for columns that do not exist\n", + " # Validate column names with the table name when needed\n", + " # DO NOT USE aggregate and window function together\n", + " # Use COUNT(1) instead of COUNT(*)\n", + " # Return with LIMIT 100\n", + " # Prefer NOT EXISTS to LEFT JOIN ON null id\n", + " # Avoid using the WITH statement\n", + " # When using DESC keep NULLs at the end\n", + " # Always cast the numerator as float when computing ratios\n", + " # If JSONB format found in Table schema, do pattern matching on keywords from the question and use SQL functions such as ->> or ->\n", + " # Use prepared statements with parameterized queries to prevent SQL injection\n", + " # Add explanation and reasoning for each SQL query\n", + " \u001b[0m\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING:llama_index.service_context:chunk_size_limit is deprecated, please specify chunk_size instead\n", + "chunk_size_limit is deprecated, please specify chunk_size instead\n", + "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"\n", + "HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"\n", + "HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", + "HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", + "INFO:llama_index.indices.struct_store.sql_query:> Table desc str: Schema of table sleep_health_eda:\n", + "Table 'sleep_health_eda' has columns: Person_ID (NUMERIC), Gender (TEXT), Age (NUMERIC), Occupation (TEXT), Sleep_Duration (NUMERIC), Quality_of_Sleep (NUMERIC), Physical_Activity_Level (NUMERIC), Stress_Level (NUMERIC), BMI_Category (TEXT), Blood_Pressure (TEXT), Heart_Rate (NUMERIC), Daily_Steps (NUMERIC), Sleep_Disorder (TEXT), and foreign keys: .\n", + "\n", + "> Table desc str: Schema of table sleep_health_eda:\n", + "Table 'sleep_health_eda' has columns: Person_ID (NUMERIC), Gender (TEXT), Age (NUMERIC), Occupation (TEXT), Sleep_Duration (NUMERIC), Quality_of_Sleep (NUMERIC), Physical_Activity_Level (NUMERIC), Stress_Level (NUMERIC), BMI_Category (TEXT), Blood_Pressure (TEXT), Heart_Rate (NUMERIC), Daily_Steps (NUMERIC), Sleep_Disorder (TEXT), and foreign keys: .\n", + "\n", + "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", + "HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", + "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", + "HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[32m2024-06-12 16:27:23.838\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m612\u001b[0m - \u001b[1mInput query: What is the average sleep duration for each gender?\u001b[0m\n", + "\u001b[32m2024-06-12 16:27:23.838\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m613\u001b[0m - \u001b[1mGenerated response:\n", "\n", - "SELECT \"gender\", AVG(\"sleep_duration\") AS \"average_sleep_duration\" FROM \"sleep_health_eda\" GROUP BY \"gender\" ORDER BY \"average_sleep_duration\" DESC LIMIT 100\u001b[0m\n", - "\u001b[32m2024-01-27 20:42:01.492\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m635\u001b[0m - \u001b[1mAlternate responses:\n", + "SELECT \"Gender\", AVG(\"Sleep_Duration\") AS \"Avg_Sleep_Duration\" FROM \"sleep_health_eda\" GROUP BY \"Gender\" ORDER BY \"Avg_Sleep_Duration\" DESC\u001b[0m\n", + "\u001b[32m2024-06-12 16:27:23.840\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m639\u001b[0m - \u001b[1mAlternate responses:\n", "\n", - "['Option 1: (_probability_: 0.381034255027771)\\n``` sql\\nSELECT gender,\\n AVG(sleep_duration) AS average_sleep_duration\\nFROM sleep_health_eda\\nGROUP BY gender\\nORDER BY average_sleep_duration DESC NULLS LAST\\nLIMIT 100;\\n```\\n\\n\\n', 'Option 2: (_probability_: 0.2624567449092865)\\n``` sql\\nSELECT AVG(sleep_duration) AS average_sleep_duration,\\n gender\\nFROM sleep_health_eda\\nGROUP BY gender\\nLIMIT 100;\\n```\\n\\n\\n', 'Option 3: (_probability_: 0.22498156130313873)\\n``` sql\\nSELECT Gender,\\n AVG(Sleep_Duration) AS average_duration\\nFROM sleep_health_eda\\nGROUP BY Gender\\nORDER BY average_duration DESC NULLS LAST\\nLIMIT 100;\\n```\\n\\n\\n', \"Option 4: (_probability_: 0.13085876405239105)\\n``` sql\\nSELECT 'Gender',\\n AVG('Sleep_Duration') AS average_sleep_duration\\nFROM'sleep_health_eda'\\nGROUP BY 'Gender'\\nORDER BY average_sleep_duration DESC NULLS LAST\\nLIMIT 100;\\n```\\n\\n\\n\", 'Option 5: (_probability_: 0.0006686743581667542)\\n``` sql\\nSELECT sleep_health_eda.gender,\\n AVG(sleep_health_eda.sleep_duration) AS average_sleep_duration\\nFROM sleep_health_eda\\nGROUP BY sleep_health_eda.gender\\nORDER BY average_sleep_duration DESC NULLS LAST;\\n\\nSELECT sleep_health_eda.gender,\\n AVG(sleep_health_eda.sleep_duration) AS average_sleep_duration\\nFROM sleep_health_eda\\nGROUP BY sleep_health_eda.gender\\nORDER BY average_sleep_duration DESC NULLS LAST;\\n\\nSELECT sleep_health_eda.gender,\\n AVG(sleep_health_eda.sleep_duration) AS average_sleep_duration\\nFROM sleep_health_eda\\nGROUP BY sleep_health_eda.gender\\nORDER BY average_sleep_duration DESC NULLS LAST;\\n\\nSELECT sleep_health_eda.gender,\\n AVG(sleep_health_eda.sleep_duration) AS average_sleep_duration\\nFROM sleep_health_eda\\nGROUP BY sleep_health_eda.gender\\nORDER BY average_sleep_duration DESC NULLS LAST;\\n\\nSELECT sleep_health_eda.gender,\\n AVG(sleep_health_eda.sleep_duration) AS average_sleep_duration\\nFROM sleep_health_eda\\nGROUP BY sleep_health_eda.gender\\nORDER BY average_sleep_duration DESC NULLS LAST;\\n\\nSELECT sleep_health_eda.gender,\\n AVG(sleep_health_eda.sleep_duration) AS average_sleep_duration\\nFROM sleep_health_eda\\nGROUP BY sleep_health_eda.gender\\nORDER BY average_sleep_duration DESC NULLS LAST;\\n\\nSELECT sleep_health_eda.gender,\\n AVG(sleep_health_eda.sleep_duration) AS average_sleep_duration\\nFROM sleep_health_eda\\nGROUP BY sleep_health_eda.gender\\nORDER BY average_sleep_duration DESC NULLS LAST;\\n\\nSELECT sleep_health_eda.gender,\\n AVG(sleep_health_eda.sleep_duration) AS average_sleep_duration\\nFROM sleep_health_eda\\nGROUP BY sleep_health_eda.gender\\nORDER BY average_sleep_duration DESC NULLS LAST;\\n\\nSELECT sleep_health_eda.gender,\\n AVG(sleep_health_eda.sleep_duration) AS average_sleep_duration\\nFROM sleep_health_eda\\nGROUP BY sleep_health_eda.gender\\nORDER BY average_sleep_duration DESC NULLS LAST;\\n\\nSELECT sleep_health_eda.gender,\\n AVG(sleep_health_eda\\nLIMIT 100;\\n```\\n\\n\\n']\u001b[0m\n" + "[]\u001b[0m\n" ] }, { @@ -973,13 +689,13 @@ ], "source": [ "# Alternate options\n", - "res = query(\"What is the average sleep duration for each gender?\", table_name=\"sleep_health_eda\", \n", + "res = query(\"What is the average sleep duration for each gender?\", table_name=\"sleep_health_eda\",\n", " table_info_path=table_info_path, sample_qna_path=None, regenerate_with_options=True)" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "id": "e415c0b9-466e-4417-ac1e-493914a83c36", "metadata": {}, "outputs": [ @@ -987,120 +703,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "Question = **Generated response for question,**\n", - "What is the average sleep duration for each gender?\n", - "\n", - "----Options----\n", - "Option 1: (_probability_: 0.381034255027771)\n", - "``` sql\n", - "SELECT gender,\n", - " AVG(sleep_duration) AS average_sleep_duration\n", - "FROM sleep_health_eda\n", - "GROUP BY gender\n", - "ORDER BY average_sleep_duration DESC NULLS LAST\n", - "LIMIT 100;\n", - "```\n", - "\n", - "\n", - "\n", - "Option 2: (_probability_: 0.2624567449092865)\n", - "``` sql\n", - "SELECT AVG(sleep_duration) AS average_sleep_duration,\n", - " gender\n", - "FROM sleep_health_eda\n", - "GROUP BY gender\n", - "LIMIT 100;\n", - "```\n", - "\n", - "\n", - "\n", - "Option 3: (_probability_: 0.22498156130313873)\n", - "``` sql\n", - "SELECT Gender,\n", - " AVG(Sleep_Duration) AS average_duration\n", - "FROM sleep_health_eda\n", - "GROUP BY Gender\n", - "ORDER BY average_duration DESC NULLS LAST\n", - "LIMIT 100;\n", - "```\n", - "\n", - "\n", - "\n", - "Option 4: (_probability_: 0.13085876405239105)\n", - "``` sql\n", - "SELECT 'Gender',\n", - " AVG('Sleep_Duration') AS average_sleep_duration\n", - "FROM'sleep_health_eda'\n", - "GROUP BY 'Gender'\n", - "ORDER BY average_sleep_duration DESC NULLS LAST\n", - "LIMIT 100;\n", - "```\n", - "\n", - "\n", - "\n", - "Option 5: (_probability_: 0.0006686743581667542)\n", - "``` sql\n", - "SELECT sleep_health_eda.gender,\n", - " AVG(sleep_health_eda.sleep_duration) AS average_sleep_duration\n", - "FROM sleep_health_eda\n", - "GROUP BY sleep_health_eda.gender\n", - "ORDER BY average_sleep_duration DESC NULLS LAST;\n", - "\n", - "SELECT sleep_health_eda.gender,\n", - " AVG(sleep_health_eda.sleep_duration) AS average_sleep_duration\n", - "FROM sleep_health_eda\n", - "GROUP BY sleep_health_eda.gender\n", - "ORDER BY average_sleep_duration DESC NULLS LAST;\n", - "\n", - "SELECT sleep_health_eda.gender,\n", - " AVG(sleep_health_eda.sleep_duration) AS average_sleep_duration\n", - "FROM sleep_health_eda\n", - "GROUP BY sleep_health_eda.gender\n", - "ORDER BY average_sleep_duration DESC NULLS LAST;\n", - "\n", - "SELECT sleep_health_eda.gender,\n", - " AVG(sleep_health_eda.sleep_duration) AS average_sleep_duration\n", - "FROM sleep_health_eda\n", - "GROUP BY sleep_health_eda.gender\n", - "ORDER BY average_sleep_duration DESC NULLS LAST;\n", - "\n", - "SELECT sleep_health_eda.gender,\n", - " AVG(sleep_health_eda.sleep_duration) AS average_sleep_duration\n", - "FROM sleep_health_eda\n", - "GROUP BY sleep_health_eda.gender\n", - "ORDER BY average_sleep_duration DESC NULLS LAST;\n", - "\n", - "SELECT sleep_health_eda.gender,\n", - " AVG(sleep_health_eda.sleep_duration) AS average_sleep_duration\n", - "FROM sleep_health_eda\n", - "GROUP BY sleep_health_eda.gender\n", - "ORDER BY average_sleep_duration DESC NULLS LAST;\n", - "\n", - "SELECT sleep_health_eda.gender,\n", - " AVG(sleep_health_eda.sleep_duration) AS average_sleep_duration\n", - "FROM sleep_health_eda\n", - "GROUP BY sleep_health_eda.gender\n", - "ORDER BY average_sleep_duration DESC NULLS LAST;\n", - "\n", - "SELECT sleep_health_eda.gender,\n", - " AVG(sleep_health_eda.sleep_duration) AS average_sleep_duration\n", - "FROM sleep_health_eda\n", - "GROUP BY sleep_health_eda.gender\n", - "ORDER BY average_sleep_duration DESC NULLS LAST;\n", - "\n", - "SELECT sleep_health_eda.gender,\n", - " AVG(sleep_health_eda.sleep_duration) AS average_sleep_duration\n", - "FROM sleep_health_eda\n", - "GROUP BY sleep_health_eda.gender\n", - "ORDER BY average_sleep_duration DESC NULLS LAST;\n", - "\n", - "SELECT sleep_health_eda.gender,\n", - " AVG(sleep_health_eda\n", - "LIMIT 100;\n", - "```\n", - "\n", + "Question = I am thinking step by step: \n", "\n", - "\n" + "----Options----\n" ] } ], @@ -1136,7 +741,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.16" + "version": "3.10.0" } }, "nbformat": 4, diff --git a/sidekick/query.py b/sidekick/query.py index 94974ce..379ddef 100644 --- a/sidekick/query.py +++ b/sidekick/query.py @@ -223,6 +223,8 @@ def _query_tasks(self, question_str, data_info, sample_queries, table_name: list MODEL_CHOICE_MAP = MODEL_CHOICE_MAP_EVAL_MODE m_name = MODEL_CHOICE_MAP.get(self.model_name) + if m_name is None: + raise ValueError(f"Invalid model name {self.model_name}. Available models: {MODEL_CHOICE_MAP.keys()}") completion = self.openai_client.chat.completions.create( model=m_name, @@ -233,6 +235,8 @@ def _query_tasks(self, question_str, data_info, sample_queries, table_name: list ) res = completion.choices[0].message.content return res + except ValueError as ve: + raise ve except Exception as se: _, ex_value, _ = sys.exc_info() res = ex_value.statement if ex_value.statement else None @@ -531,6 +535,7 @@ def generate_sql( else context_queries ) logger.info(f"Number of possible contextual queries to question: {len(filtered_context)}") + logger.info(f"HEEEEEEEEERE22: {filtered_context}") # If QnA pairs > 5, we keep top 5 for focused context # Most relevant match is closest to the generation post re-ranking _samples = filtered_context diff --git a/sidekick/utils.py b/sidekick/utils.py index 148133b..ecf205c 100644 --- a/sidekick/utils.py +++ b/sidekick/utils.py @@ -39,8 +39,8 @@ "h2ogpt-sql-nsql-llama-2-7B": "NumbersStation/nsql-llama-2-7B", "gpt-3.5-turbo": "gpt-3.5-turbo-1106", "gpt-4-8k": "gpt-4", - "gpt-4-1106-preview-128k": "gpt-4-1106-preview" - + "gpt-4-1106-preview-128k": "gpt-4-1106-preview", + "gpt-4": "gpt-4", } MODEL_CHOICE_MAP_DEFAULT = { From 97e6405ad7b021b6a816d3c94a7cf449d61e2535 Mon Sep 17 00:00:00 2001 From: Ruben Alvarez Date: Wed, 12 Jun 2024 16:59:38 -0700 Subject: [PATCH 2/3] removed unnecessary log msg --- sidekick/query.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sidekick/query.py b/sidekick/query.py index 379ddef..9eace8f 100644 --- a/sidekick/query.py +++ b/sidekick/query.py @@ -535,7 +535,6 @@ def generate_sql( else context_queries ) logger.info(f"Number of possible contextual queries to question: {len(filtered_context)}") - logger.info(f"HEEEEEEEEERE22: {filtered_context}") # If QnA pairs > 5, we keep top 5 for focused context # Most relevant match is closest to the generation post re-ranking _samples = filtered_context From 1d8ca4a70ac19cf2e2c2ce7089455b604e52d2c5 Mon Sep 17 00:00:00 2001 From: Ruben Alvarez Date: Thu, 13 Jun 2024 17:55:57 -0700 Subject: [PATCH 3/3] added support for openai models; default behavior is now via openai's; added loggers to improve debugging process --- .gitignore | 1 + examples/notebooks/sdk_quick_tutorial.ipynb | 513 +------------------- sidekick/query.py | 9 +- sidekick/utils.py | 33 +- 4 files changed, 52 insertions(+), 504 deletions(-) diff --git a/.gitignore b/.gitignore index ca58397..01a1c90 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ examples/demo .sidekickvenv models/ db/ +sdk_quick_tutorial.ipynb # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/examples/notebooks/sdk_quick_tutorial.ipynb b/examples/notebooks/sdk_quick_tutorial.ipynb index 2f0f713..0fb7fbe 100644 --- a/examples/notebooks/sdk_quick_tutorial.ipynb +++ b/examples/notebooks/sdk_quick_tutorial.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "60080b7e-2e80-4154-aa35-87c13b6ab371", "metadata": {}, "outputs": [], @@ -13,7 +13,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "f480e37a-4327-48da-8c84-aba0ac1eef23", "metadata": {}, "outputs": [], @@ -71,11 +71,13 @@ "import os\n", "\n", "os.environ['OPENAI_API_KEY'] = \"\"\n", - "os.environ['H2OGPT_URL'] = 'http://38.128.233.247'\n", + "os.environ['H2OGPT_URL'] = \"\"\n", "os.environ['H2OGPT_API_TOKEN'] = \"\"\n", "# To get access to h2ogpte endpoint, reach out to cloud-feedback@h2o.ai\n", - "os.environ['H2OGPTE_URL'] = \"https://h2ogpte.genai.h2o.ai\" # e.g. https://<>.h2ogpte.h2o.ai\n", - "os.environ['H2OGPTE_API_TOKEN'] = \"\"" + "os.environ['H2OGPTE_URL'] = \"\" # e.g. https://<>.h2ogpte.h2o.ai\n", + "os.environ['H2OGPTE_API_TOKEN'] = \"\"\n", + "\n", + "os.environ['H2OGPT_BASE_URL'] = \"\"" ] }, { @@ -83,64 +85,7 @@ "execution_count": null, "id": "62e23b39-caa8-4e2f-bf12-678dd586f0df", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Information supplied:\n", - " querydb, localhost, sqlite, abc, 5432\n", - "Database already exists!\n", - "Table name: sleep_health_eda\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[32m2024-06-12 16:44:00.837\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.db_config\u001b[0m:\u001b[36m_extract_schema_info\u001b[0m:\u001b[36m162\u001b[0m - \u001b[34m\u001b[1mUsing schema information from: .//var/lib/tmp/sleep_health_eda_table_info.jsonl\u001b[0m\n", - "\u001b[32m2024-06-12 16:44:00.838\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.db_config\u001b[0m:\u001b[36mcreate_table\u001b[0m:\u001b[36m186\u001b[0m - \u001b[34m\u001b[1mSchema info used for creating table:\n", - " Person_ID NUMERIC,\n", - "Gender TEXT COLLATE NOCASE,\n", - "Age NUMERIC,\n", - "Occupation TEXT COLLATE NOCASE,\n", - "Sleep_Duration NUMERIC,\n", - "Quality_of_Sleep NUMERIC,\n", - "Physical_Activity_Level NUMERIC,\n", - "Stress_Level NUMERIC,\n", - "BMI_Category TEXT COLLATE NOCASE,\n", - "Blood_Pressure TEXT COLLATE NOCASE,\n", - "Heart_Rate NUMERIC,\n", - "Daily_Steps NUMERIC,\n", - "Sleep_Disorder TEXT COLLATE NOCASE\u001b[0m\n", - "\u001b[32m2024-06-12 16:44:00.839\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.db_config\u001b[0m:\u001b[36mcreate_table\u001b[0m:\u001b[36m198\u001b[0m - \u001b[1mTable created: sleep_health_eda\u001b[0m\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Checked table sleep_health_eda exists in the DB.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[32m2024-06-12 16:44:00.841\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.db_config\u001b[0m:\u001b[36madd_samples\u001b[0m:\u001b[36m222\u001b[0m - \u001b[34m\u001b[1mAdding sample values to table: examples/demo/demo_data.csv\u001b[0m\n", - "\u001b[32m2024-06-12 16:44:00.843\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.db_config\u001b[0m:\u001b[36madd_samples\u001b[0m:\u001b[36m228\u001b[0m - \u001b[34m\u001b[1mInserting chunk: 0\u001b[0m\n", - "\u001b[32m2024-06-12 16:44:01.024\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.db_config\u001b[0m:\u001b[36madd_samples\u001b[0m:\u001b[36m233\u001b[0m - \u001b[1mData inserted into table: sleep_health_eda\u001b[0m\n", - "\u001b[32m2024-06-12 16:44:01.025\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.db_config\u001b[0m:\u001b[36madd_samples\u001b[0m:\u001b[36m238\u001b[0m - \u001b[1mNumber of rows inserted: 4114\u001b[0m\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Created a Database querydb. Inserted sample values from examples/demo/demo_data.csv into table sleep_health_eda, please ask questions!\n" - ] - } - ], + "outputs": [], "source": [ "HOST_NAME = \"localhost\"\n", "USER_NAME = \"sqlite\"\n", @@ -174,28 +119,7 @@ "execution_count": null, "id": "80dec22c-362e-41a0-8f34-0690465542e6", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['h2ogpt-sql-sqlcoder2-4bit',\n", - " 'h2ogpt-sql-sqlcoder-34b-alpha-4bit',\n", - " 'h2ogpt-sql-nsql-llama-2-7B-4bit',\n", - " 'h2ogpt-sql-sqlcoder2',\n", - " 'h2ogpt-sql-sqlcoder-7b-2',\n", - " 'h2ogpt-sql-sqlcoder-34b-alpha',\n", - " 'h2ogpt-sql-nsql-llama-2-7B',\n", - " 'gpt-3.5-turbo',\n", - " 'gpt-4-8k',\n", - " 'gpt-4-1106-preview-128k',\n", - " 'gpt-4']" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# List supported models\n", "list_models()" @@ -241,64 +165,7 @@ "execution_count": null, "id": "523f1a88-eea8-414c-89b1-b7a2b3126535", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[32m2024-06-12 16:44:01.044\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m504\u001b[0m - \u001b[1mTable in use: ['sleep_health_eda']\u001b[0m\n", - "\u001b[32m2024-06-12 16:44:01.045\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m505\u001b[0m - \u001b[1mSQL dialect for generation: sqlite\u001b[0m\n", - "\u001b[32m2024-06-12 16:44:01.045\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m534\u001b[0m - \u001b[1mOpenAI key found.\u001b[0m\n", - "\u001b[32m2024-06-12 16:44:01.046\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m538\u001b[0m - \u001b[1mSetting context...\u001b[0m\n", - "\u001b[32m2024-06-12 16:44:01.046\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m539\u001b[0m - \u001b[1mQuestion: What is the average sleep duration for each gender?\u001b[0m\n", - "\u001b[32m2024-06-12 16:44:01.047\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m557\u001b[0m - \u001b[34m\u001b[1mTable info path: .//var/lib/tmp/sleep_health_eda_table_info.jsonl\u001b[0m\n", - "\u001b[32m2024-06-12 16:44:01.048\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36mis_resource_low\u001b[0m:\u001b[36m358\u001b[0m - \u001b[1mNumber of GPUs: 1\u001b[0m\n", - "\u001b[32m2024-06-12 16:44:01.048\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36mis_resource_low\u001b[0m:\u001b[36m362\u001b[0m - \u001b[34m\u001b[1mInformation on device: 0\u001b[0m\n", - "\u001b[32m2024-06-12 16:44:01.744\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36mis_resource_low\u001b[0m:\u001b[36m365\u001b[0m - \u001b[1mTotal Memory: 7GB\u001b[0m\n", - "\u001b[32m2024-06-12 16:44:01.745\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36mis_resource_low\u001b[0m:\u001b[36m366\u001b[0m - \u001b[1mFree GPU memory: 6GB\u001b[0m\n", - "\u001b[32m2024-06-12 16:44:01.745\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.query\u001b[0m:\u001b[36m__new__\u001b[0m:\u001b[36m83\u001b[0m - \u001b[1mLoading local model: gpt-4o\u001b[0m\n", - "\u001b[32m2024-06-12 16:44:01.746\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36mload_causal_lm_model\u001b[0m:\u001b[36m385\u001b[0m - \u001b[1mTotal GPUs: 1\u001b[0m\n", - "\u001b[32m2024-06-12 16:44:01.746\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36mload_causal_lm_model\u001b[0m:\u001b[36m466\u001b[0m - \u001b[1mAn error occurred while loading the model: 'gpt-4o'\u001b[0m\n", - "\u001b[32m2024-06-12 16:44:01.747\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36mload_embedding_model\u001b[0m:\u001b[36m103\u001b[0m - \u001b[34m\u001b[1mLoading embedding model from: .//models/sentence_transformers\u001b[0m\n", - "Fetching 19 files: 100%|██████████| 19/19 [00:00<00:00, 225755.74it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: .//models/sentence_transformers/models--BAAI--bge-base-en/snapshots/b737bf5dcc6ee8bdc530531266b4804a5d77b5d8/\n", - "Load pretrained SentenceTransformer: .//models/sentence_transformers/models--BAAI--bge-base-en/snapshots/b737bf5dcc6ee8bdc530531266b4804a5d77b5d8/\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "\u001b[32m2024-06-12 16:44:02.338\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36m_check_file_info\u001b[0m:\u001b[36m472\u001b[0m - \u001b[1mUsing information info from path .//var/lib/tmp/sleep_health_eda_table_info.jsonl\u001b[0m\n", - "\u001b[32m2024-06-12 16:44:02.343\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.query\u001b[0m:\u001b[36mgenerate_tasks\u001b[0m:\u001b[36m354\u001b[0m - \u001b[1mNumber of context queries found: 0\u001b[0m\n", - "\u001b[32m2024-06-12 16:44:02.343\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.query\u001b[0m:\u001b[36mgenerate_tasks\u001b[0m:\u001b[36m373\u001b[0m - \u001b[1mNumber of possible contextual queries to question: 0\u001b[0m\n" - ] - }, - { - "ename": "ValueError", - "evalue": "Invalid model name gpt-4o. Available models: dict_keys(['h2ogpt-sql-sqlcoder2-4bit', 'h2ogpt-sql-sqlcoder-34b-alpha-4bit', 'h2ogpt-sql-nsql-llama-2-7B-4bit', 'h2ogpt-sql-sqlcoder2', 'h2ogpt-sql-sqlcoder-7b-2', 'h2ogpt-sql-sqlcoder-34b-alpha', 'h2ogpt-sql-nsql-llama-2-7B', 'gpt-3.5-turbo', 'gpt-4-8k', 'gpt-4-1106-preview-128k', 'gpt-4'])", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[10], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[43mquery\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mWhat is the average sleep duration for each gender?\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtable_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43msleep_health_eda\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2\u001b[0m \u001b[43m \u001b[49m\u001b[43mtable_info_path\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtable_info_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msample_qna_path\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\n", - "Cell \u001b[0;32mIn[9], line 14\u001b[0m, in \u001b[0;36mquery\u001b[0;34m(question, table_name, table_info_path, sample_qna_path, regenerate, regenerate_with_options)\u001b[0m\n\u001b[1;32m 12\u001b[0m base_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m./\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;66;03m# self_correction is enabled by default, set to False if not needed.\u001b[39;00m\n\u001b[0;32m---> 14\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[43mask\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 15\u001b[0m \u001b[43m \u001b[49m\u001b[43mquestion\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mquestion\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 16\u001b[0m \u001b[43m \u001b[49m\u001b[43mtable_info_path\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtable_info_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 17\u001b[0m \u001b[43m \u001b[49m\u001b[43msample_queries_path\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msample_qna_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 18\u001b[0m \u001b[43m \u001b[49m\u001b[43mtable_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtable_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 19\u001b[0m \u001b[43m \u001b[49m\u001b[43mis_command\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 20\u001b[0m \u001b[43m \u001b[49m\u001b[43mmodel_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mgpt-4o\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m#Other default model option: h2ogpt-sql-sqlcoder-34b-alpha\u001b[39;49;00m\n\u001b[1;32m 21\u001b[0m \u001b[43m \u001b[49m\u001b[43mis_regenerate\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mregenerate\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 22\u001b[0m \u001b[43m \u001b[49m\u001b[43mis_regen_with_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mregenerate_with_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 23\u001b[0m \u001b[43m \u001b[49m\u001b[43mexecute_query\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 24\u001b[0m \u001b[43m \u001b[49m\u001b[43mlocal_base_path\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbase_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 25\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 26\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m res\n", - "File \u001b[0;32m~/projects/code/sql-sidekick/sidekick/prompter.py:578\u001b[0m, in \u001b[0;36mask\u001b[0;34m(question, table_info_path, sample_queries_path, table_name, model_name, db_dialect, execute_db_dialect, is_regenerate, is_regen_with_options, is_command, debug_mode, execute_query, guardrails, self_correction, local_base_path)\u001b[0m\n\u001b[1;32m 564\u001b[0m sql_g \u001b[38;5;241m=\u001b[39m SQLGenerator(\n\u001b[1;32m 565\u001b[0m db_url\u001b[38;5;241m=\u001b[39mdb_url,\n\u001b[1;32m 566\u001b[0m openai_key\u001b[38;5;241m=\u001b[39mapi_key,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 575\u001b[0m remote_model\u001b[38;5;241m=\u001b[39m_remote_model\n\u001b[1;32m 576\u001b[0m )\n\u001b[1;32m 577\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m model_name \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mh2ogpt-sql\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m model_name \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m _execute_sql(question):\n\u001b[0;32m--> 578\u001b[0m sql_g\u001b[38;5;241m.\u001b[39m_tasks \u001b[38;5;241m=\u001b[39m \u001b[43msql_g\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgenerate_tasks\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtable_names\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mquestion\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 579\u001b[0m results\u001b[38;5;241m.\u001b[39mextend([\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mI am thinking step by step: \u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m, sql_g\u001b[38;5;241m.\u001b[39m_tasks, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m])\n\u001b[1;32m 580\u001b[0m click\u001b[38;5;241m.\u001b[39mecho(sql_g\u001b[38;5;241m.\u001b[39m_tasks)\n", - "File \u001b[0;32m~/projects/code/sql-sidekick/sidekick/query.py:391\u001b[0m, in \u001b[0;36mSQLGenerator.generate_tasks\u001b[0;34m(self, table_names, input_question)\u001b[0m\n\u001b[1;32m 389\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m task_list\n\u001b[1;32m 390\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m se:\n\u001b[0;32m--> 391\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m se\n", - "File \u001b[0;32m~/projects/code/sql-sidekick/sidekick/query.py:386\u001b[0m, in \u001b[0;36mSQLGenerator.generate_tasks\u001b[0;34m(self, table_names, input_question)\u001b[0m\n\u001b[1;32m 384\u001b[0m data_info \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m+\u001b[39m json\u001b[38;5;241m.\u001b[39mdumps(data)\n\u001b[1;32m 385\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_data_info \u001b[38;5;241m=\u001b[39m data_info\n\u001b[0;32m--> 386\u001b[0m task_list \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_query_tasks\u001b[49m\u001b[43m(\u001b[49m\u001b[43minput_question\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdata_info\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m_queries\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtable_names\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 387\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mopen\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/var/lib/tmp/data/tasks.txt\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mw\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[1;32m 388\u001b[0m f\u001b[38;5;241m.\u001b[39mwrite(task_list)\n", - "File \u001b[0;32m~/projects/code/sql-sidekick/sidekick/query.py:239\u001b[0m, in \u001b[0;36mSQLGenerator._query_tasks\u001b[0;34m(self, question_str, data_info, sample_queries, table_name)\u001b[0m\n\u001b[1;32m 237\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m res\n\u001b[1;32m 238\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m ve:\n\u001b[0;32m--> 239\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m ve\n\u001b[1;32m 240\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m se:\n\u001b[1;32m 241\u001b[0m _, ex_value, _ \u001b[38;5;241m=\u001b[39m sys\u001b[38;5;241m.\u001b[39mexc_info()\n", - "File \u001b[0;32m~/projects/code/sql-sidekick/sidekick/query.py:227\u001b[0m, in \u001b[0;36mSQLGenerator._query_tasks\u001b[0;34m(self, question_str, data_info, sample_queries, table_name)\u001b[0m\n\u001b[1;32m 225\u001b[0m m_name \u001b[38;5;241m=\u001b[39m MODEL_CHOICE_MAP\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel_name)\n\u001b[1;32m 226\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m m_name \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 227\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mInvalid model name \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m. Available models: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mMODEL_CHOICE_MAP\u001b[38;5;241m.\u001b[39mkeys()\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 229\u001b[0m completion \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mopenai_client\u001b[38;5;241m.\u001b[39mchat\u001b[38;5;241m.\u001b[39mcompletions\u001b[38;5;241m.\u001b[39mcreate(\n\u001b[1;32m 230\u001b[0m model\u001b[38;5;241m=\u001b[39mm_name,\n\u001b[1;32m 231\u001b[0m messages\u001b[38;5;241m=\u001b[39mquery_txt,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 234\u001b[0m temperature\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.7\u001b[39m\n\u001b[1;32m 235\u001b[0m )\n\u001b[1;32m 236\u001b[0m res \u001b[38;5;241m=\u001b[39m completion\u001b[38;5;241m.\u001b[39mchoices[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;241m.\u001b[39mmessage\u001b[38;5;241m.\u001b[39mcontent\n", - "\u001b[0;31mValueError\u001b[0m: Invalid model name gpt-4o. Available models: dict_keys(['h2ogpt-sql-sqlcoder2-4bit', 'h2ogpt-sql-sqlcoder-34b-alpha-4bit', 'h2ogpt-sql-nsql-llama-2-7B-4bit', 'h2ogpt-sql-sqlcoder2', 'h2ogpt-sql-sqlcoder-7b-2', 'h2ogpt-sql-sqlcoder-34b-alpha', 'h2ogpt-sql-nsql-llama-2-7B', 'gpt-3.5-turbo', 'gpt-4-8k', 'gpt-4-1106-preview-128k', 'gpt-4'])" - ] - } - ], + "outputs": [], "source": [ "res = query(\"What is the average sleep duration for each gender?\", table_name=\"sleep_health_eda\",\n", " table_info_path=table_info_path, sample_qna_path=None)" @@ -309,27 +176,7 @@ "execution_count": null, "id": "b17e2b4f-8736-4d44-addc-db8d2be4ce51", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Question = Something went wrong while generating response. Please check the supplied API Keys and try again.\n", - "----\n" - ] - }, - { - "ename": "IndexError", - "evalue": "list index out of range", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[11], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mQuestion = \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mres[\u001b[38;5;241m0\u001b[39m][\u001b[38;5;241m0\u001b[39m]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m----\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m----> 3\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mGenerated SQL = \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mres[\u001b[38;5;241m0\u001b[39m][\u001b[38;5;241m1\u001b[39m]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n", - "\u001b[0;31mIndexError\u001b[0m: list index out of range" - ] - } - ], + "outputs": [], "source": [ "print(f\"Question = {res[0][0]}\")\n", "print(\"----\")\n", @@ -341,160 +188,7 @@ "execution_count": null, "id": "03c5dfc0-c6f0-4573-b36d-56dc7bcbe8bc", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[32m2024-06-12 16:27:02.898\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m504\u001b[0m - \u001b[1mTable in use: ['sleep_health_eda']\u001b[0m\n", - "\u001b[32m2024-06-12 16:27:02.899\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m505\u001b[0m - \u001b[1mSQL dialect for generation: sqlite\u001b[0m\n", - "\u001b[32m2024-06-12 16:27:02.899\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m534\u001b[0m - \u001b[1mOpenAI key found.\u001b[0m\n", - "\u001b[32m2024-06-12 16:27:02.900\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m538\u001b[0m - \u001b[1mSetting context...\u001b[0m\n", - "\u001b[32m2024-06-12 16:27:02.900\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m539\u001b[0m - \u001b[1mQuestion: What are the most common occupations among individuals in the dataset?\u001b[0m\n", - "\u001b[32m2024-06-12 16:27:02.901\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m557\u001b[0m - \u001b[34m\u001b[1mTable info path: .//var/lib/tmp/sleep_health_eda_table_info.jsonl\u001b[0m\n", - "\u001b[32m2024-06-12 16:27:02.901\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36mis_resource_low\u001b[0m:\u001b[36m358\u001b[0m - \u001b[1mNumber of GPUs: 1\u001b[0m\n", - "\u001b[32m2024-06-12 16:27:02.902\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36mis_resource_low\u001b[0m:\u001b[36m362\u001b[0m - \u001b[34m\u001b[1mInformation on device: 0\u001b[0m\n", - "\u001b[32m2024-06-12 16:27:02.902\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36mis_resource_low\u001b[0m:\u001b[36m365\u001b[0m - \u001b[1mTotal Memory: 7GB\u001b[0m\n", - "\u001b[32m2024-06-12 16:27:02.903\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36mis_resource_low\u001b[0m:\u001b[36m366\u001b[0m - \u001b[1mFree GPU memory: 6GB\u001b[0m\n", - "\u001b[32m2024-06-12 16:27:02.903\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36mload_embedding_model\u001b[0m:\u001b[36m103\u001b[0m - \u001b[34m\u001b[1mLoading embedding model from: .//models/sentence_transformers\u001b[0m\n", - "Fetching 19 files: 100%|██████████| 19/19 [00:00<00:00, 212511.40it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: .//models/sentence_transformers/models--BAAI--bge-base-en/snapshots/b737bf5dcc6ee8bdc530531266b4804a5d77b5d8/\n", - "Load pretrained SentenceTransformer: .//models/sentence_transformers/models--BAAI--bge-base-en/snapshots/b737bf5dcc6ee8bdc530531266b4804a5d77b5d8/\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "\u001b[32m2024-06-12 16:27:03.359\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36m_check_file_info\u001b[0m:\u001b[36m472\u001b[0m - \u001b[1mUsing information info from path .//var/lib/tmp/sleep_health_eda_table_info.jsonl\u001b[0m\n", - "\u001b[32m2024-06-12 16:27:03.363\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.query\u001b[0m:\u001b[36mgenerate_tasks\u001b[0m:\u001b[36m353\u001b[0m - \u001b[1mNumber of context queries found: 0\u001b[0m\n", - "\u001b[32m2024-06-12 16:27:03.364\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.query\u001b[0m:\u001b[36mgenerate_tasks\u001b[0m:\u001b[36m372\u001b[0m - \u001b[1mNumber of possible contextual queries to question: 0\u001b[0m\n", - "\u001b[32m2024-06-12 16:27:03.365\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.query\u001b[0m:\u001b[36m_query_tasks\u001b[0m:\u001b[36m210\u001b[0m - \u001b[1mCALLING OPENAI API\u001b[0m\n", - "\u001b[32m2024-06-12 16:27:03.365\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.query\u001b[0m:\u001b[36m_query_tasks\u001b[0m:\u001b[36m228\u001b[0m - \u001b[1mCALLING OPENAI API\u001b[0m\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Model Name: ->>>>>>>>>>>>>>> gpt-4\n", - "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", - "HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", - "1. Scan through the sleep_health_eda table and identify the column name associated with occupations. From the Data section, we see that this column is named \"Occupation\".\n", - "2. Calculate the count of each unique occupation present in the \"Occupation\" column. This involves grouping the data by occupation and counting the number of entries for each.\n", - "3. Sort the calculated counts in descending order. This will allow us to see the occupations with the most counts at the top.\n", - "4. Return the occupations and their respective counts from the query. The return type inferred is a list of occupations and their corresponding counts.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[32m2024-06-12 16:27:09.104\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m607\u001b[0m - \u001b[1mComputing user request ...\u001b[0m\n", - "\u001b[32m2024-06-12 16:27:09.105\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.query\u001b[0m:\u001b[36mgenerate_sql\u001b[0m:\u001b[36m427\u001b[0m - \u001b[34m\u001b[1mQuery Text:\n", - " \n", - " ### System: Act as a SQL Expert\n", - " # For table ['sleep_health_eda'], given an input *Question*, only generate syntactically correct sqlite SQL queries.\n", - " # Let's work it out in a detailed step by step way using the reasoning from *Tasks* section.\n", - " # Pick the SQL query which has the highest average log probability if more than one result is likely to answer the\n", - " candidate *Question*.\n", - " ### sqlite SQL tables\n", - " ### *Data:* \n", - "For table ['sleep_health_eda'] schema info is mentioned below,\n", - "\n", - "{\"Column Name\": \"Person_ID\", \"Column Type\": \"NUMERIC\"}\n", - "{\"Column Name\": \"Gender\", \"Column Type\": \"TEXT\", \"Sample Values\": [\"Male\", \"Female\"]}\n", - "{\"Column Name\": \"Age\", \"Column Type\": \"NUMERIC\"}\n", - "{\"Column Name\": \"Occupation\", \"Column Type\": \"TEXT\", \"Sample Values\": [\"Nurse\", \"Salesperson\", \"Manager\", \"Engineer\", \"Software Engineer\", \"Sales Representative\", \"Scientist\", \"Doctor\", \"Teacher\", \"Accountant\"]}\n", - "{\"Column Name\": \"Sleep_Duration\", \"Column Type\": \"NUMERIC\"}\n", - "{\"Column Name\": \"Quality_of_Sleep\", \"Column Type\": \"NUMERIC\"}\n", - "{\"Column Name\": \"Physical_Activity_Level\", \"Column Type\": \"NUMERIC\"}\n", - "{\"Column Name\": \"Stress_Level\", \"Column Type\": \"NUMERIC\"}\n", - "{\"Column Name\": \"BMI_Category\", \"Column Type\": \"TEXT\", \"Sample Values\": [\"Overweight\", \"Normal\", \"Obese\", \"Normal Weight\"]}\n", - "{\"Column Name\": \"Blood_Pressure\", \"Column Type\": \"TEXT\"}\n", - "{\"Column Name\": \"Heart_Rate\", \"Column Type\": \"NUMERIC\"}\n", - "{\"Column Name\": \"Daily_Steps\", \"Column Type\": \"NUMERIC\"}\n", - "{\"Column Name\": \"Sleep_Disorder\", \"Column Type\": \"TEXT\", \"Sample Values\": [\"None\", \"Sleep Apnea\", \"Insomnia\"]}\n", - " ### *History*:\n", - "\n", - " ### *Question*: For table ['sleep_health_eda'], What are the most common occupations among individuals in the dataset?\n", - " # SELECT 1\n", - " ### *Plan for table ['sleep_health_eda']*:\n", - "# 1. Scan through the sleep_health_eda table and identify the column name associated with occupations. From the Data section, we see that this column is named \"Occupation\".\n", - "# 2. Calculate the count of each unique occupation present in the \"Occupation\" column. This involves grouping the data by occupation and counting the number of entries for each.\n", - "# 3. Sort the calculated counts in descending order. This will allow us to see the occupations with the most counts at the top.\n", - "# 4. Return the occupations and their respective counts from the query. The return type inferred is a list of occupations and their corresponding counts.\n", - " ### *Policies for SQL generation*:\n", - " # Avoid overly complex SQL queries, favor concise human readable SQL queries which are easy to understand and debug\n", - " # Avoid patterns that might be vulnerable to SQL injection\n", - " # Use values and column names that are explicitly mentioned in the question or in the *Data* section.\n", - " # DO NOT query for columns that do not exist\n", - " # Validate column names with the table name when needed\n", - " # DO NOT USE aggregate and window function together\n", - " # Use COUNT(1) instead of COUNT(*)\n", - " # Return with LIMIT 100\n", - " # Prefer NOT EXISTS to LEFT JOIN ON null id\n", - " # Avoid using the WITH statement\n", - " # When using DESC keep NULLs at the end\n", - " # Always cast the numerator as float when computing ratios\n", - " # If JSONB format found in Table schema, do pattern matching on keywords from the question and use SQL functions such as ->> or ->\n", - " # Use prepared statements with parameterized queries to prevent SQL injection\n", - " # Add explanation and reasoning for each SQL query\n", - " \u001b[0m\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "WARNING:llama_index.service_context:chunk_size_limit is deprecated, please specify chunk_size instead\n", - "chunk_size_limit is deprecated, please specify chunk_size instead\n", - "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"\n", - "HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"\n", - "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"\n", - "HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"\n", - "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", - "HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", - "INFO:llama_index.indices.struct_store.sql_query:> Table desc str: Schema of table sleep_health_eda:\n", - "Table 'sleep_health_eda' has columns: Person_ID (NUMERIC), Gender (TEXT), Age (NUMERIC), Occupation (TEXT), Sleep_Duration (NUMERIC), Quality_of_Sleep (NUMERIC), Physical_Activity_Level (NUMERIC), Stress_Level (NUMERIC), BMI_Category (TEXT), Blood_Pressure (TEXT), Heart_Rate (NUMERIC), Daily_Steps (NUMERIC), Sleep_Disorder (TEXT), and foreign keys: .\n", - "\n", - "> Table desc str: Schema of table sleep_health_eda:\n", - "Table 'sleep_health_eda' has columns: Person_ID (NUMERIC), Gender (TEXT), Age (NUMERIC), Occupation (TEXT), Sleep_Duration (NUMERIC), Quality_of_Sleep (NUMERIC), Physical_Activity_Level (NUMERIC), Stress_Level (NUMERIC), BMI_Category (TEXT), Blood_Pressure (TEXT), Heart_Rate (NUMERIC), Daily_Steps (NUMERIC), Sleep_Disorder (TEXT), and foreign keys: .\n", - "\n", - "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", - "HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", - "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", - "HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[32m2024-06-12 16:27:16.212\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m612\u001b[0m - \u001b[1mInput query: What are the most common occupations among individuals in the dataset?\u001b[0m\n", - "\u001b[32m2024-06-12 16:27:16.213\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m613\u001b[0m - \u001b[1mGenerated response:\n", - "\n", - "SELECT \"Occupation\", COUNT(1) AS \"Count\" FROM \"sleep_health_eda\" GROUP BY \"Occupation\" ORDER BY \"Count\" DESC LIMIT 100\u001b[0m\n", - "\u001b[32m2024-06-12 16:27:16.214\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m639\u001b[0m - \u001b[1mAlternate responses:\n", - "\n", - "[]\u001b[0m\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Exiting...\n" - ] - } - ], + "outputs": [], "source": [ "# On using re-generation flag we toggle the temperature values between 0 and 1 alternating between low\n", "# (focus/conservative generation and high values (random/creative generation)\n", @@ -507,21 +201,7 @@ "execution_count": null, "id": "cf2fc33d-ea21-4ab2-9019-329f5bc2051d", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Question = I am thinking step by step: \n", - "\n", - "----\n", - "Generated SQL = 1. Scan through the sleep_health_eda table and identify the column name associated with occupations. From the Data section, we see that this column is named \"Occupation\".\n", - "2. Calculate the count of each unique occupation present in the \"Occupation\" column. This involves grouping the data by occupation and counting the number of entries for each.\n", - "3. Sort the calculated counts in descending order. This will allow us to see the occupations with the most counts at the top.\n", - "4. Return the occupations and their respective counts from the query. The return type inferred is a list of occupations and their corresponding counts.\n" - ] - } - ], + "outputs": [], "source": [ "print(f\"Question = {res[0][0]}\")\n", "print(\"----\")\n", @@ -533,160 +213,7 @@ "execution_count": null, "id": "b47bef8d-c991-4581-a7fc-23a056911c3f", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[32m2024-06-12 16:27:16.225\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m504\u001b[0m - \u001b[1mTable in use: ['sleep_health_eda']\u001b[0m\n", - "\u001b[32m2024-06-12 16:27:16.226\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m505\u001b[0m - \u001b[1mSQL dialect for generation: sqlite\u001b[0m\n", - "\u001b[32m2024-06-12 16:27:16.227\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m534\u001b[0m - \u001b[1mOpenAI key found.\u001b[0m\n", - "\u001b[32m2024-06-12 16:27:16.227\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m538\u001b[0m - \u001b[1mSetting context...\u001b[0m\n", - "\u001b[32m2024-06-12 16:27:16.228\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m539\u001b[0m - \u001b[1mQuestion: What is the average sleep duration for each gender?\u001b[0m\n", - "\u001b[32m2024-06-12 16:27:16.228\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m557\u001b[0m - \u001b[34m\u001b[1mTable info path: .//var/lib/tmp/sleep_health_eda_table_info.jsonl\u001b[0m\n", - "\u001b[32m2024-06-12 16:27:16.229\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36mis_resource_low\u001b[0m:\u001b[36m358\u001b[0m - \u001b[1mNumber of GPUs: 1\u001b[0m\n", - "\u001b[32m2024-06-12 16:27:16.229\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36mis_resource_low\u001b[0m:\u001b[36m362\u001b[0m - \u001b[34m\u001b[1mInformation on device: 0\u001b[0m\n", - "\u001b[32m2024-06-12 16:27:16.230\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36mis_resource_low\u001b[0m:\u001b[36m365\u001b[0m - \u001b[1mTotal Memory: 7GB\u001b[0m\n", - "\u001b[32m2024-06-12 16:27:16.231\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36mis_resource_low\u001b[0m:\u001b[36m366\u001b[0m - \u001b[1mFree GPU memory: 5GB\u001b[0m\n", - "\u001b[32m2024-06-12 16:27:16.232\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36mload_embedding_model\u001b[0m:\u001b[36m103\u001b[0m - \u001b[34m\u001b[1mLoading embedding model from: .//models/sentence_transformers\u001b[0m\n", - "Fetching 19 files: 100%|██████████| 19/19 [00:00<00:00, 189742.32it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: .//models/sentence_transformers/models--BAAI--bge-base-en/snapshots/b737bf5dcc6ee8bdc530531266b4804a5d77b5d8/\n", - "Load pretrained SentenceTransformer: .//models/sentence_transformers/models--BAAI--bge-base-en/snapshots/b737bf5dcc6ee8bdc530531266b4804a5d77b5d8/\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "\u001b[32m2024-06-12 16:27:16.611\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.utils\u001b[0m:\u001b[36m_check_file_info\u001b[0m:\u001b[36m472\u001b[0m - \u001b[1mUsing information info from path .//var/lib/tmp/sleep_health_eda_table_info.jsonl\u001b[0m\n", - "\u001b[32m2024-06-12 16:27:16.615\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.query\u001b[0m:\u001b[36mgenerate_tasks\u001b[0m:\u001b[36m353\u001b[0m - \u001b[1mNumber of context queries found: 0\u001b[0m\n", - "\u001b[32m2024-06-12 16:27:16.616\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.query\u001b[0m:\u001b[36mgenerate_tasks\u001b[0m:\u001b[36m372\u001b[0m - \u001b[1mNumber of possible contextual queries to question: 0\u001b[0m\n", - "\u001b[32m2024-06-12 16:27:16.617\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.query\u001b[0m:\u001b[36m_query_tasks\u001b[0m:\u001b[36m210\u001b[0m - \u001b[1mCALLING OPENAI API\u001b[0m\n", - "\u001b[32m2024-06-12 16:27:16.617\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.query\u001b[0m:\u001b[36m_query_tasks\u001b[0m:\u001b[36m228\u001b[0m - \u001b[1mCALLING OPENAI API\u001b[0m\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Model Name: ->>>>>>>>>>>>>>> gpt-4\n", - "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", - "HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", - "1. Identify the relevant columns in the question. In this case, we are interested in 'Gender' and 'Sleep_Duration'.\n", - "2. Aggregate the data by 'Gender' to split the data into different groups according to gender.\n", - "3. For each 'Gender' group, calculate the average 'Sleep_Duration'.\n", - "4. Return the 'Gender' along with the corresponding average 'Sleep_Duration' for each gender.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[32m2024-06-12 16:27:20.448\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m607\u001b[0m - \u001b[1mComputing user request ...\u001b[0m\n", - "\u001b[32m2024-06-12 16:27:20.450\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36msidekick.query\u001b[0m:\u001b[36mgenerate_sql\u001b[0m:\u001b[36m427\u001b[0m - \u001b[34m\u001b[1mQuery Text:\n", - " \n", - " ### System: Act as a SQL Expert\n", - " # For table ['sleep_health_eda'], given an input *Question*, only generate syntactically correct sqlite SQL queries.\n", - " # Let's work it out in a detailed step by step way using the reasoning from *Tasks* section.\n", - " # Pick the SQL query which has the highest average log probability if more than one result is likely to answer the\n", - " candidate *Question*.\n", - " ### sqlite SQL tables\n", - " ### *Data:* \n", - "For table ['sleep_health_eda'] schema info is mentioned below,\n", - "\n", - "{\"Column Name\": \"Person_ID\", \"Column Type\": \"NUMERIC\"}\n", - "{\"Column Name\": \"Gender\", \"Column Type\": \"TEXT\", \"Sample Values\": [\"Male\", \"Female\"]}\n", - "{\"Column Name\": \"Age\", \"Column Type\": \"NUMERIC\"}\n", - "{\"Column Name\": \"Occupation\", \"Column Type\": \"TEXT\", \"Sample Values\": [\"Nurse\", \"Salesperson\", \"Manager\", \"Engineer\", \"Software Engineer\", \"Sales Representative\", \"Scientist\", \"Doctor\", \"Teacher\", \"Accountant\"]}\n", - "{\"Column Name\": \"Sleep_Duration\", \"Column Type\": \"NUMERIC\"}\n", - "{\"Column Name\": \"Quality_of_Sleep\", \"Column Type\": \"NUMERIC\"}\n", - "{\"Column Name\": \"Physical_Activity_Level\", \"Column Type\": \"NUMERIC\"}\n", - "{\"Column Name\": \"Stress_Level\", \"Column Type\": \"NUMERIC\"}\n", - "{\"Column Name\": \"BMI_Category\", \"Column Type\": \"TEXT\", \"Sample Values\": [\"Overweight\", \"Normal\", \"Obese\", \"Normal Weight\"]}\n", - "{\"Column Name\": \"Blood_Pressure\", \"Column Type\": \"TEXT\"}\n", - "{\"Column Name\": \"Heart_Rate\", \"Column Type\": \"NUMERIC\"}\n", - "{\"Column Name\": \"Daily_Steps\", \"Column Type\": \"NUMERIC\"}\n", - "{\"Column Name\": \"Sleep_Disorder\", \"Column Type\": \"TEXT\", \"Sample Values\": [\"None\", \"Sleep Apnea\", \"Insomnia\"]}\n", - " ### *History*:\n", - "\n", - " ### *Question*: For table ['sleep_health_eda'], What is the average sleep duration for each gender?\n", - " # SELECT 1\n", - " ### *Plan for table ['sleep_health_eda']*:\n", - "# 1. Identify the relevant columns in the question. In this case, we are interested in 'Gender' and 'Sleep_Duration'.\n", - "# 2. Aggregate the data by 'Gender' to split the data into different groups according to gender.\n", - "# 3. For each 'Gender' group, calculate the average 'Sleep_Duration'.\n", - "# 4. Return the 'Gender' along with the corresponding average 'Sleep_Duration' for each gender.\n", - " ### *Policies for SQL generation*:\n", - " # Avoid overly complex SQL queries, favor concise human readable SQL queries which are easy to understand and debug\n", - " # Avoid patterns that might be vulnerable to SQL injection\n", - " # Use values and column names that are explicitly mentioned in the question or in the *Data* section.\n", - " # DO NOT query for columns that do not exist\n", - " # Validate column names with the table name when needed\n", - " # DO NOT USE aggregate and window function together\n", - " # Use COUNT(1) instead of COUNT(*)\n", - " # Return with LIMIT 100\n", - " # Prefer NOT EXISTS to LEFT JOIN ON null id\n", - " # Avoid using the WITH statement\n", - " # When using DESC keep NULLs at the end\n", - " # Always cast the numerator as float when computing ratios\n", - " # If JSONB format found in Table schema, do pattern matching on keywords from the question and use SQL functions such as ->> or ->\n", - " # Use prepared statements with parameterized queries to prevent SQL injection\n", - " # Add explanation and reasoning for each SQL query\n", - " \u001b[0m\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "WARNING:llama_index.service_context:chunk_size_limit is deprecated, please specify chunk_size instead\n", - "chunk_size_limit is deprecated, please specify chunk_size instead\n", - "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"\n", - "HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"\n", - "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"\n", - "HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"\n", - "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", - "HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", - "INFO:llama_index.indices.struct_store.sql_query:> Table desc str: Schema of table sleep_health_eda:\n", - "Table 'sleep_health_eda' has columns: Person_ID (NUMERIC), Gender (TEXT), Age (NUMERIC), Occupation (TEXT), Sleep_Duration (NUMERIC), Quality_of_Sleep (NUMERIC), Physical_Activity_Level (NUMERIC), Stress_Level (NUMERIC), BMI_Category (TEXT), Blood_Pressure (TEXT), Heart_Rate (NUMERIC), Daily_Steps (NUMERIC), Sleep_Disorder (TEXT), and foreign keys: .\n", - "\n", - "> Table desc str: Schema of table sleep_health_eda:\n", - "Table 'sleep_health_eda' has columns: Person_ID (NUMERIC), Gender (TEXT), Age (NUMERIC), Occupation (TEXT), Sleep_Duration (NUMERIC), Quality_of_Sleep (NUMERIC), Physical_Activity_Level (NUMERIC), Stress_Level (NUMERIC), BMI_Category (TEXT), Blood_Pressure (TEXT), Heart_Rate (NUMERIC), Daily_Steps (NUMERIC), Sleep_Disorder (TEXT), and foreign keys: .\n", - "\n", - "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", - "HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", - "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", - "HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[32m2024-06-12 16:27:23.838\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m612\u001b[0m - \u001b[1mInput query: What is the average sleep duration for each gender?\u001b[0m\n", - "\u001b[32m2024-06-12 16:27:23.838\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m613\u001b[0m - \u001b[1mGenerated response:\n", - "\n", - "SELECT \"Gender\", AVG(\"Sleep_Duration\") AS \"Avg_Sleep_Duration\" FROM \"sleep_health_eda\" GROUP BY \"Gender\" ORDER BY \"Avg_Sleep_Duration\" DESC\u001b[0m\n", - "\u001b[32m2024-06-12 16:27:23.840\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36msidekick.prompter\u001b[0m:\u001b[36mask\u001b[0m:\u001b[36m639\u001b[0m - \u001b[1mAlternate responses:\n", - "\n", - "[]\u001b[0m\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Exiting...\n" - ] - } - ], + "outputs": [], "source": [ "# Alternate options\n", "res = query(\"What is the average sleep duration for each gender?\", table_name=\"sleep_health_eda\",\n", @@ -698,17 +225,7 @@ "execution_count": null, "id": "e415c0b9-466e-4417-ac1e-493914a83c36", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Question = I am thinking step by step: \n", - "\n", - "----Options----\n" - ] - } - ], + "outputs": [], "source": [ "print(f\"Question = {res[0][0]}\")\n", "print(\"----Options----\")\n", diff --git a/sidekick/query.py b/sidekick/query.py index 9eace8f..d150154 100644 --- a/sidekick/query.py +++ b/sidekick/query.py @@ -249,7 +249,12 @@ def self_correction(self, error_msg, input_query, remote_url, client_key): user_prompt = DEBUGGING_PROMPT["user_prompt"].format(ex_traceback=error_msg, qry_txt=input_query).strip() _response = [] _res = input_query - self_correction_model = os.getenv("SELF_CORRECTION_MODEL", "h2oai/h2ogpt-4096-llama2-70b-chat") + if os.getenv("OPENAI_API_KEY", None): + default_correction_model = "gpt-4" + else: + default_correction_model = "h2ogpt-4096-llama2-70b-chat" + self_correction_model = os.getenv("SELF_CORRECTION_MODEL", default_correction_model) + logger.info(f"Using LLM model: {self_correction_model} for self-correction") if "h2ogpt-" in self_correction_model: if remote_url and client_key and remote_url != "" and client_key != "": from h2ogpte import H2OGPTE @@ -314,6 +319,7 @@ def generate_response( res = response.metadata["sql_query"] return res except Exception as se: + logger.info(f"Error in generating response: {se}") # Take the SQL and make an attempt for correction _, ex_value, ex_traceback = sys.exc_info() qry_txt = ex_value.statement @@ -341,6 +347,7 @@ def generate_response( res = qry_txt return res except Exception as se: + logger.info(f"Error in generate_response, self correction: {se}") # Another exception occurred, return the original SQL res = qry_txt return res diff --git a/sidekick/utils.py b/sidekick/utils.py index ecf205c..04c6a89 100644 --- a/sidekick/utils.py +++ b/sidekick/utils.py @@ -25,7 +25,10 @@ REMOTE_LLMS = ["h2ogpt-sql-sqlcoder-34b-alpha", "h2ogpt-sql-sqlcoder2", "h2ogpt-sql-nsql-llama-2-7B", - "h2ogpt-sql-sqlcoder-7b-2", "gpt-3.5-turbo", "gpt-4-8k", "gpt-4-1106-preview-128k"] + "h2ogpt-sql-sqlcoder-7b-2", "gpt-3.5-turbo", "gpt-4-8k", "gpt-4-1106-preview-128k", + "gpt-4o", "gpt-4-turbo", "gpt-4-turbo-2024-04-09", "gpt-4-turbo-preview", "gpt-4-0125-preview", + "gpt-4-vision-preview", "gpt-4-1106-vision-preview", "gpt-4", "gpt-4-0613", "gpt-4-32k", + "gpt-4-32k-0613"] # clone of models from https://huggingface.co/models # suffix `h2ogpt-sql-` is added to avoid conflict with the original models (we haven't done any changes to the original models yet) @@ -38,9 +41,19 @@ "h2ogpt-sql-sqlcoder-34b-alpha": "defog/sqlcoder-34b-alpha", "h2ogpt-sql-nsql-llama-2-7B": "NumbersStation/nsql-llama-2-7B", "gpt-3.5-turbo": "gpt-3.5-turbo-1106", - "gpt-4-8k": "gpt-4", - "gpt-4-1106-preview-128k": "gpt-4-1106-preview", + "gpt-4-8k": "gpt-4", # leaving this for backward compatibility + "gpt-4-1106-preview-128k": "gpt-4-1106-preview", # leaving this for backward compatibility + "gpt-4o": "gpt-4o", + "gpt-4-turbo": "gpt-4-turbo", + "gpt-4-turbo-2024-04-09": "gpt-4-turbo-2024-04-09", + "gpt-4-turbo-preview": "gpt-4-turbo-preview", + "gpt-4-0125-preview": "gpt-4-0125-preview", + "gpt-4-vision-preview": "gpt-4o", # legacy to be deprecated + "gpt-4-1106-vision-preview": "gpt-4o", # legacy to be deprecated "gpt-4": "gpt-4", + "gpt-4-0613": "gpt-4-0613", + "gpt-4-32k": "gpt-4-turbo", # legacy to be deprecated + "gpt-4-32k-0613": "gpt-4-turbo", # legacy to be deprecated } MODEL_CHOICE_MAP_DEFAULT = { @@ -573,7 +586,12 @@ def check_vulnerability(input_query: str): _user_prompt = GUARDRAIL_PROMPT["user_prompt"].format(query_txt=input_query, schema=output_schema).strip() temp_result = None try: - llm_scanner = os.getenv("VULNERABILITY_SCANNER", "h2oai/h2ogpt-4096-llama2-70b-chat") + if os.getenv("OPENAI_API_KEY", None): + default_scanner_model = "gpt-4" + else: + default_scanner_model = "h2ogpt-4096-llama2-70b-chat" + llm_scanner = os.getenv("VULNERABILITY_SCANNER", default_scanner_model) + logger.info(f"Using LLM model: {llm_scanner} for vulnerability scan") if "h2ogpt-" in llm_scanner and h2ogpte_client_url !='' and h2ogpte_client_url and h2ogpte_client_key != '' and h2ogpte_client_key: from h2ogpte import H2OGPTE client = H2OGPTE(address=h2ogpte_client_url, api_key=h2ogpte_client_key) @@ -636,7 +654,12 @@ def generate_suggestions(remote_url, client_key:str, column_names: list, n_qs: i _user_prompt = RECOMMENDATION_PROMPT.format(data_schema=column_info, n_questions=n_qs ) - recommender_model = os.getenv("RECOMMENDATION_MODEL", "h2oai/h2ogpt-4096-llama2-70b-chat") + if os.getenv("OPENAI_API_KEY", None): + default_recommendation_model = "gpt-4" + else: + default_recommendation_model = "h2ogpt-4096-llama2-70b-chat" + recommender_model = os.getenv("RECOMMENDATION_MODEL", default_recommendation_model) + logger.info(f"Using LLM model: {recommender_model} for recommendation") if "h2ogpt-" in recommender_model: try: client = H2OGPTE(address=remote_url, api_key=client_key)