From 0d705dff3600df0ccf22289656aac3e909c6649a Mon Sep 17 00:00:00 2001 From: Devanshu-gif <23f2004399@ds.study.iitm.ac.in> Date: Wed, 30 Jul 2025 02:22:24 +0530 Subject: [PATCH 1/3] updated results --- app.py | 195 ++++++++++++++++++++++++++++------------- requirements.txt | 4 +- templates/index.html | 55 +++++++++++- templates/results.html | 72 +++++++++++---- 4 files changed, 245 insertions(+), 81 deletions(-) diff --git a/app.py b/app.py index d33013e..4b719c6 100644 --- a/app.py +++ b/app.py @@ -3,21 +3,30 @@ import pandas as pd import xml.etree.ElementTree as ET import re +import requests +import time from flask import Flask, render_template, request, send_from_directory, flash, redirect, url_for from werkzeug.utils import secure_filename +import yaml # --- Configuration --- -# Define the paths for file uploads and processed files. -# It's good practice to use absolute paths. +# IMPORTANT: You must get your own API key from Google AI Studio. +# https://aistudio.google.com/app/apikey +with open("key.yaml", "r") as f: + config = yaml.safe_load(f) +GEMINI_API_KEY = config.get("GEMINI_API_KEY") +GEMINI_API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key={GEMINI_API_KEY}" + UPLOAD_FOLDER = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'uploads') PROCESSED_FOLDER = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'processed') -ALLOWED_EXTENSIONS = {'txt', 'log', 'json', 'xml'} +# --- UPDATED: Added 'csv' to allowed extensions --- +ALLOWED_EXTENSIONS = {'txt', 'log', 'json', 'xml', 'csv'} # --- App Initialization --- app = Flask(__name__) app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER app.config['PROCESSED_FOLDER'] = PROCESSED_FOLDER -app.config['SECRET_KEY'] = 'supersecretkey' # Change this in a real application +app.config['SECRET_KEY'] = 'supersecretkey' # --- Helper Functions --- @@ -26,54 +35,60 @@ def allowed_file(filename): return '.' 
in filename and \ filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS -def parse_log_file(file_path): - """ - A generic log parser. This is a basic example. - Real-world log files can be very complex. This function tries to extract - common patterns like IP addresses, timestamps, and request methods. - You will likely need to customize this regex for your specific log formats. - """ - # Example Regex: Captures IP, timestamp, request method/path, status, and size. - # This is a common pattern for web server logs (e.g., Apache, Nginx). - log_pattern = re.compile(r'(?P\S+) \S+ \S+ \[(?P.*?)\] "(?P.*?)" (?P\d{3}) (?P\S+)') - data = [] - with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: - for line in f: - match = log_pattern.match(line) - if match: - data.append(match.groupdict()) - if not data: - # Fallback for unstructured logs: treat each line as a single message. - with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: - data = [{'message': line.strip()} for line in f] - return pd.DataFrame(data) - -def parse_json_file(file_path): +def get_regex_from_gemini(log_sample): """ - Parses a JSON file. Handles both standard JSON and line-delimited JSON (JSONL). + Sends a sample of the log file to the Gemini API to determine a parsing regex. + This version asks for the raw regex string directly to avoid JSON formatting errors. """ - try: - # Try parsing as a standard JSON array of objects - df = pd.read_json(file_path) - except ValueError: - # If that fails, try parsing as line-delimited JSON - with open(file_path, 'r') as f: - data = [json.loads(line) for line in f] - df = pd.DataFrame(data) - return df - -def parse_xml_file(file_path): + if GEMINI_API_KEY == "YOUR_API_KEY_HERE": + raise ValueError("Gemini API key is not configured. Please set it in your environment or in the script.") + + # --- UPDATED PROMPT --- + # Asks for the raw regex string, not a JSON object. This is more robust. 
+ prompt = f""" + Analyze the following log data sample. Your task is to generate a single Python-compatible regular expression (regex) that can capture the distinct columns. + + Data Sample: + --- + {log_sample} + --- + + Instructions: + 1. Create a Python regex string with named capture groups (e.g., `?P...`). The group names should be concise, descriptive, and in snake_case. These names will become the column headers. + 2. **CRITICAL RULE:** If a timestamp (containing date, time, or both) is present, you MUST capture the entire timestamp in a single group named `timestamp`. Do NOT split it into multiple columns. + 3. The regex MUST account for variable content, like messages that can contain spaces. Use non-greedy matching (`.*?`) for such fields. + 4. The regex should match the entire line from start (`^`) to end (`$`). + 5. **Return ONLY the raw regex string and absolutely nothing else.** Do not wrap it in quotes, markdown, or JSON. + + Example output for a log like "2025-07-27 19:56:34 [INFO] Quality check passed Machine-007 Operator-07": + ^(?P\\d{{4}}-\\d{{2}}-\\d{{2}} \\d{{2}}:\\d{{2}}:\\d{{2}}) \\[(?P\\w+)\\] (?P.*?) (?PMachine-\\d{{3}}) (?POperator-\\d{{2}})$ """ - Parses an XML file. Assumes a structure where the root has many - child elements, and each child is a record. 
- """ - tree = ET.parse(file_path) - root = tree.getroot() - data = [] - for elem in root: - record = {child.tag: child.text for child in elem} - data.append(record) - return pd.DataFrame(data) + + payload = { + "contents": [{"parts": [{"text": prompt}]}] + } + + headers = {'Content-Type': 'application/json'} + + # --- Retry Logic --- + retries = 3 + backoff_factor = 1 + for i in range(retries): + try: + response = requests.post(GEMINI_API_URL, headers=headers, json=payload, timeout=60) + response.raise_for_status() + return response.json() + except requests.exceptions.RequestException as e: + if e.response is not None and e.response.status_code == 503 and i < retries - 1: + print(f"Service unavailable, retrying in {backoff_factor} seconds...") + time.sleep(backoff_factor) + backoff_factor *= 2 + continue + else: + raise e + + raise requests.exceptions.RequestException("Failed to get a response from the API after several retries.") + # --- Flask Routes --- @@ -85,15 +100,16 @@ def index(): @app.route('/upload', methods=['POST']) def upload_file(): """Handles the file upload and processing logic.""" + # --- FIXED: Redirect to 'index' on error --- if 'file' not in request.files: flash('No file part') - return redirect(request.url) + return redirect(url_for('index')) file = request.files['file'] if file.filename == '': flash('No selected file') - return redirect(request.url) + return redirect(url_for('index')) if file and allowed_file(file.filename): filename = secure_filename(file.filename) @@ -101,46 +117,99 @@ def upload_file(): file.save(upload_path) try: - # Determine file type and parse accordingly ext = filename.rsplit('.', 1)[1].lower() df = None + if ext in ['log', 'txt']: - df = parse_log_file(upload_path) + with open(upload_path, 'r', encoding='utf-8', errors='ignore') as f: + log_sample = "".join(f.readlines(10)) + + if not log_sample: + flash("File is empty.") + return redirect(url_for('index')) + + # --- NEW ROBUST PARSING --- + gemini_response = 
get_regex_from_gemini(log_sample) + + # Safely extract the text from the API response + regex_pattern = gemini_response.get('candidates', [{}])[0].get('content', {}).get('parts', [{}])[0].get('text') + + if not regex_pattern: + raise ValueError("LLM did not return a regex pattern.") + + # Clean up potential markdown formatting from the response + if regex_pattern.strip().startswith("```"): + regex_pattern = re.sub(r'```(python)?\n', '', regex_pattern) + regex_pattern = regex_pattern.strip().replace('```', '') + + print("\n--- Generated Regex Pattern ---") + print(regex_pattern.strip()) + print("-----------------------------\n") + + pattern = re.compile(regex_pattern.strip()) + parsed_data = [] + with open(upload_path, 'r', encoding='utf-8', errors='ignore') as f: + for line in f: + match = pattern.match(line.strip()) + if match: + parsed_data.append(match.groupdict()) + + df = pd.DataFrame(parsed_data) + elif ext == 'json': - df = parse_json_file(upload_path) + try: + with open(upload_path, 'r', encoding='utf-8') as f: + json_data = json.load(f) + df = pd.json_normalize(json_data) + except (json.JSONDecodeError, TypeError): + df = pd.read_json(upload_path, lines=True, encoding='utf-8') + elif ext == 'xml': - df = parse_xml_file(upload_path) + tree = ET.parse(upload_path) + root = tree.getroot() + records = [{child.tag: child.text for child in elem} for elem in root] + df = pd.DataFrame(records) + + # --- NEW: Added direct CSV handling --- + elif ext == 'csv': + df = pd.read_csv(upload_path) + if df is None or df.empty: flash('Could not parse the file. 
The format might be unsupported or the file is empty.') return redirect(url_for('index')) - # Create the output CSV base_filename = filename.rsplit('.', 1)[0] - csv_filename = f"{base_filename}.csv" + csv_filename = f"{base_filename}_processed.csv" # Suffix to avoid name collision csv_path = os.path.join(app.config['PROCESSED_FOLDER'], csv_filename) df.to_csv(csv_path, index=False) - # Redirect to the results page - return render_template('results.html', csv_filename=csv_filename) + preview_df = pd.read_csv(csv_path) + preview_limit = 100 + preview_headers = preview_df.columns.values.tolist() + rows = preview_df.head(preview_limit).values.tolist() + + return render_template('results.html', + csv_filename=csv_filename, + headers=preview_headers, + rows=rows) except Exception as e: - flash(f'An error occurred while processing the file: {e}') + flash(f'An error occurred: {e}') return redirect(url_for('index')) else: + # --- FIXED: Redirect to 'index' on error --- flash('File type not allowed.') - return redirect(request.url) + return redirect(url_for('index')) @app.route('/download/') def download_file(filename): """Serves the processed CSV file for download.""" return send_from_directory(app.config['PROCESSED_FOLDER'], filename, as_attachment=True) - # --- Main Execution --- if __name__ == '__main__': - # Create upload and processed directories if they don't exist os.makedirs(UPLOAD_FOLDER, exist_ok=True) os.makedirs(PROCESSED_FOLDER, exist_ok=True) app.run(debug=True) diff --git a/requirements.txt b/requirements.txt index e5ca5cc..7048bde 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,5 @@ flask pandas -lxmal \ No newline at end of file +lxml +requests +PyYAML \ No newline at end of file diff --git a/templates/index.html b/templates/index.html index 0ca0dbc..765375c 100644 --- a/templates/index.html +++ b/templates/index.html @@ -1,4 +1,3 @@ - @@ -38,7 +37,9 

Log Data Processor

-
+ + +

Log Data Processor

.log, .txt, .json, .xml, .csv up to 50MB

+ + + +
+ + \ No newline at end of file diff --git a/templates/results.html b/templates/results.html index 8c8c3a2..37ef5e4 100644 --- a/templates/results.html +++ b/templates/results.html @@ -11,27 +11,69 @@ - + -
-
- - - +
+
+
+ + + +
+

Success!

+

Your file has been processed. Preview the first 100 rows below.

-

Success!

-

Your file has been processed and converted to CSV.

- - - Download CSV File - -
- Process another file + + + + +
+
+ + + + {% for header in headers %} + + + {% endfor %} + + + + {% for row in rows %} + + {% for cell in row %} + + + {% endfor %} + + {% endfor %} + +
+ {{ header }} +
+ {{ cell }} +
+
+
- + \ No newline at end of file From 217933bcbe6b85f3501c8b3d88b4a3d50b86177a Mon Sep 17 00:00:00 2001 From: Devanshu-gif <23f2004399@ds.study.iitm.ac.in> Date: Wed, 6 Aug 2025 05:06:54 +0530 Subject: [PATCH 2/3] Generalised log data struture --- app.py | 175 ++++++++++++++++++++++------------------- templates/results.html | 34 ++++++-- 2 files changed, 121 insertions(+), 88 deletions(-) diff --git a/app.py b/app.py index 4b719c6..3d64af1 100644 --- a/app.py +++ b/app.py @@ -5,82 +5,97 @@ import re import requests import time -from flask import Flask, render_template, request, send_from_directory, flash, redirect, url_for -from werkzeug.utils import secure_filename +import io import yaml +from flask import Flask, render_template, request, flash, redirect, url_for +from werkzeug.utils import secure_filename # --- Configuration --- -# IMPORTANT: You must get your own API key from Google AI Studio. -# https://aistudio.google.com/app/apikey +# Reading the key from your local key.yaml file with open("key.yaml", "r") as f: config = yaml.safe_load(f) GEMINI_API_KEY = config.get("GEMINI_API_KEY") + +# Using the correct, powerful model for this task GEMINI_API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key={GEMINI_API_KEY}" +# Using local folders for testing UPLOAD_FOLDER = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'uploads') PROCESSED_FOLDER = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'processed') -# --- UPDATED: Added 'csv' to allowed extensions --- ALLOWED_EXTENSIONS = {'txt', 'log', 'json', 'xml', 'csv'} +# The canonical, final schema for all logs +GENERIC_SCHEMA = [ + 'timestamp', 'log_level', 'message', 'service_name', + 'host_name', 'trace_id', 'error_details', 'metadata' +] + # --- App Initialization --- app = Flask(__name__) app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER app.config['PROCESSED_FOLDER'] = PROCESSED_FOLDER -app.config['SECRET_KEY'] = 'supersecretkey' 
+app.config['SECRET_KEY'] = 'a-very-secret-key-for-local-testing' # --- Helper Functions --- def allowed_file(filename): """Checks if the file's extension is in the ALLOWED_EXTENSIONS set.""" - return '.' in filename and \ - filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS + return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS def get_regex_from_gemini(log_sample): """ Sends a sample of the log file to the Gemini API to determine a parsing regex. - This version asks for the raw regex string directly to avoid JSON formatting errors. """ - if GEMINI_API_KEY == "YOUR_API_KEY_HERE": - raise ValueError("Gemini API key is not configured. Please set it in your environment or in the script.") + if not GEMINI_API_KEY: + raise ValueError("GEMINI_API_KEY not found in key.yaml.") - # --- UPDATED PROMPT --- - # Asks for the raw regex string, not a JSON object. This is more robust. + # --- UPDATED PROMPT with stricter message definition --- prompt = f""" - Analyze the following log data sample. Your task is to generate a single Python-compatible regular expression (regex) that can capture the distinct columns. - - Data Sample: + You are an expert log data transformation engine. Your task is to analyze a raw log sample and generate a Python-compatible regular expression (regex) that extracts key information and maps it to a predefined generic schema. + + **Generic Log Schema:** + - `timestamp`: The full timestamp of the event. + - `log_level`: The severity of the event (e.g., INFO, WARN, ERROR). + - `message`: The primary, human-readable message. + - `service_name`: The application or service that generated the log. + - `host_name`: The hostname of the machine. + - `trace_id`: A unique identifier for correlating logs. + - `error_details`: Stack trace or detailed error messages. + - `metadata`: A catch-all for any other structured data. + + **Log Sample to Analyze:** --- {log_sample} --- - Instructions: - 1. 
Create a Python regex string with named capture groups (e.g., `?P...`). The group names should be concise, descriptive, and in snake_case. These names will become the column headers. - 2. **CRITICAL RULE:** If a timestamp (containing date, time, or both) is present, you MUST capture the entire timestamp in a single group named `timestamp`. Do NOT split it into multiple columns. - 3. The regex MUST account for variable content, like messages that can contain spaces. Use non-greedy matching (`.*?`) for such fields. - 4. The regex should match the entire line from start (`^`) to end (`$`). - 5. **Return ONLY the raw regex string and absolutely nothing else.** Do not wrap it in quotes, markdown, or JSON. - - Example output for a log like "2025-07-27 19:56:34 [INFO] Quality check passed Machine-007 Operator-07": - ^(?P\\d{{4}}-\\d{{2}}-\\d{{2}} \\d{{2}}:\\d{{2}}:\\d{{2}}) \\[(?P\\w+)\\] (?P.*?) (?PMachine-\\d{{3}}) (?POperator-\\d{{2}})$ - """ - - payload = { - "contents": [{"parts": [{"text": prompt}]}] - } - + **Your Instructions:** + 1. Create a single Python regex with named capture groups (e.g., `?P...`). + 2. The name of each capture group **MUST** match a key from the **Generic Log Schema** if possible. Use the following mapping rules: + - For a unique identifier (like a UUID, `correlation_id`, `request_id`), name the capture group `trace_id`. + - For an exception or stack trace (like `exception_details`, `stack_trace`), name the capture group `error_details`. + - For a service or application name, use `service_name`. + - For a server or machine name, use `host_name`. + 3. If a field from the log does not logically map to any standard schema key, create a capture group with a descriptive, snake_case name (e.g., `machine_id`, `user_id`). The Python code will handle putting these into the `metadata` field. + 4. **CRITICAL RULE:** The `timestamp` group must capture the entire date and time component as one field. + 5. 
**CRITICAL RULE for `message`:** The `message` is the human-readable text that typically follows the timestamp and log level but comes BEFORE any other structured key-value pairs or identifiers. + 6. The regex should match the entire line from start (`^`) to end (`$`). + 7. **Return ONLY the raw regex string and absolutely nothing else.** + + **Example Output for a log like "2025-08-05 17:15:21 | WARNING | Profile updated | email-dispatcher | host-02 | a870-d0a6 | TimeoutException | ...":** + ^(?P.*?)\\s*\\|\\s*(?P\\w+)\\s*\\|\\s*(?P.*?)\\s*\\|\\s*(?P.*?)\\s*\\|\\s*(?P.*?)\\s*\\|\\s*(?P.*?)\\s*\\|\\s*(?P.*?)\\s*\\|\\s*(?P{{.*}})$ + """ + payload = {"contents": [{"parts": [{"text": prompt}]}]} headers = {'Content-Type': 'application/json'} - # --- Retry Logic --- retries = 3 - backoff_factor = 1 + backoff_factor = 2 for i in range(retries): try: - response = requests.post(GEMINI_API_URL, headers=headers, json=payload, timeout=60) + response = requests.post(GEMINI_API_URL, headers=headers, json=payload, timeout=90) response.raise_for_status() return response.json() except requests.exceptions.RequestException as e: - if e.response is not None and e.response.status_code == 503 and i < retries - 1: - print(f"Service unavailable, retrying in {backoff_factor} seconds...") + if e.response is not None and e.response.status_code in [503, 429] and i < retries - 1: time.sleep(backoff_factor) backoff_factor *= 2 continue @@ -89,7 +104,6 @@ def get_regex_from_gemini(log_sample): raise requests.exceptions.RequestException("Failed to get a response from the API after several retries.") - # --- Flask Routes --- @app.route('/', methods=['GET']) @@ -100,7 +114,6 @@ def index(): @app.route('/upload', methods=['POST']) def upload_file(): """Handles the file upload and processing logic.""" - # --- FIXED: Redirect to 'index' on error --- if 'file' not in request.files: flash('No file part') return redirect(url_for('index')) @@ -112,32 +125,34 @@ def upload_file(): return 
redirect(url_for('index')) if file and allowed_file(file.filename): - filename = secure_filename(file.filename) - upload_path = os.path.join(app.config['UPLOAD_FOLDER'], filename) - file.save(upload_path) - try: + filename = secure_filename(file.filename) + upload_path = os.path.join(app.config['UPLOAD_FOLDER'], filename) + file.save(upload_path) + + with open(upload_path, 'r', encoding='utf-8', errors='ignore') as f: + file_content = f.read() + + if not file_content: + flash("The uploaded file appears to be empty.") + return redirect(url_for('index')) + ext = filename.rsplit('.', 1)[1].lower() df = None if ext in ['log', 'txt']: - with open(upload_path, 'r', encoding='utf-8', errors='ignore') as f: - log_sample = "".join(f.readlines(10)) + log_sample = "\n".join(file_content.splitlines()[:10]) if not log_sample: flash("File is empty.") return redirect(url_for('index')) - # --- NEW ROBUST PARSING --- gemini_response = get_regex_from_gemini(log_sample) - - # Safely extract the text from the API response regex_pattern = gemini_response.get('candidates', [{}])[0].get('content', {}).get('parts', [{}])[0].get('text') if not regex_pattern: raise ValueError("LLM did not return a regex pattern.") - - # Clean up potential markdown formatting from the response + if regex_pattern.strip().startswith("```"): regex_pattern = re.sub(r'```(python)?\n', '', regex_pattern) regex_pattern = regex_pattern.strip().replace('```', '') @@ -147,68 +162,62 @@ def upload_file(): print("-----------------------------\n") pattern = re.compile(regex_pattern.strip()) - parsed_data = [] - with open(upload_path, 'r', encoding='utf-8', errors='ignore') as f: - for line in f: - match = pattern.match(line.strip()) - if match: - parsed_data.append(match.groupdict()) - + parsed_data = [match.groupdict() for line in file_content.splitlines() if (match := pattern.match(line.strip()))] df = pd.DataFrame(parsed_data) + if df.empty: + print("AI regex parsing failed. 
Using fallback method.") + flash("AI parsing could not find a pattern. Displaying raw lines instead.") + parsed_data = [{'message': line} for line in file_content.splitlines()] + df = pd.DataFrame(parsed_data) + elif ext == 'json': - try: - with open(upload_path, 'r', encoding='utf-8') as f: - json_data = json.load(f) - df = pd.json_normalize(json_data) - except (json.JSONDecodeError, TypeError): - df = pd.read_json(upload_path, lines=True, encoding='utf-8') + df = pd.read_json(upload_path) elif ext == 'xml': tree = ET.parse(upload_path) - root = tree.getroot() - records = [{child.tag: child.text for child in elem} for elem in root] + records = [{child.tag: child.text for child in elem} for elem in tree.getroot()] df = pd.DataFrame(records) - # --- NEW: Added direct CSV handling --- elif ext == 'csv': df = pd.read_csv(upload_path) - if df is None or df.empty: - flash('Could not parse the file. The format might be unsupported or the file is empty.') + flash('Parsing failed. The application could not structure the data.') return redirect(url_for('index')) - base_filename = filename.rsplit('.', 1)[0] - csv_filename = f"{base_filename}_processed.csv" # Suffix to avoid name collision - csv_path = os.path.join(app.config['PROCESSED_FOLDER'], csv_filename) - df.to_csv(csv_path, index=False) + metadata_cols = [col for col in df.columns if col not in GENERIC_SCHEMA] + + if metadata_cols: + df['metadata'] = df[metadata_cols].apply( + lambda row: row.to_json(), axis=1 + ) + df = df.drop(columns=metadata_cols) + + df = df.reindex(columns=GENERIC_SCHEMA) + + csv_string = df.to_csv(index=False) + + csv_filename = f"{filename.rsplit('.', 1)[0]}_processed.csv" - preview_df = pd.read_csv(csv_path) - preview_limit = 100 - preview_headers = preview_df.columns.values.tolist() - rows = preview_df.head(preview_limit).values.tolist() + preview_headers = df.columns.values.tolist() + rows = df.head(100).fillna('').values.tolist() return render_template('results.html', 
csv_filename=csv_filename, headers=preview_headers, - rows=rows) + rows=rows, + full_csv_data=csv_string) except Exception as e: + print(f"An unexpected error occurred: {e}") flash(f'An error occurred: {e}') return redirect(url_for('index')) else: - # --- FIXED: Redirect to 'index' on error --- flash('File type not allowed.') return redirect(url_for('index')) -@app.route('/download/') -def download_file(filename): - """Serves the processed CSV file for download.""" - return send_from_directory(app.config['PROCESSED_FOLDER'], filename, as_attachment=True) - -# --- Main Execution --- if __name__ == '__main__': os.makedirs(UPLOAD_FOLDER, exist_ok=True) os.makedirs(PROCESSED_FOLDER, exist_ok=True) diff --git a/templates/results.html b/templates/results.html index 37ef5e4..784b741 100644 --- a/templates/results.html +++ b/templates/results.html @@ -35,9 +35,10 @@

Success!

- + + Process Another File @@ -61,9 +62,13 @@

Success!

{% for row in rows %} {% for cell in row %} - - {{ cell }} + + {% if cell == '' or cell is none %} + null + {% else %} + {{ cell }} + {% endif %} {% endfor %} @@ -72,8 +77,27 @@

Success!

-
+ + + \ No newline at end of file From 232a50eca40f100954935d918aed9fef23d5fb84 Mon Sep 17 00:00:00 2001 From: Devanshu-gif <23f2004399@ds.study.iitm.ac.in> Date: Wed, 6 Aug 2025 06:03:31 +0530 Subject: [PATCH 3/3] some issues resolved --- app.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/app.py b/app.py index 3d64af1..cafd1ad 100644 --- a/app.py +++ b/app.py @@ -71,7 +71,7 @@ def get_regex_from_gemini(log_sample): **Your Instructions:** 1. Create a single Python regex with named capture groups (e.g., `?P...`). 2. The name of each capture group **MUST** match a key from the **Generic Log Schema** if possible. Use the following mapping rules: - - For a unique identifier (like a UUID, `correlation_id`, `request_id`), name the capture group `trace_id`. + - For a unique identifier (like a UUID, `correlation_id`, `request_id`, `tid`, `euid`), name the capture group `trace_id`. - For an exception or stack trace (like `exception_details`, `stack_trace`), name the capture group `error_details`. - For a service or application name, use `service_name`. - For a server or machine name, use `host_name`. @@ -81,8 +81,12 @@ def get_regex_from_gemini(log_sample): 6. The regex should match the entire line from start (`^`) to end (`$`). 7. 
**Return ONLY the raw regex string and absolutely nothing else.** - **Example Output for a log like "2025-08-05 17:15:21 | WARNING | Profile updated | email-dispatcher | host-02 | a870-d0a6 | TimeoutException | ...":** + **Example 1: For a complex, pipe-delimited log like "2025-08-05 17:15:21 | WARNING | Profile updated | email-dispatcher | host-02 | a870-d0a6 | TimeoutException | ...":** ^(?P.*?)\\s*\\|\\s*(?P\\w+)\\s*\\|\\s*(?P.*?)\\s*\\|\\s*(?P.*?)\\s*\\|\\s*(?P.*?)\\s*\\|\\s*(?P.*?)\\s*\\|\\s*(?P.*?)\\s*\\|\\s*(?P{{.*}})$ + + **Example 2: For a standard log like "[Sun Dec 04 04:51:18 2005] [error] mod_jk child workerEnv in error state 6":** + Your output should be a regex like: + ^\\[(?P.*?)\\] \\[(?P\\w+)\\] (?P.*)$ """ payload = {"contents": [{"parts": [{"text": prompt}]}]} headers = {'Content-Type': 'application/json'}