Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
244 changes: 163 additions & 81 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,77 +3,110 @@
import pandas as pd
import xml.etree.ElementTree as ET
import re
import requests
import time
import io
import yaml
from flask import Flask, render_template, request, send_from_directory, flash, redirect, url_for
from werkzeug.utils import secure_filename

# --- Configuration ---
# Load the Gemini API key from a local, git-ignored key.yaml file.
# A missing or empty key.yaml is tolerated at import time so the app can
# still start; get_regex_from_gemini() raises a clear error when the key
# is actually needed.
try:
    with open("key.yaml", "r") as f:
        config = yaml.safe_load(f) or {}
except FileNotFoundError:
    config = {}
GEMINI_API_KEY = config.get("GEMINI_API_KEY")

# NOTE(review): passing the key as a URL query parameter can leak it into
# proxy/server logs; the `x-goog-api-key` request header is safer transport.
GEMINI_API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key={GEMINI_API_KEY}"

# Local folders (absolute paths) for uploads and processed output.
UPLOAD_FOLDER = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'uploads')
PROCESSED_FOLDER = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'processed')
ALLOWED_EXTENSIONS = {'txt', 'log', 'json', 'xml', 'csv'}

# The canonical, final schema that every parsed log is normalised to.
GENERIC_SCHEMA = [
    'timestamp', 'log_level', 'message', 'service_name',
    'host_name', 'trace_id', 'error_details', 'metadata'
]

# --- App Initialization ---
app = Flask(__name__)
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
app.config['PROCESSED_FOLDER'] = PROCESSED_FOLDER
# SECURITY: a hard-coded secret key is acceptable for local testing only;
# load it from the environment in any real deployment.
app.config['SECRET_KEY'] = 'a-very-secret-key-for-local-testing'

# --- Helper Functions ---

def allowed_file(filename):
    """Return True if *filename* has an extension in ALLOWED_EXTENSIONS.

    The check is case-insensitive; a name with no '.' is always rejected.
    """
    return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS

def parse_log_file(file_path):
    """Parse a plain-text log file into a DataFrame.

    Tries a common web-server (Apache/Nginx combined) access-log pattern
    first; if no line matches, falls back to a single 'message' column
    holding each raw line. The file is read once and the lines reused for
    the fallback, instead of reopening the file a second time.

    Args:
        file_path: Path to the .log/.txt file.

    Returns:
        pandas.DataFrame with columns ip/timestamp/request/status/size on a
        successful structured parse, or one 'message' column otherwise.
    """
    # Combined-log-format pattern: IP, timestamp, request line, status, size.
    # Real-world formats vary widely; customise this regex as needed.
    log_pattern = re.compile(
        r'(?P<ip>\S+) \S+ \S+ \[(?P<timestamp>.*?)\] '
        r'"(?P<request>.*?)" (?P<status>\d{3}) (?P<size>\S+)'
    )
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        lines = f.readlines()

    data = [m.groupdict() for line in lines if (m := log_pattern.match(line))]
    if not data:
        # Fallback for unstructured logs: one record per raw line.
        data = [{'message': line.strip()} for line in lines]
    return pd.DataFrame(data)

def parse_json_file(file_path):
    """Parse a JSON file into a DataFrame.

    Handles both a standard JSON document (array of objects) and
    line-delimited JSON (JSONL). Blank or whitespace-only lines in JSONL
    input are skipped instead of raising from json.loads.

    Args:
        file_path: Path to the .json file.

    Returns:
        pandas.DataFrame built from the parsed records.
    """
    try:
        # First attempt: a standard JSON array of objects.
        df = pd.read_json(file_path)
    except ValueError:
        # Fallback: line-delimited JSON. Open with an explicit encoding and
        # ignore undecodable bytes, mirroring the log-file parser.
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            data = [json.loads(line) for line in f if line.strip()]
        df = pd.DataFrame(data)
    return df

def parse_xml_file(file_path):
    """Parse an XML file into a DataFrame.

    Assumes the root element holds many child elements, each child being one
    flat record whose sub-elements are the fields.
    """
    tree = ET.parse(file_path)
    root = tree.getroot()
    data = [{child.tag: child.text for child in elem} for elem in root]
    return pd.DataFrame(data)


def get_regex_from_gemini(log_sample):
    """Ask the Gemini API for a regex that parses *log_sample*.

    Sends a prompt containing the generic log schema plus the sample and
    expects the model to return a single Python regex with named capture
    groups mapping onto GENERIC_SCHEMA.

    Args:
        log_sample: First few lines of the uploaded log, newline-joined.

    Returns:
        dict: The raw JSON response body from the Gemini API.

    Raises:
        ValueError: If no API key was loaded from key.yaml.
        requests.exceptions.RequestException: If the request keeps failing
            after all retries, or fails with a non-retryable error.
    """
    if not GEMINI_API_KEY:
        raise ValueError("GEMINI_API_KEY not found in key.yaml.")

    # Prompt with a strict definition of the `message` field; the runtime
    # string below is intentionally kept verbatim.
    prompt = f"""
You are an expert log data transformation engine. Your task is to analyze a raw log sample and generate a Python-compatible regular expression (regex) that extracts key information and maps it to a predefined generic schema.

**Generic Log Schema:**
- `timestamp`: The full timestamp of the event.
- `log_level`: The severity of the event (e.g., INFO, WARN, ERROR).
- `message`: The primary, human-readable message.
- `service_name`: The application or service that generated the log.
- `host_name`: The hostname of the machine.
- `trace_id`: A unique identifier for correlating logs.
- `error_details`: Stack trace or detailed error messages.
- `metadata`: A catch-all for any other structured data.

**Log Sample to Analyze:**
---
{log_sample}
---

**Your Instructions:**
1. Create a single Python regex with named capture groups (e.g., `?P<group_name>...`).
2. The name of each capture group **MUST** match a key from the **Generic Log Schema** if possible. Use the following mapping rules:
    - For a unique identifier (like a UUID, `correlation_id`, `request_id`, `tid`, `euid`), name the capture group `trace_id`.
    - For an exception or stack trace (like `exception_details`, `stack_trace`), name the capture group `error_details`.
    - For a service or application name, use `service_name`.
    - For a server or machine name, use `host_name`.
3. If a field from the log does not logically map to any standard schema key, create a capture group with a descriptive, snake_case name (e.g., `machine_id`, `user_id`). The Python code will handle putting these into the `metadata` field.
4. **CRITICAL RULE:** The `timestamp` group must capture the entire date and time component as one field.
5. **CRITICAL RULE for `message`:** The `message` is the human-readable text that typically follows the timestamp and log level but comes BEFORE any other structured key-value pairs or identifiers.
6. The regex should match the entire line from start (`^`) to end (`$`).
7. **Return ONLY the raw regex string and absolutely nothing else.**

**Example 1: For a complex, pipe-delimited log like "2025-08-05 17:15:21 | WARNING | Profile updated | email-dispatcher | host-02 | a870-d0a6 | TimeoutException | ...":**
^(?P<timestamp>.*?)\\s*\\|\\s*(?P<log_level>\\w+)\\s*\\|\\s*(?P<message>.*?)\\s*\\|\\s*(?P<service_name>.*?)\\s*\\|\\s*(?P<host_name>.*?)\\s*\\|\\s*(?P<trace_id>.*?)\\s*\\|\\s*(?P<error_details>.*?)\\s*\\|\\s*(?P<json_payload>{{.*}})$

**Example 2: For a standard log like "[Sun Dec 04 04:51:18 2005] [error] mod_jk child workerEnv in error state 6":**
Your output should be a regex like:
^\\[(?P<timestamp>.*?)\\] \\[(?P<log_level>\\w+)\\] (?P<message>.*)$
"""
    payload = {"contents": [{"parts": [{"text": prompt}]}]}
    headers = {'Content-Type': 'application/json'}

    # Retry transient failures (HTTP 503/429) with exponential backoff:
    # 2s, then 4s between attempts.
    retries = 3
    backoff_factor = 2
    for attempt in range(retries):
        try:
            response = requests.post(GEMINI_API_URL, headers=headers, json=payload, timeout=90)
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            # e.response is None for pure connection errors; only HTTP-level
            # 503/429 are retried, and only while attempts remain.
            if e.response is not None and e.response.status_code in (503, 429) and attempt < retries - 1:
                time.sleep(backoff_factor)
                backoff_factor *= 2
                continue
            # Bare `raise` preserves the original traceback (unlike `raise e`).
            raise

    # Defensive: unreachable while retries >= 1 (the loop returns or raises).
    raise requests.exceptions.RequestException("Failed to get a response from the API after several retries.")

# --- Flask Routes ---

Expand All @@ -87,60 +120,109 @@ def upload_file():
"""Handles the file upload and processing logic."""
if 'file' not in request.files:
flash('No file part')
return redirect(request.url)
return redirect(url_for('index'))

file = request.files['file']

if file.filename == '':
flash('No selected file')
return redirect(request.url)
return redirect(url_for('index'))

if file and allowed_file(file.filename):
filename = secure_filename(file.filename)
upload_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
file.save(upload_path)

try:
# Determine file type and parse accordingly
filename = secure_filename(file.filename)
upload_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
file.save(upload_path)

with open(upload_path, 'r', encoding='utf-8', errors='ignore') as f:
file_content = f.read()

if not file_content:
flash("The uploaded file appears to be empty.")
return redirect(url_for('index'))

ext = filename.rsplit('.', 1)[1].lower()
df = None

if ext in ['log', 'txt']:
df = parse_log_file(upload_path)
log_sample = "\n".join(file_content.splitlines()[:10])

if not log_sample:
flash("File is empty.")
return redirect(url_for('index'))

gemini_response = get_regex_from_gemini(log_sample)
regex_pattern = gemini_response.get('candidates', [{}])[0].get('content', {}).get('parts', [{}])[0].get('text')

if not regex_pattern:
raise ValueError("LLM did not return a regex pattern.")

if regex_pattern.strip().startswith("```"):
regex_pattern = re.sub(r'```(python)?\n', '', regex_pattern)
regex_pattern = regex_pattern.strip().replace('```', '')

print("\n--- Generated Regex Pattern ---")
print(regex_pattern.strip())
print("-----------------------------\n")

pattern = re.compile(regex_pattern.strip())
parsed_data = [match.groupdict() for line in file_content.splitlines() if (match := pattern.match(line.strip()))]
df = pd.DataFrame(parsed_data)

if df.empty:
print("AI regex parsing failed. Using fallback method.")
flash("AI parsing could not find a pattern. Displaying raw lines instead.")
parsed_data = [{'message': line} for line in file_content.splitlines()]
df = pd.DataFrame(parsed_data)

elif ext == 'json':
df = parse_json_file(upload_path)
df = pd.read_json(upload_path)

elif ext == 'xml':
df = parse_xml_file(upload_path)
tree = ET.parse(upload_path)
records = [{child.tag: child.text for child in elem} for elem in tree.getroot()]
df = pd.DataFrame(records)

elif ext == 'csv':
df = pd.read_csv(upload_path)

if df is None or df.empty:
flash('Could not parse the file. The format might be unsupported or the file is empty.')
flash('Parsing failed. The application could not structure the data.')
return redirect(url_for('index'))

# Create the output CSV
base_filename = filename.rsplit('.', 1)[0]
csv_filename = f"{base_filename}.csv"
csv_path = os.path.join(app.config['PROCESSED_FOLDER'], csv_filename)
df.to_csv(csv_path, index=False)
metadata_cols = [col for col in df.columns if col not in GENERIC_SCHEMA]

# Redirect to the results page
return render_template('results.html', csv_filename=csv_filename)
if metadata_cols:
df['metadata'] = df[metadata_cols].apply(
lambda row: row.to_json(), axis=1
)
df = df.drop(columns=metadata_cols)

df = df.reindex(columns=GENERIC_SCHEMA)

csv_string = df.to_csv(index=False)

csv_filename = f"{filename.rsplit('.', 1)[0]}_processed.csv"

preview_headers = df.columns.values.tolist()
rows = df.head(100).fillna('').values.tolist()

return render_template('results.html',
csv_filename=csv_filename,
headers=preview_headers,
rows=rows,
full_csv_data=csv_string)

except Exception as e:
flash(f'An error occurred while processing the file: {e}')
print(f"An unexpected error occurred: {e}")
flash(f'An error occurred: {e}')
return redirect(url_for('index'))

else:
flash('File type not allowed.')
return redirect(request.url)

@app.route('/download/<filename>')
def download_file(filename):
    """Serves the processed CSV file for download."""
    # send_from_directory rejects path-traversal attempts, so a hostile
    # <filename> cannot escape PROCESSED_FOLDER; as_attachment forces a
    # download instead of inline rendering.
    return send_from_directory(app.config['PROCESSED_FOLDER'], filename, as_attachment=True)

return redirect(url_for('index'))

# --- Main Execution ---
if __name__ == '__main__':
    # Create upload and processed directories if they don't exist
    os.makedirs(UPLOAD_FOLDER, exist_ok=True)
    os.makedirs(PROCESSED_FOLDER, exist_ok=True)
    # NOTE(review): debug=True enables the interactive Werkzeug debugger and
    # auto-reload — fine locally, never in production.
    app.run(debug=True)
4 changes: 3 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
flask
pandas
lxml
requests
PyYAML
55 changes: 53 additions & 2 deletions templates/index.html
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

<!-- templates/index.html -->
<!DOCTYPE html>
<html lang="en">
Expand Down Expand Up @@ -38,7 +37,9 @@ <h1 class="text-3xl md:text-4xl font-bold text-gray-900">Log Data Processor</h1>
<form action="/upload" method="post" enctype="multipart/form-data" class="space-y-6">
<div>
<label for="file" class="block text-sm font-medium text-gray-700">Log File</label>
<div class="mt-2 flex justify-center px-6 pt-5 pb-6 border-2 border-gray-300 border-dashed rounded-md">

<!-- This is the drag-and-drop upload box -->
<div id="upload-box" class="mt-2 flex justify-center px-6 pt-5 pb-6 border-2 border-gray-300 border-dashed rounded-md">
<div class="space-y-1 text-center">
<svg class="mx-auto h-12 w-12 text-gray-400" stroke="currentColor" fill="none" viewBox="0 0 48 48" aria-hidden="true">
<path d="M28 8H12a4 4 0 00-4 4v20m32-12v8m0 0v8a4 4 0 01-4 4H12a4 4 0 01-4-4v-4m32-4l-3.172-3.172a4 4 0 00-5.656 0L28 28M8 32l9.172-9.172a4 4 0 015.656 0L28 28" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" />
Expand All @@ -53,6 +54,18 @@ <h1 class="text-3xl md:text-4xl font-bold text-gray-900">Log Data Processor</h1>
<p class="text-xs text-gray-500">.log, .txt, .json, .xml, .csv up to 50MB</p>
</div>
</div>

<!-- This block will be shown after a file is selected -->
<div id="file-info" class="hidden mt-2 flex items-center justify-between p-4 border border-gray-300 rounded-md bg-gray-50">
<div class="flex items-center">
<svg xmlns="http://www.w3.org/2000/svg" class="h-6 w-6 text-green-500" viewBox="0 0 20 20" fill="currentColor">
<path fill-rule="evenodd" d="M10 18a8 8 0 100-16 8 8 0 000 16zm3.707-9.293a1 1 0 00-1.414-1.414L9 10.586 7.707 9.293a1 1 0 00-1.414 1.414l2 2a1 1 0 001.414 0l4-4z" clip-rule="evenodd" />
</svg>
<span id="file-name" class="text-sm text-gray-800 ml-3 font-medium"></span>
</div>
<button id="remove-file" type="button" class="text-sm font-medium text-red-600 hover:text-red-800">Remove</button>
</div>

</div>
<div>
<button type="submit" class="w-full flex justify-center py-3 px-4 border border-transparent rounded-md shadow-sm text-sm font-medium text-white bg-indigo-600 hover:bg-indigo-700 focus:outline-none focus:ring-2 focus:ring-offset-2 focus:ring-indigo-500">
Expand All @@ -62,5 +75,43 @@ <h1 class="text-3xl md:text-4xl font-bold text-gray-900">Log Data Processor</h1>
</form>
</div>

<script>
    // Toggle between the drag-and-drop box and the selected-file summary.
    const input = document.getElementById('file-upload');
    const dropZone = document.getElementById('upload-box');
    const summary = document.getElementById('file-info');
    const nameLabel = document.getElementById('file-name');
    const clearBtn = document.getElementById('remove-file');

    // A file was chosen: show its name, swap the boxes.
    input.addEventListener('change', () => {
        if (!input.files.length) {
            return;
        }
        nameLabel.textContent = input.files[0].name;
        dropZone.classList.add('hidden');
        summary.classList.remove('hidden');
    });

    // "Remove" clears the selection and restores the drop zone.
    clearBtn.addEventListener('click', () => {
        input.value = '';
        dropZone.classList.remove('hidden');
        summary.classList.add('hidden');
    });
</script>

</body>
</html>
Loading