Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
244 changes: 163 additions & 81 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,77 +3,110 @@
import pandas as pd
import xml.etree.ElementTree as ET
import re
import requests
import time
import io
import yaml
from flask import Flask, render_template, request, send_from_directory, flash, redirect, url_for
from werkzeug.utils import secure_filename

# --- Configuration ---
# Load the Gemini API key from a local, git-ignored key.yaml file.
# A missing or empty key.yaml is tolerated at import time so the app can
# still start; get_regex_from_gemini() raises a clear error when the key
# is actually needed.
try:
    with open("key.yaml", "r") as f:
        config = yaml.safe_load(f) or {}
except FileNotFoundError:
    config = {}
GEMINI_API_KEY = config.get("GEMINI_API_KEY")

# NOTE(review): passing the key as a URL query parameter can leak it into
# proxy/server logs; the `x-goog-api-key` request header is safer transport.
GEMINI_API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key={GEMINI_API_KEY}"

# Local folders (absolute paths) for uploads and processed output.
UPLOAD_FOLDER = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'uploads')
PROCESSED_FOLDER = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'processed')
ALLOWED_EXTENSIONS = {'txt', 'log', 'json', 'xml', 'csv'}

# The canonical, final schema that every parsed log is normalised to.
GENERIC_SCHEMA = [
    'timestamp', 'log_level', 'message', 'service_name',
    'host_name', 'trace_id', 'error_details', 'metadata'
]

# --- App Initialization ---
app = Flask(__name__)
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
app.config['PROCESSED_FOLDER'] = PROCESSED_FOLDER
# SECURITY: a hard-coded secret key is acceptable for local testing only;
# load it from the environment in any real deployment.
app.config['SECRET_KEY'] = 'a-very-secret-key-for-local-testing'

# --- Helper Functions ---

def allowed_file(filename):
    """Return True if *filename* has an extension in ALLOWED_EXTENSIONS.

    The check is case-insensitive; a name with no '.' is always rejected.
    """
    return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS

def parse_log_file(file_path):
    """Parse a plain-text log file into a DataFrame.

    Tries a common web-server (Apache/Nginx combined) access-log pattern
    first; if no line matches, falls back to a single 'message' column
    holding each raw line. The file is read once and the lines reused for
    the fallback, instead of reopening the file a second time.

    Args:
        file_path: Path to the .log/.txt file.

    Returns:
        pandas.DataFrame with columns ip/timestamp/request/status/size on a
        successful structured parse, or one 'message' column otherwise.
    """
    # Combined-log-format pattern: IP, timestamp, request line, status, size.
    # Real-world formats vary widely; customise this regex as needed.
    log_pattern = re.compile(
        r'(?P<ip>\S+) \S+ \S+ \[(?P<timestamp>.*?)\] '
        r'"(?P<request>.*?)" (?P<status>\d{3}) (?P<size>\S+)'
    )
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        lines = f.readlines()

    data = [m.groupdict() for line in lines if (m := log_pattern.match(line))]
    if not data:
        # Fallback for unstructured logs: one record per raw line.
        data = [{'message': line.strip()} for line in lines]
    return pd.DataFrame(data)

def parse_json_file(file_path):
    """Parse a JSON file into a DataFrame.

    Handles both a standard JSON document (array of objects) and
    line-delimited JSON (JSONL). Blank or whitespace-only lines in JSONL
    input are skipped instead of raising from json.loads.

    Args:
        file_path: Path to the .json file.

    Returns:
        pandas.DataFrame built from the parsed records.
    """
    try:
        # First attempt: a standard JSON array of objects.
        df = pd.read_json(file_path)
    except ValueError:
        # Fallback: line-delimited JSON. Open with an explicit encoding and
        # ignore undecodable bytes, mirroring the log-file parser.
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            data = [json.loads(line) for line in f if line.strip()]
        df = pd.DataFrame(data)
    return df

def parse_xml_file(file_path):
    """Parse an XML file into a DataFrame.

    Assumes the root element holds many child elements, each child being one
    flat record whose sub-elements are the fields.
    """
    tree = ET.parse(file_path)
    root = tree.getroot()
    data = [{child.tag: child.text for child in elem} for elem in root]
    return pd.DataFrame(data)


def get_regex_from_gemini(log_sample):
    """Ask the Gemini API for a regex that parses *log_sample*.

    Sends a prompt containing the generic log schema plus the sample and
    expects the model to return a single Python regex with named capture
    groups mapping onto GENERIC_SCHEMA.

    Args:
        log_sample: First few lines of the uploaded log, newline-joined.

    Returns:
        dict: The raw JSON response body from the Gemini API.

    Raises:
        ValueError: If no API key was loaded from key.yaml.
        requests.exceptions.RequestException: If the request keeps failing
            after all retries, or fails with a non-retryable error.
    """
    if not GEMINI_API_KEY:
        raise ValueError("GEMINI_API_KEY not found in key.yaml.")

    # Prompt with a strict definition of the `message` field; the runtime
    # string below is intentionally kept verbatim.
    prompt = f"""
You are an expert log data transformation engine. Your task is to analyze a raw log sample and generate a Python-compatible regular expression (regex) that extracts key information and maps it to a predefined generic schema.

**Generic Log Schema:**
- `timestamp`: The full timestamp of the event.
- `log_level`: The severity of the event (e.g., INFO, WARN, ERROR).
- `message`: The primary, human-readable message.
- `service_name`: The application or service that generated the log.
- `host_name`: The hostname of the machine.
- `trace_id`: A unique identifier for correlating logs.
- `error_details`: Stack trace or detailed error messages.
- `metadata`: A catch-all for any other structured data.

**Log Sample to Analyze:**
---
{log_sample}
---

**Your Instructions:**
1. Create a single Python regex with named capture groups (e.g., `?P<group_name>...`).
2. The name of each capture group **MUST** match a key from the **Generic Log Schema** if possible. Use the following mapping rules:
    - For a unique identifier (like a UUID, `correlation_id`, `request_id`, `tid`, `euid`), name the capture group `trace_id`.
    - For an exception or stack trace (like `exception_details`, `stack_trace`), name the capture group `error_details`.
    - For a service or application name, use `service_name`.
    - For a server or machine name, use `host_name`.
3. If a field from the log does not logically map to any standard schema key, create a capture group with a descriptive, snake_case name (e.g., `machine_id`, `user_id`). The Python code will handle putting these into the `metadata` field.
4. **CRITICAL RULE:** The `timestamp` group must capture the entire date and time component as one field.
5. **CRITICAL RULE for `message`:** The `message` is the human-readable text that typically follows the timestamp and log level but comes BEFORE any other structured key-value pairs or identifiers.
6. The regex should match the entire line from start (`^`) to end (`$`).
7. **Return ONLY the raw regex string and absolutely nothing else.**

**Example 1: For a complex, pipe-delimited log like "2025-08-05 17:15:21 | WARNING | Profile updated | email-dispatcher | host-02 | a870-d0a6 | TimeoutException | ...":**
^(?P<timestamp>.*?)\\s*\\|\\s*(?P<log_level>\\w+)\\s*\\|\\s*(?P<message>.*?)\\s*\\|\\s*(?P<service_name>.*?)\\s*\\|\\s*(?P<host_name>.*?)\\s*\\|\\s*(?P<trace_id>.*?)\\s*\\|\\s*(?P<error_details>.*?)\\s*\\|\\s*(?P<json_payload>{{.*}})$

**Example 2: For a standard log like "[Sun Dec 04 04:51:18 2005] [error] mod_jk child workerEnv in error state 6":**
Your output should be a regex like:
^\\[(?P<timestamp>.*?)\\] \\[(?P<log_level>\\w+)\\] (?P<message>.*)$
"""
    payload = {"contents": [{"parts": [{"text": prompt}]}]}
    headers = {'Content-Type': 'application/json'}

    # Retry transient failures (HTTP 503/429) with exponential backoff:
    # 2s, then 4s between attempts.
    retries = 3
    backoff_factor = 2
    for attempt in range(retries):
        try:
            response = requests.post(GEMINI_API_URL, headers=headers, json=payload, timeout=90)
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            # e.response is None for pure connection errors; only HTTP-level
            # 503/429 are retried, and only while attempts remain.
            if e.response is not None and e.response.status_code in (503, 429) and attempt < retries - 1:
                time.sleep(backoff_factor)
                backoff_factor *= 2
                continue
            # Bare `raise` preserves the original traceback (unlike `raise e`).
            raise

    # Defensive: unreachable while retries >= 1 (the loop returns or raises).
    raise requests.exceptions.RequestException("Failed to get a response from the API after several retries.")

# --- Flask Routes ---

Expand All @@ -87,60 +120,109 @@ def upload_file():
"""Handles the file upload and processing logic."""
if 'file' not in request.files:
flash('No file part')
return redirect(request.url)
return redirect(url_for('index'))

file = request.files['file']

if file.filename == '':
flash('No selected file')
return redirect(request.url)
return redirect(url_for('index'))

if file and allowed_file(file.filename):
filename = secure_filename(file.filename)
upload_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
file.save(upload_path)

try:
# Determine file type and parse accordingly
filename = secure_filename(file.filename)
upload_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
file.save(upload_path)

with open(upload_path, 'r', encoding='utf-8', errors='ignore') as f:
file_content = f.read()

if not file_content:
flash("The uploaded file appears to be empty.")
return redirect(url_for('index'))

ext = filename.rsplit('.', 1)[1].lower()
df = None

if ext in ['log', 'txt']:
df = parse_log_file(upload_path)
log_sample = "\n".join(file_content.splitlines()[:10])

if not log_sample:
flash("File is empty.")
return redirect(url_for('index'))

gemini_response = get_regex_from_gemini(log_sample)
regex_pattern = gemini_response.get('candidates', [{}])[0].get('content', {}).get('parts', [{}])[0].get('text')

if not regex_pattern:
raise ValueError("LLM did not return a regex pattern.")

if regex_pattern.strip().startswith("```"):
regex_pattern = re.sub(r'```(python)?\n', '', regex_pattern)
regex_pattern = regex_pattern.strip().replace('```', '')

print("\n--- Generated Regex Pattern ---")
print(regex_pattern.strip())
print("-----------------------------\n")

pattern = re.compile(regex_pattern.strip())
parsed_data = [match.groupdict() for line in file_content.splitlines() if (match := pattern.match(line.strip()))]
df = pd.DataFrame(parsed_data)

if df.empty:
print("AI regex parsing failed. Using fallback method.")
flash("AI parsing could not find a pattern. Displaying raw lines instead.")
parsed_data = [{'message': line} for line in file_content.splitlines()]
df = pd.DataFrame(parsed_data)

elif ext == 'json':
df = parse_json_file(upload_path)
df = pd.read_json(upload_path)

elif ext == 'xml':
df = parse_xml_file(upload_path)
tree = ET.parse(upload_path)
records = [{child.tag: child.text for child in elem} for elem in tree.getroot()]
df = pd.DataFrame(records)

elif ext == 'csv':
df = pd.read_csv(upload_path)

if df is None or df.empty:
flash('Could not parse the file. The format might be unsupported or the file is empty.')
flash('Parsing failed. The application could not structure the data.')
return redirect(url_for('index'))

# Create the output CSV
base_filename = filename.rsplit('.', 1)[0]
csv_filename = f"{base_filename}.csv"
csv_path = os.path.join(app.config['PROCESSED_FOLDER'], csv_filename)
df.to_csv(csv_path, index=False)
metadata_cols = [col for col in df.columns if col not in GENERIC_SCHEMA]

# Redirect to the results page
return render_template('results.html', csv_filename=csv_filename)
if metadata_cols:
df['metadata'] = df[metadata_cols].apply(
lambda row: row.to_json(), axis=1
)
df = df.drop(columns=metadata_cols)

df = df.reindex(columns=GENERIC_SCHEMA)

csv_string = df.to_csv(index=False)

csv_filename = f"{filename.rsplit('.', 1)[0]}_processed.csv"

preview_headers = df.columns.values.tolist()
rows = df.head(100).fillna('').values.tolist()

return render_template('results.html',
csv_filename=csv_filename,
headers=preview_headers,
rows=rows,
full_csv_data=csv_string)

except Exception as e:
flash(f'An error occurred while processing the file: {e}')
print(f"An unexpected error occurred: {e}")
flash(f'An error occurred: {e}')
return redirect(url_for('index'))

else:
flash('File type not allowed.')
return redirect(request.url)

@app.route('/download/<filename>')
def download_file(filename):
    """Serves the processed CSV file for download."""
    # send_from_directory rejects path-traversal attempts, so a hostile
    # <filename> cannot escape PROCESSED_FOLDER; as_attachment forces a
    # download instead of inline rendering.
    return send_from_directory(app.config['PROCESSED_FOLDER'], filename, as_attachment=True)

return redirect(url_for('index'))

# --- Main Execution ---
if __name__ == '__main__':
    # Create upload and processed directories if they don't exist
    os.makedirs(UPLOAD_FOLDER, exist_ok=True)
    os.makedirs(PROCESSED_FOLDER, exist_ok=True)
    # NOTE(review): debug=True enables the interactive Werkzeug debugger and
    # auto-reload — fine locally, never in production.
    app.run(debug=True)
4 changes: 3 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
flask
pandas
lxml
requests
PyYAML
55 changes: 53 additions & 2 deletions templates/index.html
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

<!-- templates/index.html -->
<!DOCTYPE html>
<html lang="en">
Expand Down Expand Up @@ -38,7 +37,9 @@ <h1 class="text-3xl md:text-4xl font-bold text-gray-900">Log Data Processor</h1>
<form action="/upload" method="post" enctype="multipart/form-data" class="space-y-6">
<div>
<label for="file" class="block text-sm font-medium text-gray-700">Log File</label>
<div class="mt-2 flex justify-center px-6 pt-5 pb-6 border-2 border-gray-300 border-dashed rounded-md">

<!-- This is the drag-and-drop upload box -->
<div id="upload-box" class="mt-2 flex justify-center px-6 pt-5 pb-6 border-2 border-gray-300 border-dashed rounded-md">
<div class="space-y-1 text-center">
<svg class="mx-auto h-12 w-12 text-gray-400" stroke="currentColor" fill="none" viewBox="0 0 48 48" aria-hidden="true">
<path d="M28 8H12a4 4 0 00-4 4v20m32-12v8m0 0v8a4 4 0 01-4 4H12a4 4 0 01-4-4v-4m32-4l-3.172-3.172a4 4 0 00-5.656 0L28 28M8 32l9.172-9.172a4 4 0 015.656 0L28 28" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" />
Expand All @@ -53,6 +54,18 @@ <h1 class="text-3xl md:text-4xl font-bold text-gray-900">Log Data Processor</h1>
<p class="text-xs text-gray-500">.log, .txt, .json, .xml, .csv up to 50MB</p>
</div>
</div>

<!-- This block will be shown after a file is selected -->
<div id="file-info" class="hidden mt-2 flex items-center justify-between p-4 border border-gray-300 rounded-md bg-gray-50">
<div class="flex items-center">
<svg xmlns="http://www.w3.org/2000/svg" class="h-6 w-6 text-green-500" viewBox="0 0 20 20" fill="currentColor">
<path fill-rule="evenodd" d="M10 18a8 8 0 100-16 8 8 0 000 16zm3.707-9.293a1 1 0 00-1.414-1.414L9 10.586 7.707 9.293a1 1 0 00-1.414 1.414l2 2a1 1 0 001.414 0l4-4z" clip-rule="evenodd" />
</svg>
<span id="file-name" class="text-sm text-gray-800 ml-3 font-medium"></span>
</div>
<button id="remove-file" type="button" class="text-sm font-medium text-red-600 hover:text-red-800">Remove</button>
</div>

</div>
<div>
<button type="submit" class="w-full flex justify-center py-3 px-4 border border-transparent rounded-md shadow-sm text-sm font-medium text-white bg-indigo-600 hover:bg-indigo-700 focus:outline-none focus:ring-2 focus:ring-offset-2 focus:ring-indigo-500">
Expand All @@ -62,5 +75,43 @@ <h1 class="text-3xl md:text-4xl font-bold text-gray-900">Log Data Processor</h1>
</form>
</div>

<script>
    // Toggle between the drag-and-drop box and the selected-file summary.
    const input = document.getElementById('file-upload');
    const dropZone = document.getElementById('upload-box');
    const summary = document.getElementById('file-info');
    const nameLabel = document.getElementById('file-name');
    const clearBtn = document.getElementById('remove-file');

    // A file was chosen: show its name, swap the boxes.
    input.addEventListener('change', () => {
        if (!input.files.length) {
            return;
        }
        nameLabel.textContent = input.files[0].name;
        dropZone.classList.add('hidden');
        summary.classList.remove('hidden');
    });

    // "Remove" clears the selection and restores the drop zone.
    clearBtn.addEventListener('click', () => {
        input.value = '';
        dropZone.classList.remove('hidden');
        summary.classList.add('hidden');
    });
</script>

</body>
</html>
Loading